├── main
├── __init__.py
├── settings.py
├── wsgi.py
├── urls.py
└── settings_default.py
├── catalog
├── __init__.py
├── management
│ ├── __init__.py
│ └── commands
│ │ ├── __init__.py
│ │ └── import_csv.py
├── tests.py
├── views.py
├── admin.py
└── models.py
├── crawler
├── __init__.py
├── models.py
├── tests.py
├── admin.py
├── views.py
├── exceptions.py
├── helpers.py
├── backends
│ └── __init__.py
├── tasks.py
└── resources.py
├── .gitignore
├── docs
└── screenshots
│ ├── screencapture-localhost-8000-admin-catalog-product-2.png
│ └── screencapture-localhost-8000-admin-catalog-product.png
├── requirements.pip
├── manage.py
├── README.md
└── Untitled Diagram.xml
/main/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/catalog/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/crawler/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/catalog/management/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/catalog/management/commands/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/catalog/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/crawler/models.py:
--------------------------------------------------------------------------------
1 | from django.db import models
2 |
3 | # Create your models here.
4 |
--------------------------------------------------------------------------------
/crawler/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/catalog/views.py:
--------------------------------------------------------------------------------
1 | from django.shortcuts import render
2 |
3 | # Create your views here.
4 |
--------------------------------------------------------------------------------
/crawler/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 |
3 | # Register your models here.
4 |
--------------------------------------------------------------------------------
/crawler/views.py:
--------------------------------------------------------------------------------
1 | from django.shortcuts import render
2 |
3 | # Create your views here.
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.swo
3 | .idea
4 | db.*
5 | static/media
6 | *.pyc
7 | celerybeat-schedule
8 | settings_local.py
--------------------------------------------------------------------------------
/crawler/exceptions.py:
--------------------------------------------------------------------------------
class ConnectionError(Exception):
    """Signals that a remote URL could not be fetched."""
3 |
4 |
class ParseError(Exception):
    """Signals that fetched content could not be parsed."""
--------------------------------------------------------------------------------
/main/settings.py:
--------------------------------------------------------------------------------
# Load the shared defaults first, then let an optional settings_local
# module override any of them (star-imports rebind the same names).
from settings_default import *
try:
    from settings_local import *
except ImportError:
    # settings_local is optional; keep the defaults when it is missing.
    pass
--------------------------------------------------------------------------------
/docs/screenshots/screencapture-localhost-8000-admin-catalog-product-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibrahimgunduz34/havuc/HEAD/docs/screenshots/screencapture-localhost-8000-admin-catalog-product-2.png
--------------------------------------------------------------------------------
/docs/screenshots/screencapture-localhost-8000-admin-catalog-product.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibrahimgunduz34/havuc/HEAD/docs/screenshots/screencapture-localhost-8000-admin-catalog-product.png
--------------------------------------------------------------------------------
/requirements.pip:
--------------------------------------------------------------------------------
1 | MySQL-python==1.2.5
2 | PIL==1.1.7
3 | redis==2.10.3
4 | https://pypi.python.org/packages/source/l/lxml/lxml-3.4.1.tar.gz#md5=b7696a3f33d5610b215a343216ab5624
5 | Django==1.6.2
6 | django-mptt==0.6.1
7 | django-celery==3.1.16
8 |
--------------------------------------------------------------------------------
/manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 |
if __name__ == "__main__":
    # Use the project settings unless the environment already names a
    # settings module (setdefault never overwrites an existing value).
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "main.settings")

    # Imported after the settings module has been selected.
    from django.core.management import execute_from_command_line

    # Hand the raw command line to Django's management dispatcher.
    execute_from_command_line(sys.argv)
11 |
--------------------------------------------------------------------------------
/main/wsgi.py:
--------------------------------------------------------------------------------
"""
WSGI config for cmp project.

It exposes the WSGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/1.6/howto/deployment/wsgi/
"""

import os
# Select the project settings unless the server environment already set
# DJANGO_SETTINGS_MODULE; this must happen before the handler is built.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "main.settings")

from django.core.wsgi import get_wsgi_application
# Module-level WSGI callable referenced by WSGI_APPLICATION in the settings.
application = get_wsgi_application()
15 |
--------------------------------------------------------------------------------
/main/urls.py:
--------------------------------------------------------------------------------
1 | from django.conf import settings
2 | from django.conf.urls import patterns, include, url
3 | from django.conf.urls.static import static
4 |
5 | from django.contrib import admin
# Import the admin module of every installed app so their ModelAdmin
# registrations run before the admin URLs are included below.
admin.autodiscover()

# Root URL configuration: only the Django admin is routed.
urlpatterns = patterns('',
    # Examples:
    # url(r'^$', 'cmp.views.home', name='home'),
    # url(r'^blog/', include('blog.urls')),

    url(r'^admin/', include(admin.site.urls)),

)


# Serve uploaded media through Django itself only while DEBUG is on.
if settings.DEBUG:
    urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
--------------------------------------------------------------------------------
/crawler/helpers.py:
--------------------------------------------------------------------------------
1 | import importlib
2 |
3 |
def load_class(full_class_path):
    """Import and return the object named by a dotted path.

    ``'package.module.Name'`` imports ``package.module`` and returns its
    ``Name`` attribute. A path without any dot is treated as a plain module
    name and the imported module itself is returned.

    Raises:
        ImportError: if the module cannot be imported or does not define
            the requested attribute.
    """
    module_name, _, class_name = full_class_path.rpartition('.')
    if not module_name:
        # No dot in the path: there is no attribute part, import the module.
        return importlib.import_module(full_class_path)

    module = importlib.import_module(module_name)
    if not hasattr(module, class_name):
        raise ImportError(
            'No class exists %s in %s' % (class_name, module_name))
    return getattr(module, class_name)
16 |
17 |
def load_resource(resource_name):
    """Return the class called ``resource_name`` from ``crawler.resources``."""
    return load_class('crawler.resources.%s' % resource_name)
--------------------------------------------------------------------------------
/crawler/backends/__init__.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 | from crawler.exceptions import ConnectionError
3 |
class WebCrawler(object):
    """Downloads pages over HTTP and hands them to resource parsers."""

    # Browser-like User-Agent sent with every request.
    USER_AGENT = ('Mozilla/5.0 (X11; Linux x86_64) '
                  'AppleWebKit/537.36 (KHTML, like '
                  'Gecko) Chrome/38.0.2125.122 Safari/537.36')

    @classmethod
    def send_request(cls, url, timeout=30):
        """Fetch ``url`` and return the raw response body.

        Args:
            url: address to download.
            timeout: seconds to wait on the socket before giving up, so a
                stalled server cannot block the caller indefinitely.

        Raises:
            ConnectionError: if urllib2 reports a URLError (this includes
                HTTP error statuses, which are URLError subclasses).
        """
        try:
            request = urllib2.Request(
                url, headers={'User-Agent': cls.USER_AGENT})
            response = urllib2.urlopen(request, timeout=timeout)
            try:
                return response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
        except urllib2.URLError:
            raise ConnectionError('Connection failed. Url: %s' % url)

    @classmethod
    def crawle_resource(cls, resource):
        """Download ``resource``'s URL and let the resource parse the body.

        Returns the same ``resource`` object after its
        ``prepare_document`` has been called with the response.
        """
        # Go through cls so subclasses can override send_request.
        response = cls.send_request(resource.get_url())
        resource.prepare_document(response)
        return resource
--------------------------------------------------------------------------------
/catalog/management/commands/import_csv.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from optparse import make_option
3 |
4 | from django.core.management.base import BaseCommand, CommandError
5 | from django.db.utils import IntegrityError
6 |
7 | from catalog.models import Resource, ProviderProduct
8 |
9 |
class Command(BaseCommand):
    """Import provider products for one resource from a CSV file.

    Every CSV row is read as ``name,url``: column 0 becomes the
    ProviderProduct name and column 1 its url.
    """
    option_list = BaseCommand.option_list + (
        make_option('--filename', '-f'),
        make_option('--resource_slug'),
    )

    def validate_options(self, **options):
        """Raise CommandError unless both required options were given."""
        filename = options.get('filename')
        resource_slug = options.get('resource_slug')

        if not filename:
            raise CommandError('filename is required!')

        if not resource_slug:
            raise CommandError('resource_slug is required!')

    def get_resource(self, resource_slug):
        """Return the Resource with the given slug or raise CommandError."""
        try:
            return Resource.objects.get(slug=resource_slug)
        except Resource.DoesNotExist:
            raise CommandError('The specified resource is not found.')

    def encode_utf8(self, data):
        """Yield each line of ``data`` as a UTF-8 byte string.

        The decode/encode round trip fails early on input that is not
        valid UTF-8.
        """
        for line in data:
            yield line.decode('utf-8').encode('utf8')

    def handle(self, *args, **options):
        """Create one ProviderProduct per usable CSV row."""
        self.validate_options(**options)
        resource = self.get_resource(options.get('resource_slug'))
        with open(options.get('filename'), 'r') as fh:
            reader = csv.reader(self.encode_utf8(fh), delimiter=",")
            for row in reader:
                if len(row) < 2:
                    # Blank lines yield an empty row and truncated lines lack
                    # the url column; skip them instead of failing on row[1].
                    if row:
                        self.stdout.write(
                            'Skipping malformed row: %s' % ','.join(row))
                    continue
                try:
                    ProviderProduct.objects.create(
                        name=unicode(row[0], 'utf8'),
                        url=row[1], resource=resource)
                except IntegrityError:
                    # The url column is unique; duplicates are reported only.
                    self.stdout.write('%s is already exist.' % row[1])
48 |
--------------------------------------------------------------------------------
/crawler/tasks.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from celery import task
4 |
5 | from catalog.models import ProductResource, ProductPrice
6 |
7 | from crawler.backends import WebCrawler
8 | from crawler.exceptions import ConnectionError, ParseError
9 | from crawler.helpers import load_resource
10 |
11 |
12 |
@task(name="crawler.tasks.crawler_job")
def crawler_job():
    """Queue one crawle_resource task per crawlable product/resource link.

    A link is queued when it is active, has crawling enabled and belongs
    to an active product.
    """
    # NOTE(review): enable_crawling was previously ignored here although the
    # model defines it; presumably it is meant to switch crawling off per
    # link - confirm.
    product_resources = ProductResource.objects.filter(
        is_active=True, enable_crawling=True, product__is_active=True)
    for product_resource in product_resources:
        crawle_resource.delay(product_resource)
19 |
20 |
@task(name="crawler.tasks.crawle_resource")
def crawle_resource(product_resource):
    """Crawl one product page and record its price when it changed.

    Args:
        product_resource: ProductResource linking a product to the page of
            a resource.

    Returns:
        False when the crawled price and currency equal the latest stored
        price for this product/resource (only last_check_date is updated);
        None after a new ProductPrice row has been stored.

    Raises:
        Exception: when the page cannot be fetched or parsed, or when no
            price could be extracted from it.
    """
    web_resource = load_resource(product_resource.resource.resource_name)(
        product_resource.url)
    try:
        WebCrawler.crawle_resource(web_resource)
    except (ConnectionError, ParseError):
        raise Exception('Crawler error. Url: %s ' % web_resource.get_url())

    price = web_resource.get_price()
    currency = web_resource.get_currency()
    if price is None:
        # Parsers set the price to None when the text is not a number;
        # ProductPrice.price is not nullable, so stop before the insert.
        raise Exception('Crawler error. Price could not be parsed. '
                        'Url: %s ' % web_resource.get_url())

    product = product_resource.product
    resource = product_resource.resource

    now = datetime.now()
    product.last_check_date = now

    try:
        latest_price = ProductPrice.objects.filter(
            product=product, resource=resource).latest('id')
    except ProductPrice.DoesNotExist:
        latest_price = None

    if (latest_price is not None and latest_price.price == price
            and latest_price.currency == currency):
        # Nothing changed: only remember that the page was checked.
        product.save()
        return False

    ProductPrice.objects.create(
        product=product, resource=resource,
        price=price, currency=currency)

    # The cheapest offer must be picked among the *current* prices, i.e. the
    # newest row of every resource; taking the minimum over the full history
    # would keep an old low price forever after a price increase.
    resource_ids = ProductPrice.objects.filter(
        product=product).order_by().values_list(
        'resource', flat=True).distinct()
    current_prices = [
        ProductPrice.objects.filter(
            product=product, resource=resource_id).latest('id')
        for resource_id in resource_ids]
    min_price = min(current_prices, key=lambda item: item.price)

    product.last_price = min_price.price
    product.last_currency = min_price.currency
    product.last_change_date = now
    product.save()
57 |
58 |
--------------------------------------------------------------------------------
/catalog/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 |
3 | from catalog.models import (Product, Resource, ProductResource,
4 | Category, ProductPrice, ProviderProduct)
5 |
6 | from mptt.admin import MPTTModelAdmin
7 |
8 |
class ChildCategory(admin.TabularInline):
    """Tabular inline that edits Category rows inside another admin page."""
    model = Category
11 |
12 |
class CategoryAdmin(MPTTModelAdmin):
    """Admin for the Category tree with its categories editable inline."""
    # Indentation used per tree level in the change list
    # (presumably pixels - see the django-mptt admin docs).
    mptt_level_indent = 20
    inlines = [ChildCategory, ]
16 |
17 |
class ProductResourceAdmin(admin.TabularInline):
    """Tabular inline for ProductResource rows (used by ProductAdmin)."""
    model = ProductResource
20 |
21 |
class ProductPriceAdmin(admin.TabularInline):
    """Tabular inline for ProductPrice rows (used by ProductAdmin)."""
    model = ProductPrice
    # creation_date is filled automatically (auto_now_add), so show it only.
    readonly_fields = ['creation_date']
    # Newest prices first, then by resource name and price.
    ordering = ['-creation_date', 'resource__name', 'price']
26 |
27 |
class ProductAdmin(admin.ModelAdmin):
    """Admin for Product with its resource links and prices inline."""
    inlines = [ProductResourceAdmin, ProductPriceAdmin]
    list_display = ['name', 'category', 'last_price', 'last_currency',
                    'last_check_date', 'last_change_date']
    # Rendered by Product.admin_detail_image; it is not a model field.
    readonly_fields = ['admin_detail_image']
33 |
34 |
class ResourceAdmin(admin.ModelAdmin):
    """Admin for Resource; the change list also shows Resource.admin_image."""
    list_display = ['name', 'admin_image']
    # The slug field is pre-filled from the name in the admin form.
    prepopulated_fields = {'slug': ('name', )}
38 |
39 |
class ProviderProductIsMatchedFilter(admin.SimpleListFilter):
    """Change-list filter on whether a ProviderProduct has a product set."""
    title = 'Is matched ?'
    parameter_name = 'is_matched'

    def lookups(self, request, model_admin):
        """Return the (query value, label) pairs offered in the sidebar."""
        return (
            (1, 'Matched'),
            (0, 'Not Matched'),
        )

    def queryset(self, request, queryset):
        """Filter ``queryset`` by the selected option.

        Returns the queryset unchanged when no option is selected or when
        the query-string value is not an integer (for example a hand-edited
        URL), instead of failing with ValueError.
        """
        value = self.value()
        if value is None:
            return queryset
        try:
            matched = bool(int(value))
        except ValueError:
            return queryset
        return queryset.filter(product__isnull=not matched)
53 |
54 |
class ProviderProductAdmin(admin.ModelAdmin):
    """Admin for ProviderProduct rows, filterable by resource and match."""
    # Edit the product relation through a raw id input, not a select box.
    raw_id_fields = ('product', )
    # 'is_matched' is the ProviderProduct.is_matched method, not a field.
    list_display = ['name', 'resource', 'is_matched']
    list_filter = ['resource', ProviderProductIsMatchedFilter]
59 |
60 |
61 | admin.site.register(Category, CategoryAdmin)
62 | admin.site.register(Product, ProductAdmin)
63 | admin.site.register(Resource, ResourceAdmin)
64 | admin.site.register(ProviderProduct, ProviderProductAdmin)
65 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Havuç Nedir ?
2 | Havuç, ürün bazında fiyat karşılaştırmanızı sağlayan bir web uygulamasıdır.
3 |
4 | # Nasıl Çalışır ?
5 | Havuç, ürün kataloğunda tanımladığınız her bir ürün için, ürünün bulunduğu web sitelerine periyodik olarak bağlanarak fiyat ve resim bilgilerini toplar ve arşivler. İlgili ürünün en düşük fiyatla nerede bulunduğu bilgisini saklar.
6 |
7 | # Kurulum:
8 |
9 | Uygulamayı geliştirme ortamında çalıştırmak için;
10 |
11 | . Sanal ortam kurulumunu gerçekleştirin.
12 |
13 | ```shell
14 | $ virtualenv havuc-env
15 | $ cd havuc-env
16 | ```
17 |
18 | . Uygulamayı git reposundan geliştirme ortamınıza kopyalayın.
19 |
20 | ```shell
21 | $ git clone git@github.com:ibrahimgunduz34/havuc.git
22 | ```
23 |
24 | . Aşağıdaki komutu çalıştırarak gerekli linux paketlerinin kurulumunu gerçekleştirin.
25 | ```shell
26 | sudo apt-get install python-dev libffi-dev libxml2-dev libxslt-dev python-virtualenv redis-server python-mysqldb libmysqlclient-dev
27 | ```
28 |
29 | . Sanal ortamı aktif duruma getirin ve uygulama için gerekli paketlerin kurulumunu gerçekleştirin.
30 |
31 | ```shell
32 | $ source bin/activate
33 | $ pip install -r requirements.pip
34 | ```
35 |
36 | . Redis kurulumunu gerçekleştirin. (yoksa)
37 |
38 | ```shell
39 | $ sudo apt-get install redis-server
40 | ```
41 |
42 | . SQLite veritabanının yaratılması için syncdb komutunu çalıştırın.
43 |
44 | ```shell
45 | $ python manage.py syncdb
46 | ```
47 |
48 |
49 | # Çalıştırılması:
50 | Havuç, fiyat edinme işlemini arka planda asenkron olarak gerçekleştirdiği için geliştirme ortamında celery kuyruklarını işleyecek django komutları çalıştırılmalıdır.
51 |
52 | ```shell
53 | $ python manage.py celeryd -B
54 | ...
55 | $ python manage.py celeryd -Q scheduled_tasks,crawler
56 | ```
57 |
58 | . Development web sunucusunu çalıştırın.
59 |
60 | ```shell
61 | $ python manage.py runserver
62 | ```
63 |
64 | # Ekran Görüntüleri:
65 |
66 | Ürün Listesi:
67 | 
68 |
69 | Ürün Detayı:
70 | 
71 |
--------------------------------------------------------------------------------
/Untitled Diagram.xml:
--------------------------------------------------------------------------------
1 | 7Zxbd6soFMc/TR5Pl4oafWzS9szDXM6azprLI1GSOCWSRUjTzqcfVLyBSWxPMPaUvlQ3CAq//Qc2xgmYb16+Urhd/0JihCeOFb9MwN3EcWwXuPxfZnktLG4YFoYVTWKRqTY8Jv8hYbSEdZ/EaNfKyAjBLNm2jRFJUxSxlg1SSg7tbEuC27Vu4QophscIYtX6VxKztbDallUn/ISS1VpUHXgiYQGjpxUl+1TUN3HAMv8rkjewLEvk361hTA4NE7ifgDklhBVHm5c5wlnbls1WXPdwJLW6b4pS1ueC6QIuoB25i8iZLt2F9cUpSniGeC/aYgJuf1v8m7VyccfstWwlfvPb7HC/wT8nS4STlJ/NtogmG8QQ5SlYmL/VttlhnTD0uIVRdumB88Nta7bB/Mzmh7xLGeSX0OocY7jdJYu8VotbKIr2dJc8o9/RriAns5I9y2qaV0TkWbOeQLEoqmpsKy93k0TiGMMFwrOq6+YEk6z6lOQPtGOUPKHSyHvUyv+qlJKQrIplgnEj50P+l9n5Uz3ATYIzh/gT0RimUJgF/bYjzrsqgjhZpdwW8W7NG1HtZ9H1z4gy9NIwiX7/igjvAPrKs4hUxxUMCh8NxOmhAXzJ+7rBuldmhMLJVlXRNWj8QLDWk7vS85vgybw1KNmSJGV5/d5s4t1J2BHK1mRFUoib4NUwWD86DEfdujcdrteCoyKhRYcKB79NDXAAI0ofk8M3i5JrjUqUOsAzonQxUQJvpkMSJaenKFk64HCNKH1MDt8sSt6oZkqe4e6TcOcHY+LON9x9Eu4CZ0zcTQ13n4S70B8Td4Hh7pNwZ49q0RmaNafGNWfwVjjeGQhzXS0BiY5JmIFjuIVg0IbB6RmAsPVEp9SZUZzstpBF61NQZI+bRBDfigZaEMbIhiegNL7N9s4yGybR09le7tlDKF6hR3ErCC/I4b42zHIDTyjv6lRH7cieRuhYa4hoEoN0hY55f+k/2S2d7HCKMGR8YG3V39V74tJvmac1QLGtblDKIopHEVfVDPD2h6+NbMKBj9bj+t311EgVJdaAVc/4TubM6HRNAZo6kgCBnqORDv0pmdaoP7yl6Ovfosfzk396df+4lMnvo0zhMMrke1Y3QWeU6RLiUfFeA0MR29P0e3EhW8STZjHcraslDk8WXR18MFzCEQ1kMi7A8ttFFHf53QOZP5Ww9KX3VqT8U+tk/ssPfOU4Zwa+Ucy8q+XVuZm3npFP3Rf+YCPf+6SpjzKVfjH0FNvtOcW+iBh07MEZMRgyXiepgd9PDbS8nOSoMZlPIQYl86fVwBtGDUIgzVNk3depBmocZqhpLXpJWMUFP86xuAm9K5PhjWickMlw7X4z2IuQ0bGHZMaJ4faxpUWF5/UbJnwtw4QaOftgw8Qwy18w7SMewTDiEVhHCBpgWAFqeM1ES46ScBKXgWJrMi2+oydY4km65lu6o/5q4M5o1zvXx+WAoxtGR3qJyw6Hky5bfTfa8HKChdPAgGGAAXIIVn5rXudY1/EyvZkoDxddDeUof7+J8lTHRBmowbVxvtagqkBTNWoKm9IR7elzY9IlNMxuadiNE/iloUFwtcjPr7ixQBkAKC4CrifO5WveqU19di1Lv9WtTa5/ZN/7jDapU6fwTEGX2s+St1kD5+R9Kftsul/kAPqjlmPxstplfL/lMlM+tF7SZYbyBt/m7t78a6PTc4161jdAEA7iG+UN9/YNcV/6fKPjJwdmNnK12YjX8y1LLVu9QH/Ybiw6eeXZiPYN49BrqWZb7DxLKrCvavqWdapYELZS3UEUtXyYvorqAc2K6qqxzF8JQ4oHlb+jSbPEsz+QEb95sd0Oh2Nk25AnjJYsu4AXlaSrP7K0uy/+D6TDF5Bd+Vcwna+Wdm2qexpk11UjjgaYkQEjf0DousA4BpixAyN/3OW6wKjv8BlgRgaM/OGN6wKjxqgNMCMDRv5ixnWBUQPZBpiRASN/6uK6wKgxWQPMyICRv1Gh
ERh+Wn8ItliF11/bBff/Aw==
--------------------------------------------------------------------------------
/catalog/models.py:
--------------------------------------------------------------------------------
1 | from django.db import models
2 |
3 | from mptt.models import MPTTModel, TreeForeignKey
4 |
5 |
class Category(MPTTModel):
    """Product category stored as a node of an MPTT tree."""
    name = models.CharField(max_length=50, unique=True)
    # Optional link to the parent category; a category's sub-categories
    # are reachable through ``children``.
    parent = TreeForeignKey('self', null=True, blank=True,
                            related_name='children')

    class MPTTMeta:
        # django-mptt option: order nodes by name when they are inserted.
        order_insertion_by = ['name']

    def __unicode__(self):
        return self.name
16 |
17 |
class Product(models.Model):
    """Catalog product whose price is tracked across resources."""
    name = models.CharField(max_length=120)
    category = models.ForeignKey(Category)
    small_description = models.CharField(max_length=255, null=True, blank=True)
    long_description = models.TextField(null=True, blank=True)
    # Price/currency summary written by the crawler task.
    last_price = models.DecimalField(max_digits=7, decimal_places=2,
                                     null=True, blank=True)
    last_currency = models.CharField(max_length=3, null=True, blank=True)
    image = models.ImageField(upload_to='product/%Y/%m')
    is_active = models.BooleanField(default=True)
    # Set by the crawler task: time of the last crawl and of the last
    # recorded price change.
    last_check_date = models.DateTimeField(null=True, blank=True)
    last_change_date = models.DateTimeField(null=True, blank=True)

    def __unicode__(self):
        return self.name

    def admin_detail_image(self):
        """Return an HTML <img> tag for the product image (admin use)."""
        if not self.image:
            # Accessing .url without a stored file raises ValueError.
            return ''
        # NOTE(review): the original markup was lost from this string; a
        # minimal <img> tag is reconstructed here - confirm the attributes.
        return '<img src="%s" />' % self.image.url
    # Tell the admin not to escape the returned HTML.
    admin_detail_image.allow_tags = True
37 |
38 |
class Resource(models.Model):
    """A web site (provider) whose product pages are crawled."""
    name = models.CharField(max_length=50)
    slug = models.SlugField()
    icon = models.ImageField(upload_to='resources')
    # Name of the parser class in crawler.resources used for this site
    # (the crawler task passes it to load_resource).
    resource_name = models.CharField(max_length=50)
    is_active = models.BooleanField(default=True)

    def admin_image(self):
        """Return an HTML <img> tag for the resource icon (admin use)."""
        if not self.icon:
            # Accessing .url without a stored file raises ValueError.
            return ''
        # NOTE(review): the original markup was lost from this string; a
        # minimal <img> tag is reconstructed here - confirm the attributes.
        return '<img src="%s" />' % self.icon.url
    # Tell the admin not to escape the returned HTML.
    admin_image.allow_tags = True

    def __unicode__(self):
        return self.name
52 |
53 |
class ProductResource(models.Model):
    """Links a Product to the URL where a Resource lists it."""
    product = models.ForeignKey(Product)
    resource = models.ForeignKey(Resource)
    # Page that the crawler downloads for this product/resource pair.
    url = models.URLField()
    is_active = models.BooleanField(default=True)
    display_in_frontend = models.BooleanField(default=True)
    enable_crawling = models.BooleanField(default=True)
61 |
62 |
class ProductPrice(models.Model):
    """One price recorded for a product at a resource."""
    product = models.ForeignKey(Product)
    resource = models.ForeignKey(Resource)
    # Filled automatically when the row is first saved.
    creation_date = models.DateTimeField(auto_now_add=True)
    price = models.DecimalField(max_digits=7, decimal_places=2)
    currency = models.CharField(max_length=3)
69 |
70 |
class ProviderProduct(models.Model):
    """A product listed by a resource, optionally matched to a Product."""
    resource = models.ForeignKey(Resource)
    # Empty until the listing is matched to a catalog product.
    product = models.ForeignKey(Product, null=True)
    name = models.CharField(max_length=120)
    url = models.URLField(unique=True)

    def is_matched(self):
        """Return True when this listing is linked to a Product."""
        # Compare the raw foreign-key column: reading self.product would
        # load the related row, one query per line of the admin list.
        return self.product_id is not None
    # Render the value as an on/off icon in the admin change list.
    is_matched.boolean = True

    def __unicode__(self):
        return self.name
83 |
--------------------------------------------------------------------------------
/main/settings_default.py:
--------------------------------------------------------------------------------
1 | import os
2 | import datetime
3 |
4 | import djcelery
5 |
"""
Django settings for cmp project.

For more information on this file, see
https://docs.djangoproject.com/en/1.6/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/1.6/ref/settings/
"""

# Build paths inside the project like this: os.path.join(BASE_DIR, ...)

BASE_DIR = os.path.dirname(os.path.dirname(__file__))

# preparing Celery

djcelery.setup_loader()


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.6/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
# NOTE(review): this key is committed to the repository; override it in
# settings_local.py for any real deployment.
SECRET_KEY = '*@1!+rr*%ewd#3n$426iw^q%9@1tta4e%#hji&%@n6ulyko6it'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

TEMPLATE_DEBUG = True

ALLOWED_HOSTS = []


# The trailing comma matters: without it the parentheses only group the
# expression and the setting becomes a plain string instead of a tuple.
TEMPLATE_DIRS = (
    os.path.join(BASE_DIR, 'template'),
)

# Application definition

INSTALLED_APPS = (
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',

    'mptt',
    'djcelery',

    'catalog',
    'crawler',
)

MIDDLEWARE_CLASSES = (
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
)

ROOT_URLCONF = 'main.urls'

WSGI_APPLICATION = 'main.wsgi.application'


# Database
# https://docs.djangoproject.com/en/1.6/ref/settings/#databases

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': 'db.havuc',
    }
}

# Internationalization
# https://docs.djangoproject.com/en/1.6/topics/i18n/

LANGUAGE_CODE = 'tr-TR'

TIME_ZONE = 'GMT'

USE_I18N = True

USE_L10N = True

USE_TZ = False


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.6/howto/static-files/

STATIC_URL = '/static/'

# media files
# /var/www/project/static/media
MEDIA_ROOT = os.path.join(BASE_DIR, 'static', 'media')

# media url
# /media/hede.jpg
MEDIA_URL = '/media/'

# celery configuration
BROKER_BACKEND = 'redis'
BROKER_HOST = '127.0.0.1'
BROKER_USER = ""
BROKER_PASSWORD = ""
REDIS_PORT = 6379

# Celery routing configuration
CELERY_ROUTES = {
    'crawler.tasks.crawler_job': {'queue': 'scheduled_tasks'},
    'crawler.tasks.crawle_resource': {'queue': 'crawler'},
}

# Periodic tasks: run the crawler job every 30 minutes.
CELERYBEAT_SCHEDULE = {
    'crawler_job': {
        'task': 'crawler.tasks.crawler_job',
        'schedule': datetime.timedelta(minutes=30)
    }
}
129 |
--------------------------------------------------------------------------------
/crawler/resources.py:
--------------------------------------------------------------------------------
from decimal import Decimal, InvalidOperation
from urlparse import urlparse

from lxml import html as parser

from crawler.exceptions import ParseError
5 |
6 |
class BaseResource(object):
    """Base class for site-specific product page parsers.

    Subclasses implement ``parse_price``, ``parse_currency`` and
    ``parse_image_url``; each stores its result on the instance, where the
    matching ``get_*`` accessor returns it.
    """

    def __init__(self, url):
        self.url = url
        # Values reported by the accessors until a page has been parsed.
        self.price = 0
        self.currency = None
        self.image_url = 0
        self.document = None

    def prepare_document(self, content):
        """Parse ``content`` as HTML and run the three parse_* hooks.

        Raises:
            ParseError: if ``content`` cannot be parsed into a document.
        """
        try:
            self.document = parser.fromstring(content)
        except Exception:
            # Any parser failure is reported as ParseError; unlike a bare
            # ``except`` this lets KeyboardInterrupt/SystemExit through.
            raise ParseError('Error occured while parsing '
                             'content. Url: %s' % self.url)
        self.parse_price()
        self.parse_currency()
        self.parse_image_url()

    def get_document(self):
        """Return the parsed document, or None before prepare_document."""
        return self.document

    def get_items(self, xpath):
        """Return every result of evaluating ``xpath`` on the document."""
        return self.document.xpath(xpath)

    def get_item(self, xpath):
        """Return the first result of ``xpath``.

        Raises:
            ParseError: if nothing in the document matches ``xpath``.
        """
        items = self.get_items(xpath)
        if not items:
            # Report a missing node as a parse failure rather than letting
            # an IndexError escape from the [0] lookup.
            raise ParseError('No element matches %s. Url: %s'
                             % (xpath, self.url))
        return items[0]

    def get_node_value(self, xpath):
        """Return the ``text`` of the first node matching ``xpath``."""
        return self.get_item(xpath).text

    def get_attribute_value(self, xpath, attr):
        """Return attribute ``attr`` of the first node matching ``xpath``."""
        return self.get_item(xpath).get(attr)

    def get_url(self):
        """Return the page URL given to the constructor."""
        return self.url

    def get_base_url(self):
        """Return ``scheme://hostname`` of the page URL."""
        parsed_url = urlparse(self.url)
        return '%s://%s' % (parsed_url.scheme, parsed_url.hostname)

    def parse_price(self):
        """Extract the price into ``self.price`` (subclass hook)."""
        # NotImplemented is a comparison sentinel and is not callable; the
        # exception for an unimplemented hook is NotImplementedError.
        raise NotImplementedError()

    def parse_image_url(self):
        """Extract the image URL into ``self.image_url`` (subclass hook)."""
        raise NotImplementedError()

    def parse_currency(self):
        """Extract the currency into ``self.currency`` (subclass hook)."""
        raise NotImplementedError()

    def get_price(self):
        """Return the value stored by parse_price."""
        return self.price

    def get_currency(self):
        """Return the value stored by parse_currency."""
        return self.currency

    def get_image_url(self):
        """Return the value stored by parse_image_url."""
        return self.image_url
64 |
65 |
class VatanBilgisayarResource(BaseResource):
    """Parser for Vatan Bilgisayar product pages."""

    def parse_price(self):
        """Read the sale price label; store None if it is not a number."""
        xpath = '//*[@id="ctl00_u14_ascUrunDetay_dtUrunD' \
                'etay_ctl00_lblSatisFiyat"]'
        # Dots are dropped before conversion (presumably thousands
        # separators - confirm against the live page).
        price = self.get_node_value(xpath).replace('.', '')
        try:
            self.price = Decimal(price)
        except (InvalidOperation, ValueError, TypeError):
            # Decimal signals malformed text with InvalidOperation. The old
            # ``except ValueError, TypeError`` form caught ValueError only
            # and merely bound it to the name TypeError.
            self.price = None

    def parse_currency(self):
        """The currency is fixed to 'TL' for this site."""
        self.currency = 'TL'

    def parse_image_url(self):
        """Build the image URL from the first slider image's ``src``."""
        xpath = '//*[@class="slider"]/li[1]/a/img'
        # The src is appended to scheme://hostname of the page URL.
        self.image_url = '%s%s' % (self.get_base_url(),
                                   self.get_attribute_value(xpath, 'src'))
83 |
84 |
class BimeksResource(BaseResource):
    """Parser for Bimeks product pages."""

    def parse_price(self):
        """Join the whole and fractional parts of the price.

        The whole part is the text of the price ``span``; the fractional
        part comes from the nested ``small`` element, whose first
        space-separated token is used without its leading character.
        Stores None if the joined text is not a number.
        """
        thousand_xpath = '//*[@id="ctl00_cphcontent_detay_urun1_FormView_' \
                         'Urun_Detay_PanelPrices"]/div[1]/span'
        decimal_xpath = '//*[@id="ctl00_cphcontent_detay_urun1_FormView_' \
                        'Urun_Detay_PanelPrices"]/div[1]/span/small'
        thousand_value = self.get_node_value(thousand_xpath)
        decimal_value = self.get_node_value(decimal_xpath).split(' ')[0][1:]
        price = '%s.%s' % (thousand_value, decimal_value)
        try:
            self.price = Decimal(price)
        except (InvalidOperation, ValueError, TypeError):
            # Decimal signals malformed text with InvalidOperation. The old
            # ``except ValueError, TypeError`` form caught ValueError only.
            self.price = None

    def parse_currency(self):
        """Take the second space-separated token of the ``small`` text."""
        xpath = '//*[@id="ctl00_cphcontent_detay_urun1_FormView_' \
                'Urun_Detay_PanelPrices"]/div[1]/span/small'
        self.currency = self.get_node_value(xpath).split(' ')[1]

    def parse_image_url(self):
        """Use the ``href`` of the first thumbnail link."""
        xpath = '//*[@id="thumbs"]/ul/li[1]/a'
        self.image_url = self.get_attribute_value(xpath, 'href')
107 |
108 |
class HepsiBuradaResource(BaseResource):
    """Parser for HepsiBurada product pages."""

    def parse_price(self):
        """Read price and currency from the price label.

        The label text is split on spaces: the first token is the amount
        (dots removed, comma turned into the decimal point) and the second
        token is the currency. The price is stored as None if the amount
        is not a number.
        """
        xpath = '//*[@id="ctl00_ContentPlaceHolder1_ProductControl1_' \
                'MainControl1_ProductMain1_lblPriceFirst"]'
        value = self.get_node_value(xpath).split(' ')
        price = value[0].replace('.', '').replace(',', '.')
        currency = value[1]
        try:
            self.price = Decimal(price)
        except (InvalidOperation, ValueError, TypeError):
            # Decimal signals malformed text with InvalidOperation. The old
            # ``except ValueError, TypeError`` form caught ValueError only.
            self.price = None
        self.currency = currency

    def parse_currency(self):
        """Nothing to do: parse_price already stored the currency."""
        pass

    def parse_image_url(self):
        """Use the ``src`` of the first big product image."""
        xpath = '//*[@id="ctl00_ContentPlaceHolder1_ProductControl1_' \
                'MainControl1_TabControl1_TabImage1_rptBigImages_ctl00_imgBigImage"]'
        self.image_url = self.get_attribute_value(xpath, 'src')
129 |
130 |
class HizliAlResource(BaseResource):
    """Parser for HizliAl product pages."""

    def _price_xpath(self):
        """Return the xpath of the element that holds the price.

        When the second ``span`` of the price box contains 'ndirim' the
        price is read from the second ``div``; otherwise from ``div``.
        """
        spans = self.get_items(
            '//*[@id="content_ProductPrices1_divFiyat"]/span')
        if len(spans) > 1:
            # A span without text has text None; treat it as empty.
            label = (spans[1].text or u'').encode('utf8')
            if 'ndirim' in label:
                return '//*[@id="content_ProductPrices1_divFiyat"]/div[2]'
        return '//*[@id="content_ProductPrices1_divFiyat"]/div'

    def parse_price(self):
        """Read the price text; store None if it is not a number."""
        price = self.get_node_value(self._price_xpath()).strip()
        # Drop dots and turn the comma into the decimal point.
        price = price.replace('.', '').replace(',', '.')
        try:
            self.price = Decimal(price)
        except (InvalidOperation, ValueError, TypeError):
            # Decimal signals malformed text with InvalidOperation. The old
            # ``except ValueError, TypeError`` form caught ValueError only.
            self.price = None

    def parse_currency(self):
        """Read the currency from the ``span`` inside the price element."""
        xpath = '%s/span' % self._price_xpath()
        self.currency = self.get_node_value(xpath).strip()

    def parse_image_url(self):
        """Use the ``href`` of the first zoom thumbnail link."""
        xpath = '//*[@id="imagezoom_thum"]/div/ul/li[1]/a'
        self.image_url = self.get_attribute_value(xpath, 'href')
--------------------------------------------------------------------------------