├── main ├── __init__.py ├── settings.py ├── wsgi.py ├── urls.py └── settings_default.py ├── catalog ├── __init__.py ├── management │ ├── __init__.py │ └── commands │ │ ├── __init__.py │ │ └── import_csv.py ├── tests.py ├── views.py ├── admin.py └── models.py ├── crawler ├── __init__.py ├── models.py ├── tests.py ├── admin.py ├── views.py ├── exceptions.py ├── helpers.py ├── backends │ └── __init__.py ├── tasks.py └── resources.py ├── .gitignore ├── docs └── screenshots │ ├── screencapture-localhost-8000-admin-catalog-product-2.png │ └── screencapture-localhost-8000-admin-catalog-product.png ├── requirements.pip ├── manage.py ├── README.md └── Untitled Diagram.xml /main/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /catalog/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /catalog/management/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /catalog/management/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /catalog/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 
4 | -------------------------------------------------------------------------------- /crawler/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | # Create your models here. 4 | -------------------------------------------------------------------------------- /crawler/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /catalog/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import render 2 | 3 | # Create your views here. 4 | -------------------------------------------------------------------------------- /crawler/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /crawler/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import render 2 | 3 | # Create your views here. 
4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.swo 3 | .idea 4 | db.* 5 | static/media 6 | *.pyc 7 | celerybeat-schedule 8 | settings_local.py -------------------------------------------------------------------------------- /crawler/exceptions.py: -------------------------------------------------------------------------------- 1 | class ConnectionError(Exception): 2 | pass 3 | 4 | 5 | class ParseError(Exception): 6 | pass -------------------------------------------------------------------------------- /main/settings.py: -------------------------------------------------------------------------------- 1 | from settings_default import * 2 | try: 3 | from settings_local import * 4 | except ImportError: 5 | pass -------------------------------------------------------------------------------- /docs/screenshots/screencapture-localhost-8000-admin-catalog-product-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibrahimgunduz34/havuc/HEAD/docs/screenshots/screencapture-localhost-8000-admin-catalog-product-2.png -------------------------------------------------------------------------------- /docs/screenshots/screencapture-localhost-8000-admin-catalog-product.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibrahimgunduz34/havuc/HEAD/docs/screenshots/screencapture-localhost-8000-admin-catalog-product.png -------------------------------------------------------------------------------- /requirements.pip: -------------------------------------------------------------------------------- 1 | MySQL-python==1.2.5 2 | PIL==1.1.7 3 | redis==2.10.3 4 | https://pypi.python.org/packages/source/l/lxml/lxml-3.4.1.tar.gz#md5=b7696a3f33d5610b215a343216ab5624 5 | Django==1.6.2 6 | django-mptt==0.6.1 7 | 
django-celery==3.1.16 8 | -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "main.settings") 7 | 8 | from django.core.management import execute_from_command_line 9 | 10 | execute_from_command_line(sys.argv) 11 | -------------------------------------------------------------------------------- /main/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for cmp project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.6/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "main.settings") 12 | 13 | from django.core.wsgi import get_wsgi_application 14 | application = get_wsgi_application() 15 | -------------------------------------------------------------------------------- /main/urls.py: -------------------------------------------------------------------------------- 1 | from django.conf import settings 2 | from django.conf.urls import patterns, include, url 3 | from django.conf.urls.static import static 4 | 5 | from django.contrib import admin 6 | admin.autodiscover() 7 | 8 | urlpatterns = patterns('', 9 | # Examples: 10 | # url(r'^$', 'cmp.views.home', name='home'), 11 | # url(r'^blog/', include('blog.urls')), 12 | 13 | url(r'^admin/', include(admin.site.urls)), 14 | 15 | ) 16 | 17 | 18 | if settings.DEBUG: 19 | urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) -------------------------------------------------------------------------------- /crawler/helpers.py: -------------------------------------------------------------------------------- 1 | 
def load_class(full_class_path):
    """Resolve a dotted path to the object it names.

    ``'package.module.Name'`` imports ``package.module`` and returns its
    ``Name`` attribute. A path that contains no dot is treated as a module
    name and the imported module itself is returned.

    :param full_class_path: dotted import path, e.g. ``'crawler.resources.Foo'``
    :returns: the attribute named by the last path component, or the module
        when the path has a single component
    :raises ImportError: if the module cannot be imported or does not define
        the requested attribute
    """
    # rpartition splits on the LAST dot: ('a.b', '.', 'C') for 'a.b.C' and
    # ('', '', 'json') for a dot-less path.
    module_name, _, class_name = full_class_path.rpartition('.')

    if not module_name:
        # The previous implementation tested ``list.count > 1`` (a bound
        # method compared with an int) and then tried to import an empty
        # module name; import the single component itself instead.
        return importlib.import_module(full_class_path)

    module = importlib.import_module(module_name)
    if not hasattr(module, class_name):
        raise ImportError(
            'No class exists %s in %s' % (class_name, module_name))
    return getattr(module, class_name)
class Command(BaseCommand):
    """Management command that imports provider products from a CSV file.

    Options:
        --filename / -f   path of the CSV file to read (required)
        --resource_slug   slug of the Resource the rows belong to (required)

    The first column of every row is used as the product name and the
    second as its URL. One ``ProviderProduct`` is created per row; rows whose
    creation raises ``IntegrityError`` are reported on stdout and skipped.
    """
    option_list = BaseCommand.option_list + (
        make_option('--filename', '-f'),
        make_option('--resource_slug'),
    )

    def validate_options(self, **options):
        """Raise CommandError unless both required options were supplied."""
        if not options.get('filename'):
            raise CommandError('filename is required!')

        if not options.get('resource_slug'):
            raise CommandError('resource_slug is required!')

    def get_resource(self, resource_slug):
        """Return the Resource with the given slug or raise CommandError."""
        try:
            return Resource.objects.get(slug=resource_slug)
        except Resource.DoesNotExist:
            raise CommandError('The specified resource is not found.')

    def encode_utf8(self, data):
        """Yield every line of ``data`` after a UTF-8 decode/encode round trip.

        Valid UTF-8 passes through unchanged; a line that is not valid UTF-8
        makes ``decode`` raise ``UnicodeDecodeError``.
        """
        for line in data:
            yield line.decode('utf-8').encode('utf8')

    def handle(self, *args, **options):
        """Validate the options, then create a ProviderProduct per CSV row."""
        self.validate_options(**options)
        resource = self.get_resource(options.get('resource_slug'))

        try:
            # Python 2's csv module expects the file to be opened in binary
            # mode so that it can do its own newline handling.
            fh = open(options.get('filename'), 'rb')
        except IOError as exc:
            # Report an unreadable/missing file as a normal command error
            # instead of an unhandled traceback.
            raise CommandError('Unable to open file: %s' % exc)

        with fh:
            reader = csv.reader(self.encode_utf8(fh), delimiter=",")
            for row in reader:
                if len(row) < 2:
                    # Blank or truncated line: csv yields [] (or a single
                    # cell), which previously crashed on row[1].
                    continue
                try:
                    ProviderProduct.objects.create(
                        name=unicode(row[0], 'utf8'),
                        url=row[1], resource=resource)
                except IntegrityError:
                    self.stdout.write('%s is already exist.' % row[1])
class ProductPriceAdmin(admin.TabularInline):
    """Tabular inline for ProductPrice rows (listed in ProductAdmin.inlines)."""
    model = ProductPrice
    # creation_date is displayed but cannot be edited in the inline.
    readonly_fields = ['creation_date']
    # Newest rows first, then by resource name, then by price.
    ordering = ['-creation_date', 'resource__name', 'price']
class ProviderProductIsMatchedFilter(admin.SimpleListFilter):
    """Admin list filter on whether a provider product has a linked product.

    ``?is_matched=1`` keeps rows whose ``product`` is set, ``?is_matched=0``
    keeps rows whose ``product`` is NULL.
    """
    title = 'Is matched ?'
    parameter_name = 'is_matched'

    def lookups(self, request, model_admin):
        """Return the (value, label) choices shown in the filter sidebar."""
        return (
            (1, 'Matched'),
            (0, 'Not Matched'),
        )

    def queryset(self, request, queryset):
        """Filter ``queryset`` according to the selected choice.

        Returns the queryset unchanged when no choice is selected or when the
        query-string value is not an integer.
        """
        raw_value = self.value()
        if raw_value is None:
            return queryset
        try:
            matched = bool(int(raw_value))
        except ValueError:
            # The value comes straight from the URL; a hand-edited parameter
            # such as ?is_matched=abc used to raise ValueError here.
            return queryset
        return queryset.filter(product__isnull=not matched)
Aşağıdaki komutu çalıştırarak gerekli Linux paketlerinin kurulumunu gerçekleştirin. 25 | ```shell 26 | sudo apt-get install python-dev libffi-dev libxml2-dev libxslt-dev python-virtualenv redis-server python-mysqldb libmysqlclient-dev 27 | ``` 28 | 29 | . Sanal ortamı aktif duruma getirin ve uygulama için gerekli paketlerin kurulumunu gerçekleştirin. 30 | 31 | ```shell 32 | $ source bin/activate 33 | $ pip install -r requirements.pip 34 | ``` 35 | 36 | . Redis kurulumunu gerçekleştirin. (yoksa) 37 | 38 | ```shell 39 | $ sudo apt-get install redis-server 40 | ``` 41 | 42 | . SQLite veritabanının yaratılması için syncdb komutunu çalıştırın. 43 | 44 | ```shell 45 | $ python manage.py syncdb 46 | ``` 47 | 48 | 49 | # Çalıştırılması: 50 | Havuç, fiyat edinme işlemini arka planda asenkron olarak gerçekleştirdiği için geliştirme ortamında celery kuyruklarını işleyecek django komutları çalıştırılmalıdır. 51 | 52 | ```shell 53 | $ python manage.py celeryd -B 54 | ... 55 | $ python manage.py celeryd -Q scheduled_tasks,crawler 56 | ``` 57 | 58 | . Development web sunucusunu çalıştırın. 
59 | 60 | ```shell 61 | $ python manage.py runserver 62 | ``` 63 | 64 | # Ekran Görüntüleri: 65 | 66 | Ürün Listesi: 67 | ![Ürün Listesi](/docs/screenshots/screencapture-localhost-8000-admin-catalog-product.png) 68 | 69 | Ürün Detayı: 70 | ![Ürün Detayı](/docs/screenshots/screencapture-localhost-8000-admin-catalog-product-2.png) 71 | -------------------------------------------------------------------------------- /Untitled Diagram.xml: -------------------------------------------------------------------------------- 1 | 7Zxbd6soFMc/TR5Pl4oafWzS9szDXM6azprLI1GSOCWSRUjTzqcfVLyBSWxPMPaUvlQ3CAq//Qc2xgmYb16+Urhd/0JihCeOFb9MwN3EcWwXuPxfZnktLG4YFoYVTWKRqTY8Jv8hYbSEdZ/EaNfKyAjBLNm2jRFJUxSxlg1SSg7tbEuC27Vu4QophscIYtX6VxKztbDallUn/ISS1VpUHXgiYQGjpxUl+1TUN3HAMv8rkjewLEvk361hTA4NE7ifgDklhBVHm5c5wlnbls1WXPdwJLW6b4pS1ueC6QIuoB25i8iZLt2F9cUpSniGeC/aYgJuf1v8m7VyccfstWwlfvPb7HC/wT8nS4STlJ/NtogmG8QQ5SlYmL/VttlhnTD0uIVRdumB88Nta7bB/Mzmh7xLGeSX0OocY7jdJYu8VotbKIr2dJc8o9/RriAns5I9y2qaV0TkWbOeQLEoqmpsKy93k0TiGMMFwrOq6+YEk6z6lOQPtGOUPKHSyHvUyv+qlJKQrIplgnEj50P+l9n5Uz3ATYIzh/gT0RimUJgF/bYjzrsqgjhZpdwW8W7NG1HtZ9H1z4gy9NIwiX7/igjvAPrKs4hUxxUMCh8NxOmhAXzJ+7rBuldmhMLJVlXRNWj8QLDWk7vS85vgybw1KNmSJGV5/d5s4t1J2BHK1mRFUoib4NUwWD86DEfdujcdrteCoyKhRYcKB79NDXAAI0ofk8M3i5JrjUqUOsAzonQxUQJvpkMSJaenKFk64HCNKH1MDt8sSt6oZkqe4e6TcOcHY+LON9x9Eu4CZ0zcTQ13n4S70B8Td4Hh7pNwZ49q0RmaNafGNWfwVjjeGQhzXS0BiY5JmIFjuIVg0IbB6RmAsPVEp9SZUZzstpBF61NQZI+bRBDfigZaEMbIhiegNL7N9s4yGybR09le7tlDKF6hR3ErCC/I4b42zHIDTyjv6lRH7cieRuhYa4hoEoN0hY55f+k/2S2d7HCKMGR8YG3V39V74tJvmac1QLGtblDKIopHEVfVDPD2h6+NbMKBj9bj+t311EgVJdaAVc/4TubM6HRNAZo6kgCBnqORDv0pmdaoP7yl6Ovfosfzk396df+4lMnvo0zhMMrke1Y3QWeU6RLiUfFeA0MR29P0e3EhW8STZjHcraslDk8WXR18MFzCEQ1kMi7A8ttFFHf53QOZP5Ww9KX3VqT8U+tk/ssPfOU4Zwa+Ucy8q+XVuZm3npFP3Rf+YCPf+6SpjzKVfjH0FNvtOcW+iBh07MEZMRgyXiepgd9PDbS8nOSoMZlPIQYl86fVwBtGDUIgzVNk3depBmocZqhpLXpJWMUFP86xuAm9K5PhjWickMlw7X4z2IuQ0bGHZMaJ4faxpUWF5/UbJnwtw4QaOftgw8Qwy18w7SMewTDiEVhHCBpgWAFqeM1ES46ScBKXgWJrMi2+oydY4km65lu6o/5q4M5o1zvXx+WAoxt
class Product(models.Model):
    """A catalog product.

    ``last_price``, ``last_currency``, ``last_check_date`` and
    ``last_change_date`` are written by ``crawler.tasks.crawle_resource``.
    """
    name = models.CharField(max_length=120)
    category = models.ForeignKey(Category)
    small_description = models.CharField(max_length=255, null=True, blank=True)
    long_description = models.TextField(null=True, blank=True)
    last_price = models.DecimalField(max_digits=7, decimal_places=2,
                                     null=True, blank=True)
    last_currency = models.CharField(max_length=3, null=True, blank=True)
    image = models.ImageField(upload_to='product/%Y/%m')
    is_active = models.BooleanField(default=True)
    last_check_date = models.DateTimeField(null=True, blank=True)
    last_change_date = models.DateTimeField(null=True, blank=True)

    def __unicode__(self):
        return self.name

    def admin_detail_image(self):
        """Return an ``<img>`` tag for the product image, for the admin.

        Returns an empty string when no image file is attached.
        """
        if not self.image:
            # Accessing .url on an empty file field raises ValueError.
            return ''
        # The format string was empty, so ``'' % url`` raised
        # "TypeError: not all arguments converted". allow_tags below shows
        # that HTML output is intended.
        return '<img src="%s" />' % self.image.url
    admin_detail_image.allow_tags = True
class Resource(models.Model):
    """A provider (web site) that products are crawled from.

    ``resource_name`` is the name of the scraper class that
    ``crawler.helpers.load_resource`` looks up in ``crawler.resources``.
    """
    name = models.CharField(max_length=50)
    slug = models.SlugField()
    icon = models.ImageField(upload_to='resources')
    resource_name = models.CharField(max_length=50)
    is_active = models.BooleanField(default=True)

    def admin_image(self):
        """Return an ``<img>`` tag for the resource icon, for the admin list.

        Returns an empty string when no icon file is attached.
        """
        if not self.icon:
            # Accessing .url on an empty file field raises ValueError.
            return ''
        # The format string was empty, so ``'' % url`` raised
        # "TypeError: not all arguments converted". allow_tags below shows
        # that HTML output is intended.
        return '<img src="%s" />' % self.icon.url
    admin_image.allow_tags = True

    def __unicode__(self):
        return self.name
# A one-element tuple needs the trailing comma. Without it the parentheses
# are plain grouping and TEMPLATE_DIRS is a string, not a tuple of paths.
TEMPLATE_DIRS = (
    os.path.join(BASE_DIR, 'template'),
)
class BaseResource(object):
    """Base class for site-specific product page scrapers.

    A subclass implements ``parse_price``, ``parse_currency`` and
    ``parse_image_url``; each reads from ``self.document`` and stores its
    result on the instance. ``prepare_document`` parses the raw page content
    and then runs the three parse hooks.
    """

    def __init__(self, url):
        """Remember the page URL and reset all parsed values."""
        self.url = url
        # NOTE(review): 0 (not None) is the initial placeholder for price and
        # image_url; kept as-is so get_price()/get_image_url() return what
        # they always did before a document is prepared.
        self.price = 0
        self.currency = None
        self.image_url = 0
        self.document = None

    def prepare_document(self, content):
        """Parse ``content`` and populate price, currency and image URL.

        :raises ParseError: if the content cannot be parsed
        """
        try:
            self.document = parser.fromstring(content)
        except Exception:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt
            # and SystemExit.
            raise ParseError('Error occured while parsing '
                             'content. Url: %s' % self.url)
        self.parse_price()
        self.parse_currency()
        self.parse_image_url()

    def get_document(self):
        """Return the parsed document (None before prepare_document)."""
        return self.document

    def get_items(self, xpath):
        """Return everything in the document matching ``xpath``."""
        return self.document.xpath(xpath)

    def get_item(self, xpath):
        """Return the first match for ``xpath`` (IndexError if none)."""
        return self.get_items(xpath)[0]

    def get_node_value(self, xpath):
        """Return the ``text`` of the first node matching ``xpath``."""
        return self.get_item(xpath).text

    def get_attribute_value(self, xpath, attr):
        """Return attribute ``attr`` of the first node matching ``xpath``."""
        return self.get_item(xpath).get(attr)

    def get_url(self):
        """Return the URL this resource was created with."""
        return self.url

    def get_base_url(self):
        """Return ``scheme://hostname`` of the resource URL."""
        parsed_url = urlparse(self.url)
        return '%s://%s' % (parsed_url.scheme, parsed_url.hostname)

    # The hooks below previously did ``raise NotImplemented()``.
    # NotImplemented is a non-callable constant, so that raised TypeError
    # instead of signalling an unimplemented method.
    def parse_price(self):
        """Extract the price into ``self.price``; subclasses must override."""
        raise NotImplementedError()

    def parse_image_url(self):
        """Extract the image URL into ``self.image_url``; must be overridden."""
        raise NotImplementedError()

    def parse_currency(self):
        """Extract the currency into ``self.currency``; must be overridden."""
        raise NotImplementedError()

    def get_price(self):
        """Return the parsed price."""
        return self.price

    def get_currency(self):
        """Return the parsed currency."""
        return self.currency

    def get_image_url(self):
        """Return the parsed image URL."""
        return self.image_url
class HizliAlResource(BaseResource):
    """Scraper for HizliAl product pages."""

    # Container element that holds the price markup.
    PRICE_BOX_XPATH = '//*[@id="content_ProductPrices1_divFiyat"]'

    def _price_xpath(self):
        """Return the XPath of the element whose text is the price.

        When the price box has more than one ``span`` and the second one
        contains ``'ndirim'`` the price is read from the second ``div``,
        otherwise from ``div``.
        """
        # NOTE(review): 'ndirim' presumably matches a discount label
        # ("indirim") -- confirm against the live page.
        spans = self.get_items(self.PRICE_BOX_XPATH + '/span')
        # ``text`` is None for an element without leading text; the previous
        # ``.text.encode('utf8')`` raised AttributeError in that case.
        if len(spans) > 1 and 'ndirim' in (spans[1].text or ''):
            return self.PRICE_BOX_XPATH + '/div[2]'
        return self.PRICE_BOX_XPATH + '/div'

    def parse_price(self):
        """Store the page price in ``self.price`` (None if not a number)."""
        raw_price = self.get_node_value(self._price_xpath()).strip()
        # '1.234,56' -> '1234.56': drop thousands dots, use a decimal point.
        normalized = raw_price.replace('.', '').replace(',', '.')
        try:
            self.price = Decimal(normalized)
        except (ArithmeticError, ValueError, TypeError):
            # ``except ValueError, TypeError:`` only caught ValueError (and
            # bound it to the name TypeError). Decimal signals a malformed
            # string with decimal.InvalidOperation, an ArithmeticError
            # subclass, so the fallback below was never reached.
            self.price = None

    def parse_currency(self):
        """Store the currency shown next to the price in ``self.currency``."""
        xpath = self._price_xpath() + '/span'
        self.currency = self.get_node_value(xpath).strip()

    def parse_image_url(self):
        """Store the ``href`` of the first thumbnail link in ``self.image_url``."""
        xpath = '//*[@id="imagezoom_thum"]/div/ul/li[1]/a'
        self.image_url = self.get_attribute_value(xpath, 'href')