├── README.md
├── celerycrawler
│   ├── __init__.py
│   ├── couchviews
│   │   └── db
│   │       ├── page
│   │       │   ├── by_rank
│   │       │   │   └── map.js
│   │       │   ├── by_url
│   │       │   │   └── map.js
│   │       │   └── links_to_url
│   │       │       └── map.js
│   │       └── robotstxt
│   │           └── by_domain
│   │               └── map.js
│   ├── indexer.py
│   ├── management
│   │   ├── __init__.py
│   │   └── commands
│   │       ├── __init__.py
│   │       ├── index_update.py
│   │       ├── start_crawl.py
│   │       └── update_couchdb.py
│   ├── models.py
│   ├── settings.py
│   ├── tasks.py
│   ├── templates
│   │   ├── base.html
│   │   ├── index.html
│   │   └── results.html
│   ├── tests.py
│   ├── urls.py
│   ├── utils.py
│   ├── views.py
│   └── wsgi.py
└── manage.py

/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewjw/celery-crawler/b3664f26b61707ad54560fbea16fa858c7f68e1e/README.md
--------------------------------------------------------------------------------

/celerycrawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewjw/celery-crawler/b3664f26b61707ad54560fbea16fa858c7f68e1e/celerycrawler/__init__.py
--------------------------------------------------------------------------------

/celerycrawler/couchviews/db/page/by_rank/map.js:
--------------------------------------------------------------------------------
function (doc) {
    if(doc.type == "page") {
        emit(-doc.rank, doc._id);
    }
}
--------------------------------------------------------------------------------

/celerycrawler/couchviews/db/page/by_url/map.js:
--------------------------------------------------------------------------------
function (doc) {
    if(doc.type == "page") {
        emit(doc.url, doc._id);
    }
}
--------------------------------------------------------------------------------

/celerycrawler/couchviews/db/page/links_to_url/map.js:
--------------------------------------------------------------------------------
function (doc) {
    if(doc.type == "page") {
        for(var i = 0; i < doc.links.length; i++) {
            emit(doc.links[i], [doc.rank, doc.links.length]);
        }
    }
}
--------------------------------------------------------------------------------

/celerycrawler/couchviews/db/robotstxt/by_domain/map.js:
--------------------------------------------------------------------------------
function (doc) {
    if(doc.type == "robotstxt") {
        emit([doc.protocol, doc.domain], doc._id);
    }
}
--------------------------------------------------------------------------------

/celerycrawler/indexer.py:
--------------------------------------------------------------------------------
import os

from whoosh import index
from whoosh.fields import *

schema = Schema(
    title=TEXT(stored=True),
    url=ID(stored=True, unique=True),
    desc=ID(stored=True),
    rank=NUMERIC(stored=True, numtype=float),
    raw=TEXT,
    content=TEXT)

_ix = None

def get_index():
    global _ix

    if _ix is not None:
        pass
    elif not os.path.exists("indexdir"):
        os.mkdir("indexdir")
        _ix = index.create_in("indexdir", schema)
    else:
        _ix = index.open_dir("indexdir")

    return _ix

def get_writer():
    return get_index().writer()

def get_searcher():
    return get_index().searcher()

def get_last_change():
    get_index()  # create the index directory if it does not exist yet

    if os.path.exists("indexdir/since.txt"):
        try:
            return int(open("indexdir/since.txt").read())
        except ValueError:
            return 0
    else:
        return 0

def set_last_change(since):
    get_index()  # create the index directory if it does not exist yet

    open("indexdir/since.txt", "w").write(str(since))
--------------------------------------------------------------------------------
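Example (not part of the repository): views.py is not included in this dump, so the
snippet below is only a sketch of how the Whoosh index defined in indexer.py could be
queried. The helper name search_pages and the choice to search the content field are
assumptions, not the project's actual search view.

    from whoosh.qparser import QueryParser

    from celerycrawler.indexer import get_searcher, schema

    def search_pages(query_string, limit=10):
        # hypothetical search helper; "title", "url", "desc" and "rank" are stored
        # fields in the schema above, so they can be read straight off each hit
        searcher = get_searcher()
        try:
            query = QueryParser("content", schema).parse(query_string)
            hits = searcher.search(query, limit=limit)
            return [(hit["title"], hit["url"], hit["rank"], hit.score) for hit in hits]
        finally:
            searcher.close()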
/celerycrawler/management/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewjw/celery-crawler/b3664f26b61707ad54560fbea16fa858c7f68e1e/celerycrawler/management/__init__.py
--------------------------------------------------------------------------------

/celerycrawler/management/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewjw/celery-crawler/b3664f26b61707ad54560fbea16fa858c7f68e1e/celerycrawler/management/commands/__init__.py
--------------------------------------------------------------------------------

/celerycrawler/management/commands/index_update.py:
--------------------------------------------------------------------------------
import re
import couchdb
import requests
import lxml

from pprint import pprint
from lxml.html import document_fromstring
from lxml.html.clean import Cleaner
from celerycrawler import settings
from celerycrawler.indexer import get_writer, get_last_change, set_last_change
from django.core.management.base import BaseCommand, CommandError


class Command(BaseCommand):
    def handle(self, **options):
        since = get_last_change()
        writer = get_writer()

        last_change = since
        while True:
            doc = {}

            changes = settings.db.changes(since=since)
            since = changes["last_seq"]

            if since != last_change:
                print("Detected {} new tasks".format(len(changes["results"])))
                print("=== changes ===")
                pprint(changes)
            for changeset in changes["results"]:
                try:
                    doc = settings.db[changeset["id"]]
                except couchdb.http.ResourceNotFound:
                    print("resource not found")
                    continue

                if not ("type" in doc and "page" in doc["type"]):
                    if since != last_change:
                        print("not processing doc: {}".format(str(doc)))
                    last_change = since
                    continue

                print("indexing", doc["url"])

                #####
                # raw, html, text
                #####################
                raw = doc['content']
                print("type(RAW) = %s" % type(raw))
                tree = document_fromstring(str(raw))
                title = ' '.join([title for title in tree.xpath('//title/text()')])

                # enable filters to remove Javascript and CSS from HTML document
                cleaner = Cleaner()
                cleaner.javascript = True
                cleaner.style = True
                cleaner.html = True
                cleaner.page_structure = False
                cleaner.meta = False
                cleaner.safe_attrs_only = False
                cleaner.links = False

                html = cleaner.clean_html(tree)
                text_content = html.text_content()

                description = ' '.join(tree.xpath("//meta[@name='description']/@content"))

                writer.update_document(
                    title=title,
                    url=doc['url'],
                    desc=description,
                    rank=doc['rank'],
                    content='\n'.join([title, doc['url'], text_content]),
                    raw=raw,
                )

            writer.commit()
            writer = get_writer()
            set_last_change(since)
            last_change = since
--------------------------------------------------------------------------------
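Example (not part of the repository): the loop in index_update.py consumes the CouchDB
_changes feed through couchdb-python's Database.changes(). For reference, the structure
it iterates over looks roughly like this (values illustrative):

    changes = {
        "results": [
            {"seq": 42,
             "id": "6e1295ed6c29495e54cc05947f18c8af",
             "changes": [{"rev": "2-7051cbe5c8faecd085a3fa619e6e6337"}]},
        ],
        "last_seq": 42,
    }

The command persists last_seq via set_last_change(), so a later run of
"./manage.py index_update" resumes from that sequence number instead of re-reading the
whole database.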
/celerycrawler/management/commands/start_crawl.py:
--------------------------------------------------------------------------------
from django.core.management.base import BaseCommand, CommandError
from celerycrawler.tasks import retrieve_page

class Command(BaseCommand):

    def handle(self, url, **options):
        print("handling: {}".format(url))
        retrieve_page.delay(url, rank=1)
--------------------------------------------------------------------------------

/celerycrawler/management/commands/update_couchdb.py:
--------------------------------------------------------------------------------
import couchdb
import glob
import os

from celerycrawler import settings
from django.core.management.base import NoArgsCommand

class Command(NoArgsCommand):
    help = "Update couchdb views"

    can_import_settings = True

    def handle_noargs(self, **options):
        # resolve celerycrawler/couchviews relative to this file
        couchdir = os.path.realpath(
            os.path.join(os.path.dirname(__file__), "..", "..", "couchviews"))

        databases = glob.glob(couchdir + "/*")
        for d in databases:
            if not os.path.isdir(d):
                continue

            # the directory name (e.g. "db") names the settings attribute holding the database
            db = getattr(settings, d.split("/")[-1])

            for design in glob.glob(d + "/*"):
                design = design.split("/")[-1]
                try:
                    doc = db["_design/" + design]
                except couchdb.http.ResourceNotFound:
                    doc = {"_id": "_design/" + design}

                doc["views"] = {}
                for mapreduce in glob.glob(d + "/" + design + "/*"):
                    mapreduce = mapreduce.split("/")[-1]
                    mr = {}
                    mr["map"] = open(d + "/" + design + "/" + mapreduce + "/map.js").read()
                    try:
                        mr["reduce"] = open(d + "/" + design + "/" + mapreduce + "/reduce.js").read()
                    except IOError:
                        pass

                    doc["views"][mapreduce] = mr

                db["_design/" + design] = doc
--------------------------------------------------------------------------------
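Example (not part of the repository): running "./manage.py update_couchdb" uploads the
map functions under couchviews/ as design documents, and "./manage.py start_crawl <url>"
queues the first retrieve_page task. The design document written for the page views ends
up shaped roughly like this (no reduce.js files exist in the repository, so only "map"
keys appear; the robotstxt design document is built the same way):

    page_design_doc = {
        "_id": "_design/page",
        "views": {
            "by_rank":      {"map": "<contents of couchviews/db/page/by_rank/map.js>"},
            "by_url":       {"map": "<contents of couchviews/db/page/by_url/map.js>"},
            "links_to_url": {"map": "<contents of couchviews/db/page/links_to_url/map.js>"},
        },
    }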
/celerycrawler/models.py:
--------------------------------------------------------------------------------
import pickle
import time
import requests

from celerycrawler import settings
from datetime import datetime
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen, Request, HTTPError
from urllib.request import install_opener, build_opener, HTTPRedirectHandler
from couchdb.mapping import Document, TextField, DateTimeField, ListField, FloatField
from django.core.cache import cache

install_opener(build_opener(HTTPRedirectHandler()))

class Page(Document):
    type = TextField(default="page")
    url = TextField()
    raw = TextField()
    content = TextField()
    links = ListField(TextField())
    rank = FloatField(default=0)
    last_checked = DateTimeField(default=datetime.now)

    def is_valid(self):
        return (datetime.now() - self.last_checked).days < 7

    def update(self):
        print("updating page")

        parse = urlparse(self.url)
        robotstxt = RobotsTxt.get_by_domain(parse.scheme, parse.netloc)
        #if not robotstxt.is_allowed(self.url):
        #    return False

        # crude per-domain politeness lock: wait while another worker holds the
        # cache key for this domain, then claim it for ten seconds
        while cache.get(parse.netloc) is not None:
            time.sleep(1)

        cache.set(parse.netloc, True, 10)

        print("getting: {}".format(self.url))
        resp = requests.get(self.url, headers={'User-Agent':
                                               settings.USER_AGENT})

        ctype = resp.headers.get('content-type', '')
        if not ctype.startswith("text/html"):
            print("unsupported content-type: {}".format(ctype))
            return

        print("setting Page.content...")
        self.content = resp.text
        self.raw = resp.text

        self.last_checked = datetime.now()
        self.store(settings.db)

    @staticmethod
    def count():
        r = settings.db.view("page/by_url", limit=0)
        return r.total_rows

    @staticmethod
    def get_top_by_rank(limit=10):
        r = settings.db.view("page/by_rank", limit=limit)
        docs = []
        for row in r.rows:
            docs.append(Page.load(settings.db, row.value))
        return docs

    @staticmethod
    def get_by_url(url, update=True):
        r = settings.db.view("page/by_url", key=url)
        if len(r.rows) == 1:
            doc = Page.load(settings.db, r.rows[0].value)
            if doc.is_valid():
                return doc
        elif not update:
            return None
        else:
            doc = Page(url=url)

        print("Page.get_by_url: doc.update() ...")
        doc.update()

        return doc

    @staticmethod
    def get_id_by_url(url, update=True):
        r = settings.db.view("page/by_url", key=url)
        if len(r) == 1:
            return r.rows[0].value
        else:
            doc = Page.get_by_url(url, update=update)
            if doc is not None:
                return doc.id
            else:
                return None

    @staticmethod
    def get_links_to_url(url):
        return [row.value for row in settings.db.view("page/links_to_url", key=url).rows]

class RobotsTxt(Document):
    type = TextField(default="robotstxt")

    domain = TextField()
    protocol = TextField()

    robot_parser_pickle = TextField()

    def _get_robot_parser(self):
        # note: robot_parser_pickle is never read back, so this always returns a
        # fresh parser whose mtime() stays 0 until update() calls read()/modified()
        parser = RobotFileParser()
        parser.set_url(self.protocol + "://" + self.domain + "/robots.txt")

        return parser

    def is_valid(self):
        parser = self._get_robot_parser()
        return (time.time() - parser.mtime()) < 7*24*60*60

    def is_allowed(self, url):
        parser = self._get_robot_parser()
        return parser.can_fetch(settings.USER_AGENT, url)

    def update(self):
        while cache.get(self.domain) is not None:
            time.sleep(1)
        cache.set(self.domain, True, 10)

        print("getting %s://%s/robots.txt" % (self.protocol, self.domain))
        parser = self._get_robot_parser()
        parser.read()
        parser.modified()

        self.store(settings.db)

    @staticmethod
    def get_by_domain(protocol, domain):
        r = settings.db.view("robotstxt/by_domain", key=[protocol, domain])
        if len(r) > 0:
            doc = RobotsTxt.load(settings.db, r.rows[0].value)
            if doc.is_valid():
                return doc
        else:
            doc = RobotsTxt(protocol=protocol, domain=domain)

        doc.update()
        doc.store(settings.db)

        return doc
--------------------------------------------------------------------------------
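Example (not part of the repository): a crawled page ends up in CouchDB as a document
with the fields declared by the couchdb.mapping schema in models.py. The _id and values
below are invented for illustration:

    example_page = {
        "_id": "6e1295ed6c29495e54cc05947f18c8af",
        "type": "page",
        "url": "http://example.com/",
        "raw": "<html>...</html>",
        "content": "<html>...</html>",
        "links": ["http://www.iana.org/domains/example"],
        "rank": 0.15,
        "last_checked": "2014-11-02T20:14:12Z",
    }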
SECRET_KEY = 'ua$=!zpmt&@_=paocb_d_hxg78kdmxu3%f$e&15_eecg%k-2uv'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

TEMPLATE_DEBUG = True

ALLOWED_HOSTS = []


# Application definition

INSTALLED_APPS = (
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'djcelery',
    'celerycrawler'
)

MIDDLEWARE_CLASSES = (
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
)

ROOT_URLCONF = 'celerycrawler.urls'

WSGI_APPLICATION = 'celerycrawler.wsgi.application'


# Database
# https://docs.djangoproject.com/en/1.7/ref/settings/#databases

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
    }
}

# Internationalization
# https://docs.djangoproject.com/en/1.7/topics/i18n/

LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_L10N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.7/howto/static-files/

STATIC_URL = '/static/'

########################
# celery configuration #
########################

import djcelery
djcelery.setup_loader()
BROKER_BACKEND = "couchdb"
BROKER_HOST = "localhost"
BROKER_PORT = 5984
BROKER_VHOST = "celerycrawler"

#BROKER_URL = 'amqp://guest:guest@localhost:5672'

CELERYD_CONCURRENCY = 5
CELERY_ACCEPT_CONTENT = ['pickle', 'json', 'msgpack', 'yaml']
CELERY_QUEUES = {"retrieve": {"exchange": "default",
                              "exchange_type": "direct",
                              "routing_key": "retrieve"},
                 "process": {"exchange": "default",
                             "exchange_type": "direct",
                             "routing_key": "process"},
                 "celery": {"exchange": "default",
                            "exchange_type": "direct",
                            "routing_key": "celery"}}


class Router(object):
    def route_for_task(self, task, args=None, kwargs=None):
        if task == "celerycrawler.tasks.retrieve_page":
            return { "queue": "retrieve" }
        else:
            return { "queue": "process" }

CELERY_ROUTES = (Router(), )

import couchdb

server = couchdb.Server()
try:
    db = server.create("celerycrawler")
except couchdb.http.PreconditionFailed:
    # the database already exists
    db = server["celerycrawler"]

USER_AGENT = 'ua'
--------------------------------------------------------------------------------
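Example (not part of the repository): the Router above sends fetch tasks to the
"retrieve" queue and every other crawler task to "process", presumably so that slow
network fetches do not hold up the rank and indexing work (workers can be limited to a
single queue with the worker's -Q/--queues option). A quick illustration, assuming
Router is imported from celerycrawler.settings:

    from celerycrawler.settings import Router

    router = Router()
    assert router.route_for_task("celerycrawler.tasks.retrieve_page") == {"queue": "retrieve"}
    assert router.route_for_task("celerycrawler.tasks.calculate_rank") == {"queue": "process"}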
/celerycrawler/tasks.py:
--------------------------------------------------------------------------------
from lxml.html import document_fromstring
from urllib.parse import urlparse, urljoin
from celerycrawler import settings
from celery.decorators import task
from celerycrawler.models import Page, RobotsTxt
from celerycrawler.utils import unescape


@task
def retrieve_page(url, rank=None):
    print("retrieve_page {}".format(url))
    page = Page.get_by_url(url, update=True)
    if page is None:
        print("Page is None")
        return

    if rank is not None:
        page.rank = rank
        page.store(settings.db)

    if page.id is None:
        page.update()

    find_links.delay(page.id)

@task
def find_links(doc_id):
    print("in find_links")
    if doc_id is None:
        print("doc_id = None")
        return False

    doc = Page.load(settings.db, doc_id)

    if not hasattr(doc, 'content'):
        print("Got None for the content of %s -> %s." % (doc_id, doc.url))
        return False
    elif not doc['content']:
        print("tasks.py:elif not doc.content")
        return False

    tree = document_fromstring(doc.content)
    for a in tree.xpath('//a'):
        href = a.get('href')
        if not href:
            # skip anchors without an href attribute
            continue
        link = urljoin(doc['url'], href)
        doc.links.append(link)

    doc.store(settings.db)

    calculate_rank.delay(doc.id)

    for link in doc.links:
        p = Page.get_id_by_url(link, update=False)
        if p is not None:
            calculate_rank.delay(p)
        else:
            retrieve_page.delay(link)

    print("find_links {} -> {}".format(doc.url, len(doc.links)))

@task
def calculate_rank(doc_id):
    print("in calculate_rank")
    page = Page.load(settings.db, doc_id)

    links = Page.get_links_to_url(page.url)

    rank = 0
    for link in links:
        rank += link[0] / link[1]

    old_rank = page.rank
    page.rank = rank * 0.85

    if page.rank == 0:
        n_links = settings.db.view("page/by_url", limit=0).total_rows
        page.rank = 1.0 / n_links

    if abs(old_rank - page.rank) > 0.0001:
        print("%s: %s -> %s" % (page.url, old_rank, page.rank))
        page.store(settings.db)

        for link in page.links:
            p = Page.get_id_by_url(link, update=False)
            if p is not None:
                calculate_rank.delay(p)
--------------------------------------------------------------------------------
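Example (not part of the repository): a worked instance of the update in calculate_rank.
The page/links_to_url view emits [doc.rank, doc.links.length] for every page that links
to the target URL, so a page linked to from A (rank 0.5, 5 outgoing links) and B
(rank 0.2, 2 outgoing links) receives:

    links = [[0.5, 5], [0.2, 2]]
    rank = sum(r / n for r, n in links) * 0.85   # (0.1 + 0.1) * 0.85 = 0.17

This is the damped PageRank contribution term with d = 0.85; the (1 - d) teleport term
is omitted, matching the task above, and the 1.0 / n_links fallback only applies to
pages that have accumulated no inbound rank yet.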
/celerycrawler/templates/base.html:
/celerycrawler/templates/index.html:
/celerycrawler/templates/results.html:
--------------------------------------------------------------------------------
[The HTML markup of the three templates was lost when this dump was generated; only a
few text fragments survive. One template reports the index size with
"{{ doc_count }} pages in index.", and the results template renders each hit as
"{{ result.title|safe }} ({{ result.score }}, {{ result.rank }}, {{ result.combined }})"
followed by "{{ result.desc|safe }}".]
--------------------------------------------------------------------------------
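Taken together, a typical run of the project would be: push the view definitions with
"./manage.py update_couchdb", start one or more Celery workers (djcelery's management
command, optionally pinned to the retrieve or process queue), seed the crawl with
"./manage.py start_crawl <url>", run "./manage.py index_update" to keep the Whoosh index
in sync with the crawled pages, and serve the search pages with the standard
"./manage.py runserver". The exact worker invocation depends on the installed
celery/djcelery versions.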