├── README.md
├── celerycrawler
    ├── __init__.py
    ├── couchviews
    │   └── db
    │   │   ├── page
    │   │       ├── by_rank
    │   │       │   └── map.js
    │   │       ├── by_url
    │   │       │   └── map.js
    │   │       └── links_to_url
    │   │       │   └── map.js
    │   │   └── robotstxt
    │   │       └── by_domain
    │   │           └── map.js
    ├── indexer.py
    ├── management
    │   ├── __init__.py
    │   └── commands
    │   │   ├── __init__.py
    │   │   ├── index_update.py
    │   │   ├── start_crawl.py
    │   │   └── update_couchdb.py
    ├── models.py
    ├── settings.py
    ├── tasks.py
    ├── templates
    │   ├── base.html
    │   ├── index.html
    │   └── results.html
    ├── tests.py
    ├── urls.py
    ├── utils.py
    ├── views.py
    └── wsgi.py
└── manage.py


/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewjw/celery-crawler/b3664f26b61707ad54560fbea16fa858c7f68e1e/README.md


--------------------------------------------------------------------------------
/celerycrawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewjw/celery-crawler/b3664f26b61707ad54560fbea16fa858c7f68e1e/celerycrawler/__init__.py


--------------------------------------------------------------------------------
/celerycrawler/couchviews/db/page/by_rank/map.js:
--------------------------------------------------------------------------------
1 | function (doc) {
2 |     if(doc.type == "page") {
3 |         emit(-doc.rank, doc._id);
4 |     }
5 | }
6 | 


--------------------------------------------------------------------------------
/celerycrawler/couchviews/db/page/by_url/map.js:
--------------------------------------------------------------------------------
1 | function (doc) {
2 |     if(doc.type == "page") {
3 |         emit(doc.url, doc._id);
4 |     }
5 | }
6 | 


--------------------------------------------------------------------------------
/celerycrawler/couchviews/db/page/links_to_url/map.js:
--------------------------------------------------------------------------------
1 | function (doc) {
2 |     if(doc.type == "page") {
3 |         for(i = 0; i < doc.links.length; i++) {
4 |             emit(doc.links[i], [doc.rank, doc.links.length]);
5 |         }
6 |     }
7 | }
8 | 


--------------------------------------------------------------------------------
/celerycrawler/couchviews/db/robotstxt/by_domain/map.js:
--------------------------------------------------------------------------------
1 | function (doc) {
2 |     if(doc.type == "robotstxt") {
3 |         emit([doc.protocol, doc.domain], doc._id);
4 |     }
5 | }
6 | 


--------------------------------------------------------------------------------
/celerycrawler/indexer.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from whoosh import index
 4 | from whoosh.fields import *
 5 | 
 6 | schema = Schema(
 7 |     title=TEXT(stored=True),
 8 |     url=ID(stored=True, unique=True),
 9 |     desc=ID(stored=True),
10 |     rank=NUMERIC(stored=True, numtype=float),
11 |     raw=TEXT,
12 |     content=TEXT)
13 | 
14 | _ix = None
15 | 
16 | def get_index():
17 |     global _ix
18 | 
19 |     if _ix is not None:
20 |         pass
21 |     elif not os.path.exists("indexdir"):
22 |         os.mkdir("indexdir")
23 |         _ix = index.create_in("indexdir", schema)
24 |     else:
25 |         _ix = index.open_dir("indexdir")
26 | 
27 |     return _ix
28 | 
29 | def get_writer():
30 |     return get_index().writer()
31 | 
32 | def get_searcher():
33 |     return get_index().searcher()
34 | 
35 | def get_last_change():
36 |     get_index() # create directory
37 | 
38 |     if os.path.exists("indexdir/since.txt"):
39 |         try:
40 |             return int(open("indexdir/since.txt").read())
41 |         except ValueError:
42 |             return 0
43 |     else:
44 |         return 0
45 | 
46 | def set_last_change(since):
47 |     get_index() # create directory
48 | 
49 |     open("indexdir/since.txt", "w").write(str(since))
50 | 


--------------------------------------------------------------------------------
/celerycrawler/management/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewjw/celery-crawler/b3664f26b61707ad54560fbea16fa858c7f68e1e/celerycrawler/management/__init__.py


--------------------------------------------------------------------------------
/celerycrawler/management/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewjw/celery-crawler/b3664f26b61707ad54560fbea16fa858c7f68e1e/celerycrawler/management/commands/__init__.py


--------------------------------------------------------------------------------
/celerycrawler/management/commands/index_update.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import couchdb
 3 | import requests
 4 | import lxml
 5 | 
 6 | from pprint import pprint
 7 | from lxml.html import document_fromstring
 8 | from lxml.html.clean import Cleaner
 9 | from celerycrawler import settings
10 | from celerycrawler.indexer import get_writer, get_last_change, set_last_change
11 | from django.core.management.base import BaseCommand, CommandError
12 | 
13 | 
14 | class Command(BaseCommand):
15 |     def handle(self, **options):
16 |         since = get_last_change()
17 |         writer = get_writer()
18 | 
19 |         last_change = since
20 |         while True:
21 |             doc = {}
22 |             
23 |             changes = settings.db.changes(since=since)
24 |             since = changes["last_seq"]
25 | 
26 |             if since != last_change:
27 |                 print("Detected new tasks ".format(len(changes)))
28 |                 print("=== changes ===")
29 |                 pprint(changes)
30 |             for changeset in changes["results"]:
31 |                 try:
32 |                     doc = settings.db[changeset["id"]]
33 |                 except couchdb.http.ResourceNotFound:
34 |                     print("resource not found")
35 |                     continue
36 | 
37 |             if not ("type" in doc and "page" in doc["type"]):
38 |                 if since != last_change:
39 |                     print("not processing doc: {}".format(str(doc)))
40 |                     last_change = since
41 |                 continue
42 |                     
43 |             print("indexing", doc["url"])
44 | 
45 |             #####
46 |             # raw, html, text
47 |             #####################
48 |             raw = doc['content']
49 |             print("type(RAW) = %s" % type(raw))
50 |             tree = document_fromstring(str(raw))
51 |             title = ' '.join([title for title in tree.xpath('//title/text()')])
52 |             
53 |             # enable filters to remove Javascript and CSS from HTML document
54 |             cleaner = Cleaner()
55 |             cleaner.javascript = True
56 |             cleaner.style = True
57 |             cleaner.html = True
58 |             cleaner.page_structure = False
59 |             cleaner.meta = False
60 |             cleaner.safe_attrs_only = False
61 |             cleaner.links = False
62 | 
63 |             html = cleaner.clean_html(tree)
64 |             text_content = html.text_content()
65 | 
66 |             lxml.html.tostring(html)
67 |             description = ' '.join(tree.xpath("//meta[@name='description']/@content"))
68 | 
69 |             writer.update_document(
70 |                 title=title,
71 |                 url=doc['url'],
72 |                 desc=description,
73 |                 rank=doc['rank'],
74 |                 content='\n'.join([title, doc['url'], text_content]),
75 |                 raw=raw,
76 |             )
77 | 
78 |             writer.commit()
79 |             writer = get_writer()
80 |             set_last_change(since)
81 |             last_change = since
82 | 


--------------------------------------------------------------------------------
/celerycrawler/management/commands/start_crawl.py:
--------------------------------------------------------------------------------
1 | from django.core.management.base import BaseCommand, CommandError
2 | from celerycrawler.tasks import retrieve_page
3 | 
4 | class Command(BaseCommand):
5 |     
6 |     def handle(self, url, **options):
7 |         print("handling: {}".format(url))
8 |         retrieve_page.delay(url, rank=1)
9 | 


--------------------------------------------------------------------------------
/celerycrawler/management/commands/update_couchdb.py:
--------------------------------------------------------------------------------
 1 | import couchdb
 2 | import glob
 3 | import os
 4 | 
 5 | from celerycrawler import settings
 6 | from django.core.management.base import NoArgsCommand
 7 | 
 8 | class Command(NoArgsCommand):
 9 |     help = "Update couchdb views"
10 | 
11 |     can_import_settings = True
12 | 
13 |     def handle_noargs(self, **options):
14 |         couchdir = os.path.realpath(os.path.split(__file__)[0] + "../../../couchviews")
15 | 
16 |         databases = glob.glob(couchdir+"/*")
17 |         for d in databases:
18 |             if not os.path.isdir(d):
19 |                 continue
20 | 
21 |             db = getattr(settings, d.split("/")[-1])
22 | 
23 |             for design in glob.glob(d + "/*"):
24 |                 design = design.split("/")[-1]
25 |                 try:
26 |                     doc = db["_design/" + design]
27 |                 except couchdb.http.ResourceNotFound:
28 |                     doc = {"_id": "_design/" + design}
29 | 
30 |                 doc["views"] = {}
31 |                 for mapreduce in glob.glob(d+"/"+design+"/*"):
32 |                      mapreduce = mapreduce.split("/")[-1]
33 |                      mr = {}
34 |                      mr["map"] = open(d+"/"+design+"/"+mapreduce+"/map.js").read()
35 |                      try:
36 |                          mr["reduce"] = reduce = open(d+"/"+design+"/"+mapreduce+"/reduce.js").read()
37 |                      except IOError:
38 |                          pass
39 | 
40 |                      doc["views"][mapreduce] = mr
41 | 
42 |                 db["_design/" + design] = doc
43 | 


--------------------------------------------------------------------------------
/celerycrawler/models.py:
--------------------------------------------------------------------------------
  1 | import pickle
  2 | import time
  3 | import requests
  4 | 
  5 | from celerycrawler import settings
  6 | from datetime import datetime
  7 | from urllib.parse import urlparse
  8 | from urllib.robotparser import RobotFileParser
  9 | from urllib.request import urlopen, Request, HTTPError
 10 | from urllib.request import install_opener, build_opener, HTTPRedirectHandler
 11 | from couchdb.mapping import Document, TextField, DateTimeField, ListField, FloatField
 12 | from django.core.cache import cache
 13 | 
 14 | install_opener(build_opener(HTTPRedirectHandler()))
 15 | 
 16 | class Page(Document):
 17 |     type = TextField(default="page")
 18 |     url = TextField()
 19 |     raw = TextField()
 20 |     content = TextField()
 21 |     links = ListField(TextField())
 22 |     rank = FloatField(default=0)
 23 |     last_checked = DateTimeField(default=datetime.now)
 24 | 
 25 |     def is_valid(self):
 26 |         return (datetime.now() - self.last_checked).days < 7
 27 | 
 28 |     def update(self):
 29 |         print("updating page")
 30 |         
 31 |         parse = urlparse(self.url)
 32 |         robotstxt = RobotsTxt.get_by_domain(parse.scheme, parse.netloc)
 33 |         #if not robotstxt.is_allowed(self.url):
 34 |         #    return False
 35 | 
 36 |         while cache.get(parse.netloc) is not None:
 37 |             time.sleep(1)
 38 | 
 39 |         cache.set(parse.netloc, True, 10)
 40 | 
 41 |         print("getting: {}".format(self.url))
 42 |         resp = requests.get(self.url, headers={'User-Agent':
 43 |                                                settings.USER_AGENT})
 44 | 
 45 |         ctype = resp.headers['content-type']
 46 |         if not ctype.startswith("text/html"):
 47 |             print("unsupported content-type: {}".format(ctype))
 48 |             return
 49 | 
 50 |         print("setting Page.content...")
 51 |         self.content = resp.text
 52 |         self.raw = resp.text
 53 | 
 54 |         self.last_checked = datetime.now()
 55 |         self.store(settings.db)
 56 | 
 57 |     @staticmethod
 58 |     def count():
 59 |         r = settings.db.view("page/by_url", limit=0)
 60 |         return r.total_rows
 61 | 
 62 |     @staticmethod
 63 |     def get_top_by_rank(limit=10):
 64 |         r = settings.db.view("page/by_rank", limit=limit)
 65 |         docs = []
 66 |         for row in r.rows:
 67 |             docs.append(Page.load(settings.db, row.value))
 68 |         return docs
 69 | 
 70 |     @staticmethod
 71 |     def get_by_url(url, update=True):
 72 |         r = settings.db.view("page/by_url", key=url)
 73 |         if len(r.rows) == 1:
 74 |             doc = Page.load(settings.db, r.rows[0].value)
 75 |             if doc.is_valid():
 76 |                 return doc
 77 |         elif not update:
 78 |             return None
 79 |         else:
 80 |             doc = Page(url=url)
 81 |         print("Page.get_by_url: doc.update() ...")
 82 |         doc.update()
 83 | 
 84 |         return doc
 85 | 
 86 |     @staticmethod
 87 |     def get_id_by_url(url, update=True):
 88 |         r = settings.db.view("page/by_url", key=url)
 89 |         if len(r) == 1:
 90 |             return r.rows[0].value
 91 |         else:
 92 |             doc = Page.get_by_url(url, update=update)
 93 |             if doc is not None:
 94 |                 return doc.id
 95 |             else:
 96 |                 return None
 97 | 
 98 |     @staticmethod
 99 |     def get_links_to_url(url):
100 |         return [row.value for row in settings.db.view("page/links_to_url", key=url).rows]
101 | 
102 | class RobotsTxt(Document):
103 |     type = TextField(default="robotstxt")
104 | 
105 |     domain = TextField()
106 |     protocol = TextField()
107 | 
108 |     robot_parser_pickle = TextField()
109 | 
110 |     def _get_robot_parser(self):
111 |         parser = RobotFileParser()
112 |         parser.set_url(self.protocol + "://" + self.domain + "/robots.txt")
113 | 
114 |         return parser
115 | 
116 |     def is_valid(self):
117 |         parser = self._get_robot_parser()
118 |         return (time.time() - parser.mtime()) < 7*24*60*60
119 | 
120 |     def is_allowed(self, url):
121 |         parser = self._get_robot_parser()
122 |         return parser.can_fetch(settings.USER_AGENT, url)
123 | 
124 |     def update(self):
125 |         while cache.get(self.domain) is not None:
126 |             time.sleep(1)
127 |         cache.set(self.domain, True, 10)
128 | 
129 |         print("getting %s://%s/robots.txt" % (self.protocol, self.domain))
130 |         parser = self._get_robot_parser()
131 |         parser.read()
132 |         parser.modified()
133 | 
134 |         self.store(settings.db)
135 | 
136 |     @staticmethod
137 |     def get_by_domain(protocol, domain):
138 |         r = settings.db.view("robotstxt/by_domain", key=[protocol, domain])
139 |         if len(r) > 0:
140 |             doc = RobotsTxt.load(settings.db, r.rows[0].value)
141 |             if doc.is_valid():
142 |                 return doc
143 |         else:
144 |             doc = RobotsTxt(protocol=protocol, domain=domain)
145 | 
146 |         doc.update()
147 |         doc.store(settings.db)
148 | 
149 |         return doc
150 | 


--------------------------------------------------------------------------------
/celerycrawler/settings.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Django settings for celerycrawler project.
  3 | 
  4 | For more information on this file, see
  5 | https://docs.djangoproject.com/en/1.7/topics/settings/
  6 | 
  7 | For the full list of settings and their values, see
  8 | https://docs.djangoproject.com/en/1.7/ref/settings/
  9 | """
 10 | 
 11 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
 12 | import os
 13 | BASE_DIR = os.path.dirname(os.path.dirname(__file__))
 14 | 
 15 | 
 16 | # Quick-start development settings - unsuitable for production
 17 | # See https://docs.djangoproject.com/en/1.7/howto/deployment/checklist/
 18 | 
 19 | # SECURITY WARNING: keep the secret key used in production secret!
 20 | SECRET_KEY = 'ua$=!zpmt&@_=paocb_d_hxg78kdmxu3%f$e&15_eecg%k-2uv'
 21 | 
 22 | # SECURITY WARNING: don't run with debug turned on in production!
 23 | DEBUG = True
 24 | 
 25 | TEMPLATE_DEBUG = True
 26 | 
 27 | ALLOWED_HOSTS = []
 28 | 
 29 | 
 30 | # Application definition
 31 | 
 32 | INSTALLED_APPS = (
 33 |     'django.contrib.admin',
 34 |     'django.contrib.auth',
 35 |     'django.contrib.contenttypes',
 36 |     'django.contrib.sessions',
 37 |     'django.contrib.messages',
 38 |     'django.contrib.staticfiles',
 39 |     'djcelery',
 40 |     'celerycrawler'
 41 | )
 42 | 
 43 | MIDDLEWARE_CLASSES = (
 44 |     'django.contrib.sessions.middleware.SessionMiddleware',
 45 |     'django.middleware.common.CommonMiddleware',
 46 |     'django.middleware.csrf.CsrfViewMiddleware',
 47 |     'django.contrib.auth.middleware.AuthenticationMiddleware',
 48 |     'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
 49 |     'django.contrib.messages.middleware.MessageMiddleware',
 50 |     'django.middleware.clickjacking.XFrameOptionsMiddleware',
 51 | )
 52 | 
 53 | ROOT_URLCONF = 'celerycrawler.urls'
 54 | 
 55 | WSGI_APPLICATION = 'celerycrawler.wsgi.application'
 56 | 
 57 | 
 58 | # Database
 59 | # https://docs.djangoproject.com/en/1.7/ref/settings/#databases
 60 | 
 61 | DATABASES = {
 62 |     'default': {
 63 |         'ENGINE': 'django.db.backends.sqlite3',
 64 |         'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
 65 |     }
 66 | }
 67 | 
 68 | # Internationalization
 69 | # https://docs.djangoproject.com/en/1.7/topics/i18n/
 70 | 
 71 | LANGUAGE_CODE = 'en-us'
 72 | 
 73 | TIME_ZONE = 'UTC'
 74 | 
 75 | USE_I18N = True
 76 | 
 77 | USE_L10N = True
 78 | 
 79 | USE_TZ = True
 80 | 
 81 | 
 82 | # Static files (CSS, JavaScript, Images)
 83 | # https://docs.djangoproject.com/en/1.7/howto/static-files/
 84 | 
 85 | STATIC_URL = '/static/'
 86 | 
 87 | ########################
 88 | # celery configuration #
 89 | ########################
 90 | 
 91 | import djcelery
 92 | djcelery.setup_loader()
 93 | BROKER_BACKEND = "couchdb"
 94 | BROKER_HOST = "localhost"
 95 | BROKER_PORT = 5984
 96 | BROKER_VHOST = "celerycrawler"
 97 | 
 98 | #BROKER_URL = 'amqp://guest:guest@localhost:5672'
 99 | 
100 | CELERYD_CONCURRENCY = 5
101 | CELERY_ACCEPT_CONTENT = ['pickle', 'json', 'msgpack', 'yaml']
102 | CELERY_QUEUES = {"retrieve": {"exchange": "default",
103 |                               "exchange_type": "direct",
104 |                               "routing_key": "retrieve"},
105 |                  "process": {"exchange": "default",
106 |                              "exchange_type": "direct",
107 |                              "routing_key": "process "},
108 |                  "celery": {"exchange": "default",
109 |                             "exchange_type": "direct",
110 |                             "routing_key": "celery"}}
111 | 
112 | 
113 | class Router(object):
114 |     def route_for_task(self, task, args=None, kwargs=None):
115 |         if task == "celerycrawler.tasks.retrieve_page":
116 |             return { "queue": "retrieve" }
117 |         else:
118 |             return { "queue": "process" }
119 | 
120 | CELERY_ROUTES = (Router(), )
121 | 
122 | import couchdb
123 | 
124 | server = couchdb.Server()
125 | try:
126 |     db = server.create("celerycrawler")
127 | except:
128 |     db = server["celerycrawler"]
129 | 
130 | USER_AGENT = 'ua'
131 | 


--------------------------------------------------------------------------------
/celerycrawler/tasks.py:
--------------------------------------------------------------------------------
 1 | from lxml.html import document_fromstring
 2 | from urllib.parse import urlparse, urljoin
 3 | from celerycrawler import settings
 4 | from celery.decorators import task
 5 | from celerycrawler.models import Page, RobotsTxt
 6 | from celerycrawler.utils import unescape
 7 | 
 8 | 
 9 | @task
10 | def retrieve_page(url, rank=None):
11 |     print("retrieve_page {}".format(url))
12 |     page = Page.get_by_url(url, update=True)
13 |     if page is None:
14 |         print("Page is None")
15 |         return
16 | 
17 |     if rank is not None:
18 |         page.rank = rank
19 |         page.store(settings.db)
20 | 
21 |     if page.id is None:
22 |         page.update()
23 | 
24 |     find_links.delay(page.id)
25 | 
26 | @task
27 | def find_links(doc_id):
28 |     print("in find_links")
29 |     if doc_id is None:
30 |         print("doc_id = None")
31 |         return False
32 | 
33 |     doc = Page.load(settings.db, doc_id)
34 | 
35 |     if not hasattr(doc, 'content'):
36 |         print("Got None for the content of %s -> %s." % (doc_id, doc.url))
37 |         return False
38 |     elif not doc['content']:
39 |         print("tasks.py:elif not doc.content")
40 |         return False
41 | 
42 |     raw_links = []
43 |     tree = document_fromstring(doc.content)
44 |     for a in tree.xpath('//a'):
45 |         link = urljoin(doc['url'], a.get('href'))
46 |         doc.links.append(link)
47 |     
48 |     doc.store(settings.db)
49 | 
50 |     calculate_rank.delay(doc.id)
51 | 
52 |     for link in doc.links:
53 |         p = Page.get_id_by_url(link, update=False)
54 |         if p is not None:
55 |             calculate_rank.delay(p)
56 |         else:
57 |             retrieve_page.delay(link)
58 | 
59 |     print("find_links {} -> {}".format(doc.url, len(doc.links)))
60 | 
61 | @task
62 | def calculate_rank(doc_id):
63 |     print("in calculate_rank")
64 |     page = Page.load(settings.db, doc_id)
65 | 
66 |     links = Page.get_links_to_url(page.url)
67 | 
68 |     rank = 0
69 |     for link in links:
70 |         rank += link[0] / link[1]
71 | 
72 |     old_rank = page.rank
73 |     page.rank = rank * 0.85
74 | 
75 |     if page.rank == 0:
76 |         n_links = settings.db.view("page/by_url", limit=0).total_rows
77 |         page.rank = 1.0 / n_links
78 | 
79 |     if abs(old_rank - page.rank) > 0.0001:
80 |         print("%s: %s -> %s" % (page.url, old_rank, page.rank))
81 |         page.store(settings.db)
82 |         
83 |         for link in page.links:
84 |             p = Page.get_id_by_url(link, update=False)
85 |             if p is not None:
86 |                 calculate_rank.delay(p)
87 | 


--------------------------------------------------------------------------------
/celerycrawler/templates/base.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |     <title>Celery Search</title>
 5 | </head>
 6 | <body>
 7 |     <h1>Celery Search</h1>
 8 | 
 9 |     {% block body %}
10 |     {% endblock %}
11 | </body>
12 | </html>
13 | 


--------------------------------------------------------------------------------
/celerycrawler/templates/index.html:
--------------------------------------------------------------------------------
 1 | {% extends "base.html" %}
 2 | 
 3 | {% block body %}
 4 |     <form action="/search" method="get">
 5 |         <input name="q" type="text">
 6 |         <input type="submit">
 7 |     </form>
 8 | 
 9 |     <hr>
10 | 
11 |     <p>{{ doc_count }} pages in index.</p>
12 | 
13 |     <hr>
14 | 
15 |     <h2>Top Pages</h2>
16 | 
17 |     <ol>
18 |     {% for page in top_docs %}
19 |         <li><a href="{{ page.url }}">{{ page.url }}</a> - {{ page.rank }}</li>
20 |     {% endfor %}
21 |     </ol>
22 | {% endblock %}
23 | 


--------------------------------------------------------------------------------
/celerycrawler/templates/results.html:
--------------------------------------------------------------------------------
 1 | {% extends "base.html" %}
 2 | 
 3 | {% block body %}
 4 |     <form action="/search" method="get">
 5 |         <input name="q" type="text" value="{{ q }}">
 6 |         <input type="submit">
 7 |     </form>
 8 | 
 9 |     {% for result in results|slice:":20" %}
10 |         <p>
11 |             <b><a href="{{ result.url }}">{{ result.title|safe }}</a></b> ({{ result.score }}, {{ result.rank }}, {{ result.combined }})<br>
12 |             {{ result.desc|safe }}
13 |         </p>
14 |     {% endfor %}
15 | {% endblock %}
16 | 


--------------------------------------------------------------------------------
/celerycrawler/tests.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This file demonstrates two different styles of tests (one doctest and one
 3 | unittest). These will both pass when you run "manage.py test".
 4 | 
 5 | Replace these with more appropriate tests for your application.
 6 | """
 7 | 
 8 | from django.test import TestCase
 9 | 
10 | class SimpleTest(TestCase):
11 |     def test_basic_addition(self):
12 |         """
13 |         Tests that 1 + 1 always equals 2.
14 |         """
15 |         self.failUnlessEqual(1 + 1, 2)
16 | 
17 | __test__ = {"doctest": """
18 | Another way to test that 1 + 1 is equal to 2.
19 | 
20 | >>> 1 + 1 == 2
21 | True
22 | """}
23 | 
24 | 


--------------------------------------------------------------------------------
/celerycrawler/urls.py:
--------------------------------------------------------------------------------
 1 | from django.conf.urls.defaults import *
 2 | 
 3 | # Uncomment the next two lines to enable the admin:
 4 | # from django.contrib import admin
 5 | # admin.autodiscover()
 6 | 
 7 | urlpatterns = patterns('',
 8 |     (r'^$', 'crawler.views.index'),
 9 |     (r'^search$', 'crawler.views.search'),
10 | 
11 |     # Uncomment the admin/doc line below to enable admin documentation:
12 |     # (r'^admin/doc/', include('django.contrib.admindocs.urls')),
13 | 
14 |     # Uncomment the next line to enable the admin:
15 |     # (r'^admin/', include(admin.site.urls)),
16 | )
17 | 


--------------------------------------------------------------------------------
/celerycrawler/utils.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | def unescape(text):
 4 |    """Removes HTML or XML character references 
 5 |       and entities from a text string.
 6 |       keep &amp;, &gt;, &lt; in the source code.
 7 |    from Fredrik Lundh
 8 |    http://effbot.org/zone/re-sub.htm#unescape-html
 9 |    """
10 |    def fixup(m):
11 |       text = m.group(0)
12 |       if text[:2] == "&#":
13 |          # character reference
14 |          try:
15 |             if text[:3] == "&#x":
16 |                return unichr(int(text[3:-1], 16))
17 |             else:
18 |                return unichr(int(text[2:-1]))
19 |          except ValueError as e:
20 |             print("ValueError: {}".format(e))
21 |             pass
22 |       else:
23 |          # named entity
24 |          try:
25 |             if text[1:-1] == "amp":
26 |                text = "&amp;amp;"
27 |             elif text[1:-1] == "gt":
28 |                text = "&amp;gt;"
29 |             elif text[1:-1] == "lt":
30 |                text = "&amp;lt;"
31 |             else:
32 |                print(text[1:-1])
33 |                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
34 |          except KeyError:
35 |             print("keyerror")
36 |             pass
37 |       return text # leave as is
38 |    return re.sub("&#?\w+;", fixup, text)
39 | 


--------------------------------------------------------------------------------
/celerycrawler/views.py:
--------------------------------------------------------------------------------
 1 | from django.shortcuts import render_to_response
 2 | from whoosh.qparser import QueryParser
 3 | 
 4 | from celerycrawler.indexer import get_searcher, schema
 5 | from celerycrawler.models import Page
 6 | 
 7 | def index(req):
 8 |     return render_to_response("index.html", { "doc_count": Page.count(), "top_docs": Page.get_top_by_rank(limit=20) })
 9 | 
10 | def search(req):
11 |     searcher = get_searcher()
12 | 
13 |     q = QueryParser("content", schema=schema).parse(req.GET["q"])
14 | 
15 |     results = searcher.search(q, limit=100)
16 | 
17 |     if len(results) > 0:
18 |         max_score = max([r.score for r in results])
19 |         max_rank = max([r.fields()["rank"] for r in results])
20 | 
21 |         combined = []
22 |         for r in results:
23 |             fields = r.fields()
24 |             r.score = r.score/max_score
25 |             r.rank = fields["rank"]/max_rank
26 |             r.combined = r.score + r.rank
27 |             combined.append(r)
28 | 
29 |         combined.sort(key=lambda x: x.combined, reverse=True)
30 |     else:
31 |         combined = []
32 | 
33 |     return render_to_response("results.html", { "q": req.GET["q"], "results": combined })
34 | 


--------------------------------------------------------------------------------
/celerycrawler/wsgi.py:
--------------------------------------------------------------------------------
 1 | """
 2 | WSGI config for celerycrawler project.
 3 | 
 4 | It exposes the WSGI callable as a module-level variable named ``application``.
 5 | 
 6 | For more information on this file, see
 7 | https://docs.djangoproject.com/en/1.7/howto/deployment/wsgi/
 8 | """
 9 | 
10 | import os
11 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "celerycrawler.settings")
12 | 
13 | from django.core.wsgi import get_wsgi_application
14 | application = get_wsgi_application()
15 | 


--------------------------------------------------------------------------------
/manage.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import os
 3 | import sys
 4 | 
 5 | if __name__ == "__main__":
 6 |     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "celerycrawler.settings")
 7 | 
 8 |     from django.core.management import execute_from_command_line
 9 | 
10 |     execute_from_command_line(sys.argv)
11 | 


--------------------------------------------------------------------------------