├── run_prod.sh
├── workaround.py
├── templates
│   ├── index.html
│   ├── macros.html
│   ├── base.html
│   └── search_results.html
├── requirements.txt
├── bootstrap_db.sh
├── crawler_utils.py
├── .gitignore
├── LICENSE
├── crawler_api.py
├── lang_proc.py
├── download_whole_subreddit.py
├── README.md
├── web_ui.py
└── indexer.py
/run_prod.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # TODO better way of finding gunicorn binary
3 | sudo ./venv/bin/gunicorn web_ui:app -b 0.0.0.0:80 --access-logfile -
4 |
--------------------------------------------------------------------------------
/workaround.py:
--------------------------------------------------------------------------------
1 | class Document(object):
2 | def __init__(self, parsed_text, score, title):
3 | self.parsed_text = parsed_text
4 | self.score = score
5 | self.title = title
6 |
7 |
8 | class InvertedIndexHit(object):
9 | def __init__(self, docid, position, score):
10 | self.docid = docid
11 | self.position = position
12 | self.score = score
13 |
--------------------------------------------------------------------------------
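The `Document` and `InvertedIndexHit` containers live in their own module, presumably so that the objects `indexer.py` pickles into its shelve files resolve to the same importable names when `web_ui.py` loads them (hence the `import workaround  # NOQA` there). A minimal sketch, not repo code, of how they are constructed and round-tripped:

```python
# Sketch only: the containers that indexer.py stores in its shelve files.
import pickle

from workaround import Document, InvertedIndexHit

doc = Document(parsed_text=[], score=42, title="How do I start learning Python?")
hit = InvertedIndexHit(docid=1, position=0, score=doc.score)

# Instances survive pickling because the classes live in an importable module.
restored = pickle.loads(pickle.dumps(doc))
print restored.title, restored.score
```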
/templates/index.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 | {% import "bootstrap/wtf.html" as wtf %}
4 |
5 | {% block title %} This is a search engine for /r/learnprogramming {% endblock %}
6 |
7 | {% block content %}
8 |
9 | <h1>Search /r/learnprogramming</h1>
10 |
11 | {{wtf.quick_form(form)}}
12 |
13 | {% endblock %}
14 |
15 |
16 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask==0.10.1
2 | Flask-Bootstrap==3.3.0.1
3 | Flask-WTF==0.10.3
4 | Jinja2==2.7.3
5 | MarkupSafe==0.23
6 | WTForms==2.0.1
7 | Werkzeug==0.9.6
8 | argparse==1.2.1
9 | gunicorn==19.1.1
10 | itsdangerous==0.24
11 | nltk==3.0.0
12 | pip-autoremove==0.8.0
13 | git+https://github.com/praw-dev/praw.git@af8676fea6a1f51f357b719a73017ea980500aa7
14 | progressbar==2.2
15 | requests==2.5.0
16 | six==1.9.0
17 | update-checker==0.11
18 | wsgiref==0.1.2
19 |
--------------------------------------------------------------------------------
/templates/macros.html:
--------------------------------------------------------------------------------
1 | {% macro render_pagination(pagination) %}
2 | {% if pagination.pages > 1 %}
3 |
16 | {% endif %}
17 | {% endmacro %}
18 |
--------------------------------------------------------------------------------
/templates/base.html:
--------------------------------------------------------------------------------
1 | {% extends "bootstrap/base.html" %}
2 |
3 | {% block navbar %}
4 |
5 | {%- endblock navbar %}
6 |
7 |
--------------------------------------------------------------------------------
/bootstrap_db.sh:
--------------------------------------------------------------------------------
1 | echo "Script for bootstrapping the search engine databases"
2 | echo ""
3 | echo "This script will create the necessary folders, gets the posts from reddit, indexes them and runs the web server."
4 | echo "It will get posts for 60 seconds."
5 |
6 | rm -rf docs_dir/
7 | rm -rf index_dir/
8 | mkdir docs_dir
9 | mkdir index_dir
10 |
11 | timeout 60s python download_whole_subreddit.py --storage_dir docs_dir/ --timestamp_interval 9000 --subreddit learnprogramming
12 | python indexer.py --stored_documents_dir docs_dir/ --index_dir index_dir/
13 |
14 | INDEXES_DIR=index_dir/ python web_ui.py
15 |
--------------------------------------------------------------------------------
/crawler_utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os.path
3 |
4 |
5 | def comments_to_json(comments):
6 | result = []
7 | for comment in comments:
8 | result.append({"score": comment.score,
9 | "url": comment.permalink,
10 | "body": comment.body,
11 | "id": comment.id,
12 | "replies": comments_to_json(comment.replies)})
13 |
14 | return result
15 |
16 |
17 | def save_submission(submission, storage_dir):
18 | with open(os.path.join(storage_dir, submission.id), "w") as f:
19 | f.write(json.dumps({"url": submission.permalink,
20 | "text": submission.selftext,
21 | "title": submission.title,
22 | "score": submission.score,
23 | "comments": comments_to_json(submission.comments)}))
24 |
25 |
--------------------------------------------------------------------------------
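Each submission ends up as one JSON file in the storage directory, named after its reddit id. A minimal sketch, not repo code, of reading a few stored submissions back using the field names written by `save_submission`:

```python
# Sketch only: inspect a few documents written by save_submission.
import json
import os
import sys

storage_dir = sys.argv[1]  # e.g. docs_dir/
for name in os.listdir(storage_dir)[:3]:
    with open(os.path.join(storage_dir, name)) as f:
        doc = json.load(f)
    # Keys match save_submission: url, text, title, score, comments
    print doc["title"], doc["score"], len(doc["comments"])
```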
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 |
25 | # PyInstaller
26 | # Usually these files are written by a python script from a template
27 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
28 | *.manifest
29 | *.spec
30 |
31 | # Installer logs
32 | pip-log.txt
33 | pip-delete-this-directory.txt
34 |
35 | # Unit test / coverage reports
36 | htmlcov/
37 | .tox/
38 | .coverage
39 | .cache
40 | nosetests.xml
41 | coverage.xml
42 |
43 | # Translations
44 | *.mo
45 | *.pot
46 |
47 | # Django stuff:
48 | *.log
49 |
50 | # Sphinx documentation
51 | docs/_build/
52 |
53 | # PyBuilder
54 | target/
55 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2014 Alexander Putilin
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/templates/search_results.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 | {% import "macros.html" as macros %}
4 |
5 | {% block title %} This is a search engine{% endblock %}
6 |
7 | {% block content %}
8 |
9 |
10 | <h1>Search results for [{{query | safe}}]</h1>
11 |
12 |
13 | <p>{{total_doc_num}} results ({{processing_time.total_seconds()}} seconds)</p>
14 |
15 | <ul>
16 | {% for doc in docs %}
17 | <li>
18 |
19 | <a href="{{doc[0]}}">{{doc[2]}}</a>
20 |
21 |
22 | ...
23 | {% for word, bold in doc[1] %}
24 | {% if bold %}<b>{% endif %}
25 | {{ word }}
26 | {% if bold %}</b>{% endif %}
27 | {% endfor %}
28 | ...
29 |
30 | </li>
31 | {% else %}
32 |
33 | No results found
34 |
35 | {% endfor %}
36 | </ul>
37 |
38 |
39 | {{ macros.render_pagination(pagination) }}
40 |
41 |
42 |
43 | {% endblock %}
44 |
45 |
--------------------------------------------------------------------------------
/crawler_api.py:
--------------------------------------------------------------------------------
1 | import praw
2 | import logging
3 | import argparse
4 | from distutils.dir_util import mkpath
5 | from praw.helpers import submission_stream
6 | from crawler_utils import save_submission
7 |
8 |
9 | def get_as_much_stuff_as_possible(storage_dir):
10 | mkpath(storage_dir, mode=0755)
11 | r = praw.Reddit(user_agent='SearchingReddit project 0.2 by /u/godlikesme')
12 | for method_name in ["get_hot", "get_new", "get_top_from_all", "get_top_from_week",
13 | "get_top_from_month", "get_top_from_year", "get_top_from_day",
14 | "get_top_from_hour"]:
15 | method = getattr(r.get_subreddit('learnprogramming'), method_name)
16 | submissions = method(limit=1000)
17 | for s in submissions:
18 | save_submission(s, storage_dir)
19 |
20 |
21 | def crawl_continuously(storage_dir):
22 | r = praw.Reddit(user_agent='SearchingReddit project 0.2 by /u/godlikesme')
23 | for s in submission_stream(r, "learnprogramming"):
24 | s.replace_more_comments(limit=None)
25 | save_submission(s, storage_dir)
26 |
27 |
28 | def main():
29 | logging.getLogger().setLevel(logging.DEBUG)
30 |
31 | parser = argparse.ArgumentParser(description='Crawl /r/learnprogramming using api')
32 | parser.add_argument("--storage_dir", dest="storage_dir", required=True)
33 | args = parser.parse_args()
34 |
35 | get_as_much_stuff_as_possible(args.storage_dir)
36 | crawl_continuously(args.storage_dir)
37 |
38 |
39 | if __name__ == "__main__":
40 | main()
41 |
--------------------------------------------------------------------------------
/lang_proc.py:
--------------------------------------------------------------------------------
1 | from nltk.stem.porter import PorterStemmer
2 | from nltk.tokenize import sent_tokenize, TreebankWordTokenizer
3 | from nltk.corpus import stopwords
4 | _stop_words = stopwords.words('english')
5 | import itertools
6 | import string
7 |
8 |
9 | class Term(object):
10 | def __init__(self, full_word):
11 | self.full_word = full_word
12 | # TODO: Lemmatization requires downloads
13 | # wnl = WordNetLemmatizer()
14 | # lemmas = [wnl.lemmatize(token) for token in tokens]
15 | self.stem = PorterStemmer().stem(full_word).lower()
16 |
17 | def __eq__(self, other):
18 | return self.stem == other.stem
19 |
20 | def __hash__(self):
21 | return hash(self.stem)
22 |
23 | def __repr__(self):
24 | return "Term {}({})".format(self.stem.encode('utf8'), self.full_word.encode('utf8'))
25 |
26 | def __str__(self):
27 | return repr(self)
28 |
29 | def is_punctuation(self):
30 | return self.stem in string.punctuation
31 |
32 | def is_stop_word(self):
33 | return self.full_word in _stop_words
34 |
35 |
36 | def stem_and_tokenize_text(text):
37 | sents = sent_tokenize(text)
38 | tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(sent) for sent in sents]))
39 | terms = [Term(token) for token in tokens]
40 | return filter(lambda term: not term.is_punctuation(), terms)
41 |
42 |
43 | def to_query_terms(query_raw):
44 | # In case query and doc require different processing in the future
45 | return stem_and_tokenize_text(query_raw)
46 |
47 |
48 | def to_doc_terms(doc_raw):
49 | # In case query and doc require different processing in the future
50 | return stem_and_tokenize_text(doc_raw)
51 |
--------------------------------------------------------------------------------
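A quick interactive check of the pipeline, assuming the NLTK `punkt` and `stopwords` data from the README have been downloaded: punctuation is dropped, stop words are kept but flagged, and stem-based equality lets queries match inflected forms.

```python
# Sketch only: what to_doc_terms / to_query_terms produce.
from lang_proc import to_doc_terms, to_query_terms

terms = to_doc_terms("Learning Python by writing small programs!")
print [t.stem for t in terms]                               # lower-cased stems, "!" dropped
print [t.full_word for t in terms if not t.is_stop_word()]  # "by" filtered out here

# Term.__eq__ compares stems, so "program" matches "programs".
print to_query_terms("program")[0] in terms
```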
/download_whole_subreddit.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import argparse
3 | from distutils.dir_util import mkpath
4 | import time
5 | import praw
6 | from crawler_utils import save_submission
7 |
8 |
9 | # Downloads all the self posts from the given subreddit by walking backwards through timestamp windows
10 | def download_the_whole_subreddit(storage_dir, subreddit_name, ts_interval, largest_timestamp):
11 | mkpath(storage_dir, mode=0755)
12 | r = praw.Reddit(user_agent='SearchingReddit project 0.2 by /u/godlikesme')
13 | if largest_timestamp is None:
14 | largest_timestamp = int(time.time()) + 12*3600
15 | cts2 = largest_timestamp
16 | cts1 = largest_timestamp - ts_interval
17 | current_ts_interval = ts_interval
18 | while True:
19 | try:
20 | search_results = list(r.search('timestamp:{}..{}'.format(cts1, cts2), subreddit=subreddit_name, syntax='cloudsearch'))
21 | except Exception as e:
22 | logging.exception(e)
23 | continue
24 |
25 | logging.debug("Got {} submissions in interval {}..{}".format(len(search_results), cts1, cts2))
26 | if len(search_results) == 25:
27 | current_ts_interval /= 2
28 | cts1 = cts2 - current_ts_interval
29 | logging.debug("Reducing ts interval to {}".format(current_ts_interval))
30 | continue
31 |
32 | for submission in search_results:
33 | submission.replace_more_comments(limit=None)
34 | save_submission(submission, storage_dir)
35 |
36 | cts2 = cts1
37 | cts1 = cts2 - current_ts_interval
38 |
39 | if cts1 < 0:
40 | break
41 |
42 | if len(search_results) <= 7:
43 | current_ts_interval *= 2
44 | logging.debug("Increasing ts interval to {}".format(current_ts_interval))
45 |
46 |
47 | def main():
48 | logging.getLogger().setLevel(logging.DEBUG)
49 |
50 | parser = argparse.ArgumentParser(description='Download the whole subreddit')
51 | parser.add_argument("--storage_dir", dest="storage_dir", required=True)
52 | parser.add_argument("--subreddit", dest="subreddit", required=True, help="Download the whole subreddit")
53 | parser.add_argument("--timestamp_interval", dest="timestamp_interval", type=int, required=True)
54 | parser.add_argument("--largest_timestamp", dest="largest_timestamp", type=int, required=False, default=None)
55 | args = parser.parse_args()
56 |
57 | download_the_whole_subreddit(args.storage_dir, args.subreddit, args.timestamp_interval, args.largest_timestamp)
58 |
59 | if __name__ == "__main__":
60 | main()
61 |
--------------------------------------------------------------------------------
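The crawler walks backwards through time using reddit's `timestamp:a..b` cloudsearch syntax, which the code assumes returns at most 25 submissions per query. The window adaptation above, in isolation (a sketch, not repo code):

```python
# Sketch only: how download_the_whole_subreddit resizes its time window.
def next_interval(current_interval, num_results, page_size=25, low_watermark=7):
    """A full page probably means results were truncated, so halve the window
    and retry the same end timestamp; a near-empty page lets the window grow."""
    if num_results == page_size:
        return current_interval / 2
    if num_results <= low_watermark:
        return current_interval * 2
    return current_interval
```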
/README.md:
--------------------------------------------------------------------------------
1 | SearchingReddit
2 | ===============
3 |
4 | # Goal
5 |
6 | This is an example of a small search engine that lets people search the /r/learnprogramming subreddit.
7 |
8 | The goal of the project is to build a small but functional search engine and live-stream its development on YouTube: http://www.youtube.com/c/AlexanderPutilin
9 |
10 |
11 | # Setting up
12 |
13 | 1) NLTK files:
14 |
15 | ```
16 | >>> import nltk
17 | >>> nltk.download('punkt')
18 | >>> nltk.download('stopwords')
19 | ```
20 |
21 | 2) Database:
22 | To run the search engine, you need to download posts from reddit and run the indexer to create the final database. There are two ways to do this.
23 |
24 | a. Using the bootstrap script:
25 | You can create a functional development environment by running a single command. This results in a database with very few results, but it is easy to set up and enough to start developing.
26 |
27 | `$ ./bootstrap_db.sh`
28 |
29 | b. Doing it manually:
30 | If you want to run your own search engine, or need more results for development, you can fetch the reddit data manually. This lets you download as many results as you need.
31 |
32 | First, create two directories: one for the raw reddit documents and one for the indexed database. Let's call them `docs_dir` and `index_dir`.
33 |
34 | To get reddit data into the documents directory, run `download_whole_subreddit.py` like this:
35 |
36 | `python download_whole_subreddit.py --storage_dir docs_dir/ --timestamp_interval 9000 --subreddit learnprogramming`
37 |
38 | This command will download reddit posts from the /r/learnprogramming subreddit to the `docs_dir` directory. It will keep running and downloading posts until you stop it.
39 |
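If you only need a small sample for development, you can bound the crawl the same way `bootstrap_db.sh` does, e.g. `timeout 60s python download_whole_subreddit.py --storage_dir docs_dir/ --timestamp_interval 9000 --subreddit learnprogramming`.
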
40 | After downloading the data, you need to index it so the search engine can use it. To do this, run `indexer.py` like this:
41 |
42 | `python indexer.py --stored_documents_dir docs_dir/ --index_dir index_dir/`
43 |
44 | This command indexes the posts from the documents directory and writes the resulting index into the `index_dir` folder.
45 |
46 | After indexing, you can start the web server by running `web_ui.py`. You also need to supply the index directory via the `INDEXES_DIR` environment variable.
47 |
48 | `INDEXES_DIR=index_dir/ python web_ui.py`
49 |
50 | This starts the search engine using the database in the `index_dir` directory.
51 |
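For a production-style deployment there is also `run_prod.sh`, which serves the same app with gunicorn on port 80 and expects a virtualenv in `./venv`:

`sudo ./venv/bin/gunicorn web_ui:app -b 0.0.0.0:80 --access-logfile -`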
--------------------------------------------------------------------------------
/web_ui.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, render_template, redirect, url_for, request, abort
2 | from flask_bootstrap import Bootstrap
3 | from flask_wtf import Form
4 | from wtforms import StringField, SubmitField
5 | from wtforms.validators import DataRequired
6 | from indexer import Searcher, ShelveIndexes
7 | from lang_proc import to_query_terms
8 | import logging
9 | import cgi
10 | from datetime import datetime
11 | import os
12 | import workaround # NOQA
13 |
14 | searcher = Searcher(os.environ["INDEXES_DIR"], ShelveIndexes)
15 | app = Flask(__name__)
16 | app.logger.setLevel(logging.DEBUG)
17 | Bootstrap(app)
18 |
19 |
20 | def url_for_other_page(page):
21 | args = request.view_args.copy()
22 | args['page'] = page
23 | return url_for(request.endpoint, **args)
24 |
25 |
26 | app.jinja_env.globals['url_for_other_page'] = url_for_other_page
27 |
28 |
29 | class SearchForm(Form):
30 | user_query = StringField('Query', validators=[DataRequired()])
31 | search_button = SubmitField("Search!")
32 |
33 |
34 | @app.route("/", methods=["GET", "POST"])
35 | def index():
36 | search_form = SearchForm(csrf_enabled=False)
37 | if search_form.validate_on_submit():
38 | return redirect(url_for("search_results", query=search_form.user_query.data))
39 | return render_template("index.html", form=search_form)
40 |
41 |
42 | @app.route("/search_results/", defaults={'page': 1})
43 | @app.route("/search_results//")
44 | def search_results(query, page):
45 | start_time = datetime.now()
46 | query_terms = to_query_terms(query)
47 | app.logger.info("Requested [{}]".format(" ".join(map(str, query_terms))))
48 | page_size = 25
49 | search_results = searcher.find_documents_and_rank_by_points(query_terms)
50 | docids = search_results.get_page(page, page_size)
51 | pagination = search_results.get_pagination(page, page_size)
52 | if page > pagination.pages:
53 | abort(404)
54 | docs = []
55 | for docid in docids:
56 | docs.append((searcher.indexes.get_url(docid), searcher.generate_snippet(query_terms, docid), searcher.indexes.get_title(docid)))
57 | finish_time = datetime.now()
58 |
59 | return render_template("search_results.html",
60 | processing_time=(finish_time-start_time),
61 | offset=((page-1)*page_size),
62 | total_doc_num=search_results.total_doc_num(),
63 | pagination=pagination,
64 | query=cgi.escape(query),
65 | docs=docs)
66 |
67 | if __name__ == "__main__":
68 | app.run(debug=True, host='0.0.0.0')
69 |
--------------------------------------------------------------------------------
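A minimal sketch, not repo code, of exercising both routes with Flask's test client; it assumes an index has already been built in `index_dir/` as described in the README:

```python
# Sketch only: poke the two routes without starting a real server.
import os
os.environ.setdefault("INDEXES_DIR", "index_dir/")  # must be set before importing web_ui

from web_ui import app  # loads the shelve indexes at import time

client = app.test_client()
print client.get("/").status_code                         # the search form
print client.get("/search_results/python").status_code    # results, page 1
print client.get("/search_results/python/2").status_code  # page 2 (404 if out of range)
```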
/indexer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | import os
4 | from lang_proc import to_doc_terms
5 | import json
6 | import shelve
7 | import math
8 | import workaround
9 |
10 |
11 | class ShelveIndexes(object):
12 | def __init__(self):
13 | # dict: term stem -> list of InvertedIndexHit objects (docid, position, score)
14 | self.inverted_index = None
15 | # dict: doc id (as str) -> Document (parsed text, score, title)
16 | self.forward_index = None
17 | self.url_to_id = None
18 | self.id_to_url = dict()
19 | self.doc_count = 0
20 | self.block_count = 0
21 |
22 | def total_doc_count(self):
23 | return self._doc_count
24 |
25 | def average_doclen(self):
26 | return self._avgdl
27 |
28 | def save_on_disk(self, index_dir):
29 | self.inverted_index.close()
30 | self.forward_index.close()
31 | self.url_to_id.close()
32 | self._merge_blocks()
33 |
34 | def load_from_disk(self, index_dir):
35 | self.inverted_index = shelve.open(os.path.join(index_dir, "inverted_index"))
36 | self.forward_index = shelve.open(os.path.join(index_dir, "forward_index"))
37 | self.url_to_id = shelve.open(os.path.join(index_dir, "url_to_id"))
38 | self.id_to_url = {v: k for k, v in self.url_to_id.items()}
39 |
40 | # TODO: avgdl and total doc count should be calculated when indexing
41 | self._doc_count = 0
42 | """
43 | total_word_count = 0
44 | for (docid, text) in self.forward_index.iteritems():
45 | self._doc_count += 1
46 | total_word_count += len(text.parsed_text)
47 | self._avgdl = total_word_count / self._doc_count
48 | """
49 |
50 | print "LOADED!"
51 |
52 | def start_indexing(self, index_dir):
53 | self.forward_index = shelve.open(os.path.join(index_dir, "forward_index"), "n", writeback=True)
54 | self.url_to_id = shelve.open(os.path.join(index_dir, "url_to_id"), "n", writeback=True)
55 | self.index_dir = index_dir
56 |
57 | def sync(self):
58 | self.inverted_index.sync()
59 | self.forward_index.sync()
60 | self.url_to_id.sync()
61 |
62 | def _merge_blocks(self):
63 | print "Merging blocks!"
64 | blocks = [shelve.open(os.path.join(self.index_dir, "inverted_index_block{}".format(i))) for i in xrange(self.block_count)]
65 | keys = set()
66 | for block in blocks:
67 | keys |= set(block.keys())
68 | print "Total word count", len(keys)
69 | merged_index = shelve.open(os.path.join(self.index_dir, "inverted_index"), "n")
70 | key_ind = 0
71 | for key in keys:
72 | key_ind += 1
73 | print "MERGING", key_ind, key
74 | merged_index[key] = sum([block.get(key, []) for block in blocks], [])
75 |
76 | merged_index.close()
77 |
78 | def _create_new_ii_block(self):
79 | print "Created a new block!"
80 | if self.inverted_index:
81 | self.inverted_index.close()
82 | self.inverted_index = shelve.open(os.path.join(self.index_dir, "inverted_index_block{}".format(self.block_count)), "n", writeback=True)
83 | self.block_count += 1
84 |
85 | def add_document(self, url, doc):
86 | if self.doc_count % 2000 == 0:
87 | self._create_new_ii_block()
88 |
89 | self.doc_count += 1
90 | assert url.encode('utf8') not in self.url_to_id
91 | current_id = self.doc_count
92 | self.url_to_id[url.encode('utf8')] = current_id
93 | self.id_to_url[current_id] = url
94 | self.forward_index[str(current_id)] = doc
95 | for position, term in enumerate(doc.parsed_text):
96 | if term.is_stop_word():
97 | continue
98 |
99 | stem = term.stem.encode('utf8')
100 | if stem not in self.inverted_index:
101 | self.inverted_index[stem] = []
102 | self.inverted_index[stem].append(workaround.InvertedIndexHit(current_id, position, doc.score))
103 |
104 | def get_documents(self, query_term):
105 | return self.inverted_index.get(query_term.stem.encode('utf8'), [])
106 |
107 | def get_document_text(self, doc_id):
108 | return self.forward_index[str(doc_id)].parsed_text
109 |
110 | def get_url(self, doc_id):
111 | return self.id_to_url[doc_id]
112 |
113 | def get_title(self, doc_id):
114 | return self.forward_index[str(doc_id)].title
115 |
116 |
117 | class SerpPagination(object):
118 | def __init__(self, page, page_size, total_doc_num):
119 | self.page = page
120 | self.page_size = page_size
121 | self.pages = max(1, (total_doc_num + page_size - 1) / page_size)  # ceiling division, at least one page
122 |
123 | def iter_pages(self):
124 | if self.pages == 1:
125 | return [1]
126 | if self.page <= 6:
127 | left_part = range(1, self.page)
128 | else:
129 | left_part = [1, None] + range(self.page - 4, self.page)
130 | right_part = range(self.page, min(self.pages + 1, self.page + 5))
131 |
132 | result = left_part + right_part
133 | if result[-1] != self.pages:  # more pages exist beyond the displayed window
134 | result.append(None)
135 |
136 | return result
137 |
138 |
139 |
140 |
141 | class SearchResults(object):
142 | def __init__(self, docids_with_relevance):
143 | self.docids, self.relevances = zip(*docids_with_relevance) if docids_with_relevance else ([], [])
144 |
145 | def get_page(self, page, page_size):
146 | start_num = (page-1)*page_size
147 | return self.docids[start_num:start_num+page_size]
148 |
149 | def get_pagination(self, page, page_size):
150 | return SerpPagination(page, page_size, len(self.docids))
151 |
152 | def total_doc_num(self):
153 | return len(self.docids)
154 |
155 |
156 | class Searcher(object):
157 | def __init__(self, index_dir, IndexesImplementation):
158 | self.indexes = IndexesImplementation()
159 | self.indexes.load_from_disk(index_dir)
160 |
161 | # The algorithms based on:
162 | # http://rcrezende.blogspot.com/2010/08/smallest-relevant-text-snippet-for.html
163 | def generate_snippet(self, query_terms, doc_id):
164 | query_terms_in_window = []
165 | best_window_len = 100500 # TODO: inf would be better :)
166 | terms_in_best_window = 0
167 | best_window = []
168 | for pos, term in enumerate(self.indexes.get_document_text(doc_id)):
169 | if term in query_terms:
170 | query_terms_in_window.append((term, pos))
171 | if len(query_terms_in_window) > 1 and query_terms_in_window[0][0] == term:
172 | query_terms_in_window.pop(0)
173 | current_window_len = pos - query_terms_in_window[0][1] + 1
174 | tiw = len(set(map(lambda x: x[0], query_terms_in_window)))
175 | if tiw > terms_in_best_window or (tiw == terms_in_best_window and current_window_len < best_window_len):
176 | terms_in_best_window = tiw
177 | best_window = query_terms_in_window[:]
178 | best_window_len = current_window_len
179 |
180 | doc_len = len(self.indexes.get_document_text(doc_id))
181 | # TODO: the context size (8 words on each side) should be a named constant
182 | snippet_start = max(best_window[0][1] - 8, 0)
183 | snippet_end = min(doc_len, best_window[len(best_window) - 1][1] + 1 + 8)
184 |
185 | snippet = [(term.full_word, term in query_terms) for term in self.indexes.get_document_text(doc_id)[snippet_start:snippet_end]]
186 | # TODO 50 should be a named constant too!
187 | if len(snippet) > 50:
188 | excessive_len = len(snippet) - 50
189 | snippet = snippet[:len(snippet) / 2 - excessive_len / 2] + [("...", False)] + snippet[len(snippet) / 2 + excessive_len / 2:]
190 |
191 |
192 | return snippet
193 |
194 | """
195 | def find_documents_AND(self, query_terms):
196 | # docid -> number of query words
197 | query_term_count = defaultdict(set)
198 | for query_term in query_terms:
199 | for (pos, docid) in self.indexes.get_documents(query_term):
200 | query_term_count[docid].add(query_term)
201 |
202 | return SearchResults(self.rank_docids([doc_id for doc_id, unique_hits in query_term_count.iteritems() if len(unique_hits) == len(query_terms)]))
203 | """
204 |
205 | def find_documents_and_rank_by_points(self, query_terms):
206 | docids_and_relevance = set()
207 | for query_term in query_terms:
208 | for hit in self.indexes.get_documents(query_term):
209 | docids_and_relevance.add((hit.docid, hit.score))
210 |
211 | return SearchResults(sorted(list(docids_and_relevance), key=lambda x: x[1], reverse=True))
212 |
213 | def _bm25(self, docid, query_terms_to_posting_lists_sizes):
214 | result = 0
215 | text = self.indexes.get_document_text(docid)
216 | text_len = len(text)
217 | for qt, nd_containing in query_terms_to_posting_lists_sizes.iteritems():
218 | term_frequency = float(len(filter(lambda t: qt == t, text))) / text_len
219 | inverted_document_frequency = math.log((self.indexes.total_doc_count() - nd_containing + 0.5) / (nd_containing + 0.5))
220 | k1 = 1.5
221 | b = 0.75
222 | result += inverted_document_frequency * (term_frequency * (k1+1)) / (term_frequency + k1*(1 - b + b * float(text_len) / self.indexes.average_doclen()))  # length normalization uses |D| / avgdl
223 |
224 | return result
225 |
226 | def find_documents_and_rank_by_bm25(self, query_terms):
227 | docids = set()
228 | query_terms_to_posting_lists_sizes = dict()
229 | for query_term in query_terms:
230 | posting_list = self.indexes.get_documents(query_term)
231 | query_terms_to_posting_lists_sizes[query_term] = len(posting_list)
232 | for hit in posting_list:
233 | docids.add(hit.docid)
234 |
235 | docids_and_relevance = set()
236 | for docid in docids:
237 | docids_and_relevance.add((docid, self._bm25(docid, query_terms_to_posting_lists_sizes)))
238 |
239 | return SearchResults(sorted(list(docids_and_relevance), key=lambda x: x[1], reverse=True))
240 |
241 |
242 |
243 | def create_index_from_dir_API(stored_documents_dir, index_dir, IndexesImplementation=ShelveIndexes):
244 | indexer = IndexesImplementation()
245 | indexer.start_indexing(index_dir)
246 | filenames = [name for name in os.listdir(stored_documents_dir)]
247 | # widgets = [' Indexing: ', Percentage(), ' ', Bar(marker=RotatingMarker())]
248 | indexed_docs_num = 0
249 | # progressbar = ProgressBar(widgets=widgets, maxval=len(filenames))
250 | for filename in filenames:
251 | indexed_docs_num += 1
252 | # progressbar.update(indexed_docs_num)
253 | opened_file = open(os.path.join(stored_documents_dir, filename))
254 | doc_json = json.load(opened_file)
255 | parsed_doc = to_doc_terms(doc_json['text'])
256 | print indexed_docs_num
257 | if indexed_docs_num % 100 == 0:
258 | print indexed_docs_num, "Syncing..."
259 | indexer.sync()
260 | print indexed_docs_num, "Synced!"
261 |
262 | indexer.add_document(doc_json['url'], workaround.Document(parsed_doc, int(doc_json['score']), doc_json["title"]))
263 | # progressbar.update(indexed_docs_num)
264 | indexer.save_on_disk(index_dir)
265 |
266 |
267 | def main():
268 | parser = argparse.ArgumentParser(description='Index /r/learnprogramming')
269 | parser.add_argument("--stored_documents_dir", dest="stored_documents_dir", required=True)
270 | parser.add_argument("--index_dir", dest="index_dir", required=True)
271 | args = parser.parse_args()
272 | create_index_from_dir_API(args.stored_documents_dir, args.index_dir)
273 |
274 |
275 | if __name__ == "__main__":
276 | main()
277 |
--------------------------------------------------------------------------------
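The `Searcher` can also be used outside the web UI, for example from a REPL. A minimal sketch, not repo code, assuming `index_dir/` was built as described in the README:

```python
# Sketch only: query the index directly.
from indexer import Searcher, ShelveIndexes
from lang_proc import to_query_terms

searcher = Searcher("index_dir/", ShelveIndexes)
results = searcher.find_documents_and_rank_by_points(to_query_terms("python tutorial"))

print results.total_doc_num(), "documents found"
for docid in results.get_page(1, 10):
    print searcher.indexes.get_title(docid), "->", searcher.indexes.get_url(docid)
```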