├── run_prod.sh ├── workaround.py ├── templates ├── index.html ├── macros.html ├── base.html └── search_results.html ├── requirements.txt ├── bootstrap_db.sh ├── crawler_utils.py ├── .gitignore ├── LICENSE ├── crawler_api.py ├── lang_proc.py ├── download_whole_subreddit.py ├── README.md ├── web_ui.py └── indexer.py /run_prod.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # TODO better way of finding gunicorn binary 3 | sudo ./venv/bin/gunicorn web_ui:app -b 0.0.0.0:80 --access-logfile - 4 | -------------------------------------------------------------------------------- /workaround.py: -------------------------------------------------------------------------------- 1 | class Document(object): 2 | def __init__(self, parsed_text, score, title): 3 | self.parsed_text = parsed_text 4 | self.score = score 5 | self.title = title 6 | 7 | 8 | class InvertedIndexHit(object): 9 | def __init__(self, docid, position, score): 10 | self.docid = docid 11 | self.position = position 12 | self.score = score 13 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% import "bootstrap/wtf.html" as wtf %} 4 | 5 | {% block title %} This is a search engine for /r/learnprogramming {% endblock %} 6 | 7 | {% block content %} 8 |
9 | Search /r/learnprogramming 10 | 11 | {{wtf.quick_form(form)}} 12 |
13 | {% endblock %} 14 | 15 | 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==0.10.1 2 | Flask-Bootstrap==3.3.0.1 3 | Flask-WTF==0.10.3 4 | Jinja2==2.7.3 5 | MarkupSafe==0.23 6 | WTForms==2.0.1 7 | Werkzeug==0.9.6 8 | argparse==1.2.1 9 | gunicorn==19.1.1 10 | itsdangerous==0.24 11 | nltk==3.0.0 12 | pip-autoremove==0.8.0 13 | git+https://github.com/praw-dev/praw.git@af8676fea6a1f51f357b719a73017ea980500aa7 14 | progressbar==2.2 15 | requests==2.5.0 16 | six==1.9.0 17 | update-checker==0.11 18 | wsgiref==0.1.2 19 | -------------------------------------------------------------------------------- /templates/macros.html: -------------------------------------------------------------------------------- 1 | {% macro render_pagination(pagination) %} 2 | {% if pagination.pages > 1 %} 3 | 16 | {% endif %} 17 | {% endmacro %} 18 | -------------------------------------------------------------------------------- /templates/base.html: -------------------------------------------------------------------------------- 1 | {% extends "bootstrap/base.html" %} 2 | 3 | {% block navbar %} 4 | Fork me on GitHub 5 | {%- endblock navbar %} 6 | 7 | -------------------------------------------------------------------------------- /bootstrap_db.sh: -------------------------------------------------------------------------------- 1 | echo "Script for bootstrapping the search engine databases" 2 | echo "" 3 | echo "This script will create the necessary folders, gets the posts from reddit, indexes them and runs the web server." 4 | echo "It will get posts for 60 seconds." 5 | 6 | rm -rf docs_dir/ 7 | rm -rf index_dir/ 8 | mkdir docs_dir 9 | mkdir index_dir 10 | 11 | timeout 60s python download_whole_subreddit.py --storage_dir docs_dir/ --timestamp_interval 9000 --subreddit learnprogramming 12 | python indexer.py --stored_documents_dir docs_dir/ --index_dir index_dir/ 13 | 14 | INDEXES_DIR=index_dir/ python web_ui.py 15 | -------------------------------------------------------------------------------- /crawler_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os.path 3 | 4 | 5 | def comments_to_json(comments): 6 | result = [] 7 | for comment in comments: 8 | result.append({"score": comment.score, 9 | "url": comment.permalink, 10 | "body": comment.body, 11 | "id": comment.id, 12 | "replies": comments_to_json(comment.replies)}) 13 | 14 | return result 15 | 16 | 17 | def save_submission(submission, storage_dir): 18 | with open(os.path.join(storage_dir, submission.id), "w") as f: 19 | f.write(json.dumps({"url": submission.permalink, 20 | "text": submission.selftext, 21 | "title": submission.title, 22 | "score": submission.score, 23 | "comments": comments_to_json(submission.comments)})) 24 | f.close() 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # 
before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Alexander Putilin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /templates/search_results.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% import "macros.html" as macros %} 4 | 5 | {% block title %} This is a search engine{% endblock %} 6 | 7 | {% block content %} 8 |
9 | 10 | Search results for [{{query | safe}}] 11 | 12 | 13 | {{total_doc_num}} results ({{processing_time.total_seconds()}} seconds) 14 | 15 | 16 | {% for doc in docs %} 17 | 18 | 19 | <a href="{{doc[0]}}">{{doc[2]}}</a> 20 | 21 | 22 | ... 23 | {% for word, bold in doc[1] %} 24 | {% if bold %}<b>{% endif %} 25 | {{ word }} 26 | {% if bold %}</b>{% endif %} 27 | {% endfor %} 28 | ... 29 | 30 | 31 | {% else %} 32 | 33 | No results found 34 | 35 | {% endfor %} 36 |
37 | 38 | 39 | {{ macros.render_pagination(pagination) }} 40 | 41 | 42 |
43 | {% endblock %} 44 | 45 | -------------------------------------------------------------------------------- /crawler_api.py: -------------------------------------------------------------------------------- 1 | import praw 2 | import logging 3 | import argparse 4 | from distutils.dir_util import mkpath 5 | from praw.helpers import submission_stream 6 | from crawler_utils import save_submission 7 | 8 | 9 | def get_as_much_stuff_as_possible(storage_dir): 10 | mkpath(storage_dir, mode=0755) 11 | r = praw.Reddit(user_agent='SearchingReddit project 0.2 by /u/godlikesme') 12 | for method_name in ["get_hot", "get_new", "get_top_from_all", "get_top_from_week", 13 | "get_top_from_month", "get_top_from_year", "get_top_from_day", 14 | "get_top_from_hour"]: 15 | method = getattr(r.get_subreddit('learnprogramming'), method_name) 16 | submissions = method(limit=1000) 17 | for s in submissions: 18 | save_submission(s, storage_dir) 19 | 20 | 21 | def crawl_continuously(storage_dir): 22 | r = praw.Reddit(user_agent='SearchingReddit project 0.2 by /u/godlikesme') 23 | for s in submission_stream(r, "learnprogramming"): 24 | s.replace_more_comments(limit=None) 25 | save_submission(s, storage_dir) 26 | 27 | 28 | def main(): 29 | logging.getLogger().setLevel(logging.DEBUG) 30 | 31 | parser = argparse.ArgumentParser(description='Crawl /r/learnprogramming using api') 32 | parser.add_argument("--storage_dir", dest="storage_dir", required=True) 33 | args = parser.parse_args() 34 | 35 | get_as_much_stuff_as_possible(args.storage_dir) 36 | crawl_continuously(args.storage_dir) 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /lang_proc.py: -------------------------------------------------------------------------------- 1 | from nltk.stem.porter import PorterStemmer 2 | from nltk.tokenize import sent_tokenize, TreebankWordTokenizer 3 | from nltk.corpus import stopwords 4 | _stop_words = stopwords.words('english') 5 | import itertools 6 | import string 7 | 8 | 9 | class Term(object): 10 | def __init__(self, full_word): 11 | self.full_word = full_word 12 | # TODO: Lemmatization requires downloads 13 | # wnl = WordNetLemmatizer() 14 | # lemmas = [wnl.lemmatize(token) for token in tokens] 15 | self.stem = PorterStemmer().stem(full_word).lower() 16 | 17 | def __eq__(self, other): 18 | return self.stem == other.stem 19 | 20 | def __hash__(self): 21 | return hash(self.stem) 22 | 23 | def __repr__(self): 24 | return "Term {}({})".format(self.stem.encode('utf8'), self.full_word.encode('utf8')) 25 | 26 | def __str__(self): 27 | return repr(self) 28 | 29 | def is_punctuation(self): 30 | return self.stem in string.punctuation 31 | 32 | def is_stop_word(self): 33 | return self.full_word in _stop_words 34 | 35 | 36 | def stem_and_tokenize_text(text): 37 | sents = sent_tokenize(text) 38 | tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(sent) for sent in sents])) 39 | terms = [Term(token) for token in tokens] 40 | return filter(lambda term: not term.is_punctuation(), terms) 41 | 42 | 43 | def to_query_terms(query_raw): 44 | # In case query and doc require different processing in the future 45 | return stem_and_tokenize_text(query_raw) 46 | 47 | 48 | def to_doc_terms(doc_raw): 49 | # In case query and doc require different processing in the future 50 | return stem_and_tokenize_text(doc_raw) 51 | -------------------------------------------------------------------------------- /download_whole_subreddit.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | import argparse 3 | from distutils.dir_util import mkpath 4 | import time 5 | import praw 6 | from crawler_utils import save_submission 7 | 8 | 9 | # Downloads all the self posts from given subreddit 10 | def download_the_whole_subreddit(storage_dir, subreddit_name, ts_interval, largest_timestamp): 11 | mkpath(storage_dir, mode=0755) 12 | r = praw.Reddit(user_agent='SearchingReddit project 0.2 by /u/godlikesme') 13 | if largest_timestamp is None: 14 | largest_timestamp = int(time.time()) + 12*3600 15 | cts2 = largest_timestamp 16 | cts1 = largest_timestamp - ts_interval 17 | current_ts_interval = ts_interval 18 | while True: 19 | try: 20 | search_results = list(r.search('timestamp:{}..{}'.format(cts1, cts2), subreddit=subreddit_name, syntax='cloudsearch')) 21 | except Exception as e: 22 | logging.exception(e) 23 | continue 24 | 25 | logging.debug("Got {} submissions in interval {}..{}".format(len(search_results), cts1, cts2)) 26 | if len(search_results) == 25: 27 | current_ts_interval /= 2 28 | cts1 = cts2 - current_ts_interval 29 | logging.debug("Reducing ts interval to {}".format(current_ts_interval)) 30 | continue 31 | 32 | for submission in search_results: 33 | submission.replace_more_comments(limit=None) 34 | save_submission(submission, storage_dir) 35 | 36 | cts2 = cts1 37 | cts1 = cts2 - current_ts_interval 38 | 39 | if cts1 < 0: 40 | break 41 | 42 | if len(search_results) <= 7: 43 | current_ts_interval *= 2 44 | logging.debug("Increasing ts interval to {}".format(current_ts_interval)) 45 | 46 | 47 | def main(): 48 | logging.getLogger().setLevel(logging.DEBUG) 49 | 50 | parser = argparse.ArgumentParser(description='Download the whole subreddit') 51 | parser.add_argument("--storage_dir", dest="storage_dir", required=True) 52 | parser.add_argument("--subreddit", dest="subreddit", required=True, help="Download the whole subreddit") 53 | parser.add_argument("--timestamp_interval", dest="timestamp_interval", type=int, required=True) 54 | parser.add_argument("--largest_timestamp", dest="largest_timestamp", type=int, required=False, default=None) 55 | args = parser.parse_args() 56 | 57 | download_the_whole_subreddit(args.storage_dir, args.subreddit, args.timestamp_interval, args.largest_timestamp) 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SearchingReddit 2 | =============== 3 | 4 | # Goal 5 | 6 | This is an example of small search engine that allows people to search over /r/learnprogramming subreddit. 7 | 8 | The goal of the project is to build a small but functional search engine and live stream it on youtube: http://www.youtube.com/c/AlexanderPutilin 9 | 10 | 11 | # Setting up 12 | 13 | 1) NLTK files: 14 | 15 | ``` 16 | >>> import nltk 17 | >>> nltk.download('punkt') 18 | >>> nltk.download('stopwords') 19 | ``` 20 | 21 | 2) Database: 22 | To run the search engine, you need to download posts from reddit and run the indexer to create the final database. You have 2 options for doing this. 23 | 24 |    a. Using the bootstrap script: 25 |      You can create a functional development environment by just running a single command. This will result in a database with very little results, but it's very easy to set up and enough to start developing. 26 | 27 | `$ ./db_bootstrap.sh` 28 | 29 |   b. 
Doing it manually: 30 |      If you want to run your own search engine, or need more result for your development, you can get the reddit data manually. This will allow you to have as much results as you need. 31 | 32 |      First, you need to create two directories. One for the reddit documents and one for the indexed database. Let's call them `docs_dir` and `index_dir`. 33 | 34 |      To get reddit data into the documents directory, you need to run `download_whole_subreddit.py` like this. 35 | 36 | `python download_whole_subreddit.py --storage_dir docs_dir/ --timestamp_interval 9000 --subreddit learnprogramming` 37 | 38 |      This command will download reddit posts from the /r/learnprogramming subreddit to the `docs_dir` directory. It will keep running and downloading posts until you stop it. 39 | 40 |      After downloading the data, you need to index it so the search engine can use it. To do this you need to run `indexer.py` like this. 41 | 42 | `python indexer.py --stored_documents_dir docs_dir/ --index_dir index_dir/` 43 | 44 |      This command will index the posts from the documents directory and put them into the `index_dir` folder. 45 | 46 |      After indexing, you can run the web server by running `web_ui.py`. You also need to supply the index directory as an environment variable. 47 | 48 | `INDEXES_DIR=index_dir/ python web_ui.py` 49 | 50 |      Running this will run the search engine with the database in the `index_dir` directory. 51 | -------------------------------------------------------------------------------- /web_ui.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, redirect, url_for, request, abort 2 | from flask_bootstrap import Bootstrap 3 | from flask_wtf import Form 4 | from wtforms import StringField, SubmitField 5 | from wtforms.validators import DataRequired 6 | from indexer import Searcher, ShelveIndexes 7 | from lang_proc import to_query_terms 8 | import logging 9 | import cgi 10 | from datetime import datetime 11 | import os 12 | import workaround # NOQA 13 | 14 | searcher = Searcher(os.environ["INDEXES_DIR"], ShelveIndexes) 15 | app = Flask(__name__) 16 | app.logger.setLevel(logging.DEBUG) 17 | Bootstrap(app) 18 | 19 | 20 | def url_for_other_page(page): 21 | args = request.view_args.copy() 22 | args['page'] = page 23 | return url_for(request.endpoint, **args) 24 | 25 | 26 | app.jinja_env.globals['url_for_other_page'] = url_for_other_page 27 | 28 | 29 | class SearchForm(Form): 30 | user_query = StringField('Query', validators=[DataRequired()]) 31 | search_button = SubmitField("Search!") 32 | 33 | 34 | @app.route("/", methods=["GET", "POST"]) 35 | def index(): 36 | search_form = SearchForm(csrf_enabled=False) 37 | if search_form.validate_on_submit(): 38 | return redirect(url_for("search_results", query=search_form.user_query.data)) 39 | return render_template("index.html", form=search_form) 40 | 41 | 42 | @app.route("/search_results/", defaults={'page': 1}) 43 | @app.route("/search_results//") 44 | def search_results(query, page): 45 | start_time = datetime.now() 46 | query_terms = to_query_terms(query) 47 | app.logger.info("Requested [{}]".format(" ".join(map(str, query_terms)))) 48 | page_size = 25 49 | search_results = searcher.find_documents_and_rank_by_points(query_terms) 50 | docids = search_results.get_page(page, page_size) 51 | pagination = search_results.get_pagination(page, page_size) 52 | if page > pagination.pages: 53 | abort(404) 54 | docs = [] 55 | for docid in 
docids: 56 | docs.append((searcher.indexes.get_url(docid), searcher.generate_snippet(query_terms, docid), searcher.indexes.get_title(docid))) 57 | finish_time = datetime.now() 58 | 59 | return render_template("search_results.html", 60 | processing_time=(finish_time-start_time), 61 | offset=((page-1)*page_size), 62 | total_doc_num=search_results.total_doc_num(), 63 | pagination=pagination, 64 | query=cgi.escape(query), 65 | docs=docs) 66 | 67 | if __name__ == "__main__": 68 | app.run(debug=True, host='0.0.0.0') 69 | -------------------------------------------------------------------------------- /indexer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import os 4 | from lang_proc import to_doc_terms 5 | import json 6 | import shelve 7 | import math 8 | import workaround 9 | 10 | 11 | class ShelveIndexes(object): 12 | def __init__(self): 13 | # map(dict): from word to ids of documents that contain the word 14 | self.inverted_index = None 15 | # list of parsed documents (map from doc id to list of words in a document) 16 | self.forward_index = None 17 | self.url_to_id = None 18 | self.id_to_url = dict() 19 | self.doc_count = 0 20 | self.block_count = 0 21 | 22 | def total_doc_count(self): 23 | return self._doc_count 24 | 25 | def average_doclen(self): 26 | return self._avgdl 27 | 28 | def save_on_disk(self, index_dir): 29 | self.inverted_index.close() 30 | self.forward_index.close() 31 | self.url_to_id.close() 32 | self._merge_blocks() 33 | 34 | def load_from_disk(self, index_dir): 35 | self.inverted_index = shelve.open(os.path.join(index_dir, "inverted_index")) 36 | self.forward_index = shelve.open(os.path.join(index_dir, "forward_index")) 37 | self.url_to_id = shelve.open(os.path.join(index_dir, "url_to_id")) 38 | self.id_to_url = {v: k for k, v in self.url_to_id.items()} 39 | 40 | # TODO: avgdl and total doc count should be calculated when indexing 41 | self._doc_count = 0 42 | """ 43 | total_word_count = 0 44 | for (docid, text) in self.forward_index.iteritems(): 45 | self._doc_count += 1 46 | total_word_count += len(text.parsed_text) 47 | self._avgdl = total_word_count / self._doc_count 48 | """ 49 | 50 | print "LOADED!" 51 | 52 | def start_indexing(self, index_dir): 53 | self.forward_index = shelve.open(os.path.join(index_dir, "forward_index"), "n", writeback=True) 54 | self.url_to_id = shelve.open(os.path.join(index_dir, "url_to_id"), "n", writeback=True) 55 | self.index_dir = index_dir 56 | 57 | def sync(self): 58 | self.inverted_index.sync() 59 | self.forward_index.sync() 60 | self.url_to_id.sync() 61 | 62 | def _merge_blocks(self): 63 | print "Merging blocks!" 64 | blocks = [shelve.open(os.path.join(self.index_dir, "inverted_index_block{}".format(i))) for i in xrange(self.block_count)] 65 | keys = set() 66 | for block in blocks: 67 | keys |= set(block.keys()) 68 | print "Total word count", len(keys) 69 | merged_index = shelve.open(os.path.join(self.index_dir, "inverted_index"), "n") 70 | key_ind = 0 71 | for key in keys: 72 | key_ind += 1 73 | print "MERGING", key_ind, key 74 | merged_index[key] = sum([block.get(key, []) for block in blocks], []) 75 | 76 | merged_index.close() 77 | 78 | def _create_new_ii_block(self): 79 | print "Created a new block!" 
80 | if self.inverted_index: 81 | self.inverted_index.close() 82 | self.inverted_index = shelve.open(os.path.join(self.index_dir, "inverted_index_block{}".format(self.block_count)), "n", writeback=True) 83 | self.block_count += 1 84 | 85 | def add_document(self, url, doc): 86 | if self.doc_count % 2000 == 0: 87 | self._create_new_ii_block() 88 | 89 | self.doc_count += 1 90 | assert url.encode('utf8') not in self.url_to_id 91 | current_id = self.doc_count 92 | self.url_to_id[url.encode('utf8')] = current_id 93 | self.id_to_url[current_id] = url 94 | self.forward_index[str(current_id)] = doc 95 | for position, term in enumerate(doc.parsed_text): 96 | if term.is_stop_word(): 97 | continue 98 | 99 | stem = term.stem.encode('utf8') 100 | if stem not in self.inverted_index: 101 | self.inverted_index[stem] = [] 102 | self.inverted_index[stem].append(workaround.InvertedIndexHit(current_id, position, doc.score)) 103 | 104 | def get_documents(self, query_term): 105 | return self.inverted_index.get(query_term.stem.encode('utf8'), []) 106 | 107 | def get_document_text(self, doc_id): 108 | return self.forward_index[str(doc_id)].parsed_text 109 | 110 | def get_url(self, doc_id): 111 | return self.id_to_url[doc_id] 112 | 113 | def get_title(self, doc_id): 114 | return self.forward_index[str(doc_id)].title 115 | 116 | 117 | class SerpPagination(object): 118 | def __init__(self, page, page_size, total_doc_num): 119 | self.page = page 120 | self.page_size = page_size 121 | self.pages = (total_doc_num / page_size) + 1 122 | 123 | def iter_pages(self): 124 | if self.pages == 1: 125 | return [1] 126 | if self.page <= 6: 127 | left_part = range(1, self.page) 128 | else: 129 | left_part = [1, None] + range(self.page - 4, self.page) 130 | right_part = range(self.page, min(self.pages + 1, self.page + 5)) 131 | 132 | result = left_part + right_part 133 | if result[-1] != self.page: 134 | result.append(None) 135 | 136 | return result 137 | 138 | 139 | 140 | 141 | class SearchResults(object): 142 | def __init__(self, docids_with_relevance): 143 | self.docids, self.relevances = zip(*docids_with_relevance) if docids_with_relevance else ([], []) 144 | 145 | def get_page(self, page, page_size): 146 | start_num = (page-1)*page_size 147 | return self.docids[start_num:start_num+page_size] 148 | 149 | def get_pagination(self, page, page_size): 150 | return SerpPagination(page, page_size, len(self.docids)) 151 | 152 | def total_doc_num(self): 153 | return len(self.docids) 154 | 155 | 156 | class Searcher(object): 157 | def __init__(self, index_dir, IndexesImplementation): 158 | self.indexes = IndexesImplementation() 159 | self.indexes.load_from_disk(index_dir) 160 | 161 | # The algorithms based on: 162 | # http://rcrezende.blogspot.com/2010/08/smallest-relevant-text-snippet-for.html 163 | def generate_snippet(self, query_terms, doc_id): 164 | query_terms_in_window = [] 165 | best_window_len = 100500 # TODO: inf would be better :) 166 | terms_in_best_window = 0 167 | best_window = [] 168 | for pos, term in enumerate(self.indexes.get_document_text(doc_id)): 169 | if term in query_terms: 170 | query_terms_in_window.append((term, pos)) 171 | if len(query_terms_in_window) > 1 and query_terms_in_window[0][0] == term: 172 | query_terms_in_window.pop(0) 173 | current_window_len = pos - query_terms_in_window[0][1] + 1 174 | tiw = len(set(map(lambda x: x[0], query_terms_in_window))) 175 | if tiw > terms_in_best_window or (tiw == terms_in_best_window and current_window_len < best_window_len): 176 | terms_in_best_window = tiw 177 | 
best_window = query_terms_in_window[:] 178 | best_window_len = current_window_len 179 | 180 | doc_len = len(self.indexes.get_document_text(doc_id)) 181 | # TODO: 15 should be a named constant 182 | snippet_start = max(best_window[0][1] - 8, 0) 183 | snippet_end = min(doc_len, best_window[len(best_window) - 1][1] + 1 + 8) 184 | 185 | snippet = [(term.full_word, term in query_terms) for term in self.indexes.get_document_text(doc_id)[snippet_start:snippet_end]] 186 | # TODO 50 should be a named constant too! 187 | if len(snippet) > 50: 188 | excessive_len = len(snippet) - 50 189 | snippet = snippet[:len(snippet) / 2 - excessive_len / 2] + [("...", False)] + snippet[len(snippet) / 2 + excessive_len / 2:] 190 | 191 | 192 | return snippet 193 | 194 | """ 195 | def find_documents_AND(self, query_terms): 196 | # docid -> number of query words 197 | query_term_count = defaultdict(set) 198 | for query_term in query_terms: 199 | for (pos, docid) in self.indexes.get_documents(query_term): 200 | query_term_count[docid].add(query_term) 201 | 202 | return SearchResults(self.rank_docids([doc_id for doc_id, unique_hits in query_term_count.iteritems() if len(unique_hits) == len(query_terms)])) 203 | """ 204 | 205 | def find_documents_and_rank_by_points(self, query_terms): 206 | docids_and_relevance = set() 207 | for query_term in query_terms: 208 | for hit in self.indexes.get_documents(query_term): 209 | docids_and_relevance.add((hit.docid, hit.score)) 210 | 211 | return SearchResults(sorted(list(docids_and_relevance), key=lambda x: x[1], reverse=True)) 212 | 213 | def _bm25(self, docid, query_terms_to_posting_lists_sizes): 214 | result = 0 215 | text = self.indexes.get_document_text(docid) 216 | text_len = len(text) 217 | for qt, nd_containing in query_terms_to_posting_lists_sizes.iteritems(): 218 | term_frequency = float(len(filter(lambda t: qt == t, text))) / text_len 219 | inverted_document_frequency = math.log((self.indexes.total_doc_count() - nd_containing + 0.5) / (nd_containing + 0.5)) 220 | k1 = 1.5 221 | b = 0.75 222 | result += inverted_document_frequency * (term_frequency * (k1+1)) / (term_frequency + k1*(1 - b + b * query_terms_to_posting_lists_sizes[qt] / self.indexes.average_doclen())) 223 | 224 | return result 225 | 226 | def find_documents_and_rank_by_bm25(self, query_terms): 227 | docids = set() 228 | query_terms_to_posting_lists_sizes = dict() 229 | for query_term in query_terms: 230 | posting_list = self.indexes.get_documents(query_term) 231 | query_terms_to_posting_lists_sizes[query_term] = len(posting_list) 232 | for hit in posting_list: 233 | docids.add(hit.docid) 234 | 235 | docids_and_relevance = set() 236 | for docid in docids: 237 | docids_and_relevance.add((docid, self._bm25(docid, query_terms_to_posting_lists_sizes))) 238 | 239 | return SearchResults(sorted(list(docids_and_relevance), key=lambda x: x[1], reverse=True)) 240 | 241 | 242 | 243 | def create_index_from_dir_API(stored_documents_dir, index_dir, IndexesImplementation=ShelveIndexes): 244 | indexer = IndexesImplementation() 245 | indexer.start_indexing(index_dir) 246 | filenames = [name for name in os.listdir(stored_documents_dir)] 247 | # widgets = [' Indexing: ', Percentage(), ' ', Bar(marker=RotatingMarker())] 248 | indexed_docs_num = 0 249 | # progressbar = ProgressBar(widgets=widgets, maxval=len(filenames)) 250 | for filename in filenames: 251 | indexed_docs_num += 1 252 | # progressbar.update(indexed_docs_num) 253 | opened_file = open(os.path.join(stored_documents_dir, filename)) 254 | doc_json = 
json.load(opened_file) 255 | parsed_doc = to_doc_terms(doc_json['text']) 256 | print indexed_docs_num 257 | if indexed_docs_num % 100 == 0: 258 | print indexed_docs_num, "Syncing..." 259 | indexer.sync() 260 | print indexed_docs_num, "Synced!" 261 | 262 | indexer.add_document(doc_json['url'], workaround.Document(parsed_doc, int(doc_json['score']), doc_json["title"])) 263 | # progressbar.update(indexed_docs_num) 264 | indexer.save_on_disk(index_dir) 265 | 266 | 267 | def main(): 268 | parser = argparse.ArgumentParser(description='Index /r/learnprogramming') 269 | parser.add_argument("--stored_documents_dir", dest="stored_documents_dir", required=True) 270 | parser.add_argument("--index_dir", dest="index_dir", required=True) 271 | args = parser.parse_args() 272 | create_index_from_dir_API(args.stored_documents_dir, args.index_dir) 273 | 274 | 275 | if __name__ == "__main__": 276 | main() 277 | --------------------------------------------------------------------------------
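The files above already show the full query path: web_ui.py builds a Searcher over ShelveIndexes, converts the raw query with to_query_terms, ranks hits with find_documents_and_rank_by_points, and renders snippets via generate_snippet. As a minimal sketch of driving that same API without the Flask layer, the script below could sit next to indexer.py; the file name query_cli.py, the argument handling, and the page size of 10 are assumptions, while every Searcher/ShelveIndexes call is taken directly from web_ui.py and indexer.py.

```
# query_cli.py -- hypothetical helper, not part of the repository.
# Loads an index built by indexer.py and prints the first page of
# results for a query, mirroring the calls made in web_ui.py.
import sys

from indexer import Searcher, ShelveIndexes
from lang_proc import to_query_terms


def main():
    index_dir = sys.argv[1]            # e.g. index_dir/ from bootstrap_db.sh
    raw_query = " ".join(sys.argv[2:])

    searcher = Searcher(index_dir, ShelveIndexes)
    query_terms = to_query_terms(raw_query)

    results = searcher.find_documents_and_rank_by_points(query_terms)
    print "{} matching documents".format(results.total_doc_num())

    for docid in results.get_page(1, 10):
        print searcher.indexes.get_title(docid)
        print searcher.indexes.get_url(docid)
        snippet = searcher.generate_snippet(query_terms, docid)
        print " ".join(word for word, _ in snippet)
        print


if __name__ == "__main__":
    main()
```

Usage would look like `python query_cli.py index_dir/ "binary search"` (Python 2, matching the print statements used elsewhere in the repository).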