├── .gitignore ├── Pipfile ├── Pipfile.lock ├── README.md ├── data └── .gitinclude ├── example_1.py ├── index └── __init__.py ├── lab_1.py ├── lab_2.py ├── magazine_index └── __init__.py └── ranking ├── __init__.py └── tf_idf_ranker.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | .idea/ 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | *.so 7 | .Python 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | wheels/ 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | MANIFEST 24 | *.manifest 25 | *.spec 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | htmlcov/ 29 | .tox/ 30 | .coverage 31 | .coverage.* 32 | .cache 33 | nosetests.xml 34 | coverage.xml 35 | *.cover 36 | .hypothesis/ 37 | *.mo 38 | *.pot 39 | *.log 40 | local_settings.py 41 | instance/ 42 | .webassets-cache 43 | .scrapy 44 | docs/_build/ 45 | target/ 46 | .ipynb_checkpoints 47 | .python-version 48 | celerybeat-schedule 49 | *.sage.py 50 | .env 51 | .venv 52 | env/ 53 | venv/ 54 | ENV/ 55 | env.bak/ 56 | venv.bak/ 57 | .spyderproject 58 | .spyproject 59 | .ropeproject 60 | /site 61 | .mypy_cache/ 62 | index.bin 63 | data 64 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.python.org/simple" 3 | name = "pypi" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | nltk = "*" 10 | halo = "*" 11 | 12 | 13 | [requires] 14 | python_version = "3.5" -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "3e336dcded8f717d91613a8638e557e22f73a50d56d97af723a37ed2913ef231" 5 | }, 6 | "host-environment-markers": { 7 | "implementation_name": "cpython", 8 | "implementation_version": "3.5.2", 9 | "os_name": "posix", 10 | "platform_machine": "x86_64", 11 | "platform_python_implementation": "CPython", 12 | "platform_release": "4.10.0-35-generic", 13 | "platform_system": "Linux", 14 | "platform_version": "#39~16.04.1-Ubuntu SMP Wed Sep 13 09:02:42 UTC 2017", 15 | "python_full_version": "3.5.2", 16 | "python_version": "3.5", 17 | "sys_platform": "linux" 18 | }, 19 | "pipfile-spec": 6, 20 | "requires": { 21 | "python_version": "3.5" 22 | }, 23 | "sources": [ 24 | { 25 | "name": "pypi", 26 | "url": "https://pypi.python.org/simple", 27 | "verify_ssl": true 28 | } 29 | ] 30 | }, 31 | "default": { 32 | "colorama": { 33 | "hashes": [ 34 | "sha256:463f8483208e921368c9f306094eb6f725c6ca42b0f97e313cb5d5512459feda", 35 | "sha256:48eb22f4f8461b1df5734a074b57042430fb06e1d61bd1e11b078c0fe6d7a1f1" 36 | ], 37 | "version": "==0.3.9" 38 | }, 39 | "cursor": { 40 | "hashes": [ 41 | "sha256:61041d4362ce3a486f3bb2f412b9f6e492c90e0abfa54d0f69ac2e08984b6e6d" 42 | ], 43 | "version": "==1.1.0" 44 | }, 45 | "enum34": { 46 | "hashes": [ 47 | "sha256:6bd0f6ad48ec2aa117d3d141940d484deccda84d4fcd884f5c3d93c23ecd8c79", 48 | "sha256:644837f692e5f550741432dd3f223bbb9852018674981b1664e5dc339387588a", 49 | "sha256:8ad8c4783bf61ded74527bffb48ed9b54166685e4230386a9ed9b1279e2df5b1", 50 | "sha256:2d81cbbe0e73112bdfe6ef8576f2238f2ba27dd0d55752a776c41d38b7da2850" 51 | ], 52 | "version": "==1.1.6" 53 | }, 54 | "halo": { 55 | "hashes": [ 56 | "sha256:d0b5fb361c17ad31e5eea8220f9460997b80e2b05b5d98b1510f5506bb465d86" 57 | ], 58 | "version": "==0.0.6" 59 | }, 60 | "log-symbols": { 61 | "hashes": [ 62 | "sha256:87be2f283cd6f455d89b76abcf2805fad5692ec9dcd8a31d10a4c975f51392eb" 63 | ], 64 | "version": "==0.0.11" 65 | }, 66 | "nltk": { 67 | "hashes": [ 68 | "sha256:2661f9971d983db314bbebd51ba770811a362c6597fd0f303bb1d3beadcb4834", 69 | "sha256:8a3bad9ff7f67d2828a7915c2fab93e6ca910c2015ef40799e1805fccf2354e5" 70 | ], 71 | "version": "==3.2.5" 72 | }, 73 | "six": { 74 | "hashes": [ 75 | "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb", 76 | "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9" 77 | ], 78 | "version": "==1.11.0" 79 | }, 80 | "spinners": { 81 | "hashes": [ 82 | "sha256:f38891d1e21bf188e1adbe832fbe7e8e365ae255ae71dd230b25fc83ea6c98e2" 83 | ], 84 | "version": "==0.0.19" 85 | }, 86 | "termcolor": { 87 | "hashes": [ 88 | "sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b" 89 | ], 90 | "version": "==1.1.0" 91 | } 92 | }, 93 | "develop": {} 94 | } 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Index Search Query 2 | 3 | Inverted Index, Query Formulation and Ranking from Scratch in Python. 4 | 5 | > Part of Information Retrieval Lab (Autumn 2017-18) 6 | 7 | ## Part 1: The Inverted Index 8 | 9 | ### Dataset 10 | 11 | The dataset used for this purpose is taken from the `FIRE 2011` corpus. It can be downloaded from [here](http://www.isical.ac.in/~fire/data/docs/adhoc/en.docs.2011.tar.gpg). It contains articles from two different magazines. The methods for handling these files are present in the [`magazine_index`](magazine_index) package. 12 | 13 | ### Usage 14 | 15 | If you wish to index all the files recursively from a directory, use the following command - 16 | 17 | ```bash 18 | $ python lab1.py path/to/files 19 | ``` 20 | 21 | This will create an inverted index and save it to a file called `index.bin`. You can directly use this file if created already by not passing any argument to the script - 22 | 23 | ```bash 24 | $ python lab1.py 25 | Loading index from "index.bin" 26 | 27 | ... 28 | ``` 29 | 30 | ### Using a pre-built index 31 | 32 | Since indexing documents can take a lot of time, here are some already indexed files which can be renamed to `index.bin` and used directly - 33 | 34 | | Name | Link | Size | Comments | 35 | |------|------|------|----------| 36 | | `index.bin` | [LINK](https://drive.google.com/open?id=0BxDMRh_L_8pOUzlZQ0JJMUtYd1E) | 478 MB | Full index, 392k documents | 37 | | `index.bin.bak1` | [LINK](https://drive.google.com/file/d/0BxDMRh_L_8pObWU0ZkE1NHBTUUU/view?usp=sharing) | 374 MB | 303k documents | 38 | | `index.bin.bak` | [LINK](https://drive.google.com/file/d/0BxDMRh_L_8pOYmRKU0I5MWJhbG8/view?usp=sharing) | 36 MB | 25.8k documents | 39 | 40 | ### Example 41 | 42 | ```bash 43 | $ python lab_1.py 44 | Loading index from "index.bin" 45 | 46 | Please start entering words to get top 5 documents containing them (CTRL+C to exit) - 47 | Enter word: market 48 | [('1100110_calcutta_story_11965855.utf8', 58), ('1070603_calcutta_story_7858507.utf8', 31), ('1100326_opinion_story_12251777.utf8', 30), ('1050912_frontpage_story_5227346.utf8', 30), ('1040406_opinion_story_2948544.utf8', 29)] 49 | Enter word: delhi 50 | [('1080422_sports_ipl.utf8', 30), ('1031223_opinion_story_2710457.utf8', 28), ('1090225_sports_story_10587273.utf8', 22), ('1090812_sports_story_11351508.utf8', 21), ('1100223_sports_story_12140507.utf8', 21)] 51 | Enter word: messi 52 | [('1100612_sports_story_12557276.utf8', 27), ('1100527_sports_story_12492679.utf8', 17), ('1100619_sports_story_12582889.utf8', 17), ('1090529_calcutta_story_11031479.utf8', 16), ('1100613_frontpage_story_12560387.utf8', 12)] 53 | ``` 54 | 55 | 56 | 57 | ## Part 2: Ranking of Documents 58 | 59 | ### Usage 60 | 61 | You can use the pre-built index here. 62 | 63 | ```bash 64 | $ python lab_2.py index.bin 65 | Loading index from index.bin 66 | Enter query: programming 67 | en.15.66.21.2008.5.9 : 97.3490637742472 68 | en.3.347.409.2010.2.2 : 79.64923399711134 69 | en.3.373.142.2007.6.8 : 53.09948933140756 70 | en.3.406.410.2007.11.18 : 48.6745318871236 71 | en.2.296.350.2010.1.20 : 48.6745318871236 72 | en.3.393.372.2007.9.23 : 44.24957444283963 73 | en.3.321.344.2009.7.31 : 44.24957444283963 74 | en.3.373.299.2007.6.11 : 44.24957444283963 75 | en.15.109.486.2009.4.1 : 44.24957444283963 76 | en.3.393.75.2007.9.24 : 44.24957444283963 77 | ``` 78 | 79 | 80 | --- 81 | 82 | ## Development 83 | 84 | `pipenv` is used for this project - 85 | 86 | ```bash 87 | $ sudo -H pip install pipenv 88 | ``` 89 | 90 | To install dependencies, simply 91 | 92 | ```bash 93 | $ pipenv install 94 | ``` 95 | 96 | To enter a virtualenv shell 97 | 98 | ```bash 99 | $ pipenv shell 100 | ``` 101 | 102 | This will spawn a new shell where all dependencies will be present. 103 | -------------------------------------------------------------------------------- /data/.gitinclude: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/singhpratyush/index-search-query/75d608f40ef98294475cced9a3003ab8fe9fd777/data/.gitinclude -------------------------------------------------------------------------------- /example_1.py: -------------------------------------------------------------------------------- 1 | import index 2 | 3 | my_index = index.Index() 4 | 5 | my_index.index('1', 'hello, my world! going to work world! :(') 6 | my_index.index('2', 'bye bye world, going to sleep :)') 7 | print(my_index.get_docs_for_token('world')) 8 | -------------------------------------------------------------------------------- /index/__init__.py: -------------------------------------------------------------------------------- 1 | import math 2 | import operator 3 | import pickle 4 | import queue 5 | import threading 6 | 7 | from nltk.corpus import stopwords as nltk_stopwords 8 | from nltk.stem import PorterStemmer 9 | from nltk.tokenize import RegexpTokenizer 10 | 11 | 12 | class Index: 13 | 14 | tokenizer = RegexpTokenizer(r'\w+') 15 | stop_words = nltk_stopwords.words() 16 | stemmer = PorterStemmer() 17 | 18 | def __init__(self, verbose=False): 19 | self._doc_set = set() 20 | self._total_words = 0 21 | self._inverted_index = {} 22 | self._lock = threading.Lock() 23 | self._bulk_index_queue = queue.Queue() 24 | self._verbose = verbose 25 | 26 | def idf(self, token): 27 | if token not in self._inverted_index: 28 | return 0 29 | count = len(self._inverted_index[token]['frequency']) 30 | return math.log(self.doc_count() / count) 31 | 32 | def print(self, content): 33 | if self._verbose: 34 | print(content) 35 | 36 | def __str__(self): 37 | return '' % (self.doc_count(), self.word_count()) 38 | 39 | @staticmethod 40 | def clean(content): 41 | tokens = Index.tokenizer.tokenize(content) 42 | tokens = [Index.stemmer.stem(i) for i in tokens if i not in Index.stop_words] 43 | return tokens 44 | 45 | def repopulate_counts(self): 46 | self._total_words = sum([i[1]['count'] for i in self._inverted_index.items()]) 47 | 48 | def word_count(self): 49 | return self._total_words 50 | 51 | def index(self, document_id, content, repopulate=True): 52 | tokens = Index.clean(content) 53 | token_set = set(tokens) 54 | for token in token_set: 55 | t_c = tokens.count(token) 56 | self._update_inverted_index(token, document_id, t_c) 57 | if repopulate: 58 | self.repopulate_counts() 59 | self._doc_set.add(document_id) 60 | return self.word_count() 61 | 62 | def _update_inverted_index(self, token, document, count): 63 | if token in self._inverted_index: 64 | with self._lock: 65 | self._inverted_index[token]['frequency'][document] = count 66 | self._inverted_index[token]['count'] += count 67 | else: 68 | with self._lock: 69 | self._inverted_index[token] = { 70 | 'count': count, 71 | 'frequency': {document: count} 72 | } 73 | 74 | def doc_count(self): 75 | return len(self._doc_set) 76 | 77 | def save(self, filename): 78 | with open(filename, 'wb') as f: 79 | pickle.dump(self._inverted_index, f) 80 | 81 | def load(self, filename): 82 | with open(filename, 'rb') as f: 83 | self._inverted_index = pickle.load(f) 84 | return self._populate_documents() 85 | 86 | def _populate_documents(self): 87 | self._doc_set = set() 88 | for token in self._inverted_index: 89 | frequencies = self._inverted_index[token]['frequency'] 90 | for doc in frequencies: 91 | self._doc_set.add(doc) 92 | self.repopulate_counts() 93 | return self.doc_count() 94 | 95 | @staticmethod 96 | def from_file(filename): 97 | index = Index() 98 | index.load(filename) 99 | return index 100 | 101 | def get_docs_for_token(self, token, count=None): 102 | token = Index.clean(token) 103 | if len(token) == 0: 104 | return [] 105 | token = token[0] 106 | docs = self._inverted_index.get(token, {'frequency': {}})['frequency'] 107 | sorted_docs = sorted(docs.items(), key=operator.itemgetter(1), reverse=True) 108 | doc_list = list(sorted_docs) 109 | return doc_list if count is None else doc_list[:count] 110 | 111 | def bulk_index(self, doc_list, threads=8): 112 | for doc_item in doc_list: 113 | self.print('Added doc %s to queue' % doc_item[0]) 114 | self._bulk_index_queue.put(doc_item) 115 | thread_list = [] 116 | for i in range(threads): 117 | th = threading.Thread(target=self._bulk_index_worker) 118 | th.start() 119 | thread_list.append(th) 120 | for th in thread_list: 121 | th.join() 122 | self.repopulate_counts() 123 | return self.doc_count() 124 | 125 | def _bulk_index_worker(self): 126 | while True: 127 | try: 128 | doc_id, content = self._bulk_index_queue.get(timeout=0.1) 129 | except: 130 | return 131 | self.index(doc_id, content, repopulate=False) 132 | self.print('Remaining - %s. Indexed %s' % (self._bulk_index_queue.qsize(), doc_id)) 133 | -------------------------------------------------------------------------------- /lab_1.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import index as ir_index 3 | import magazine_index 4 | 5 | 6 | def main(): 7 | if len(sys.argv) == 2: 8 | path = sys.argv[1] 9 | print('Creating index from data at %s' % path) 10 | index = magazine_index.create_dictionary_index(path) 11 | print('Saving index to "index.bin"') 12 | index.save('index.bin') 13 | print('Loading index from "index.bin"') 14 | index = ir_index.Index.from_file('index.bin') 15 | print(index) 16 | print('Please start entering words to get top 5 documents containing them (CTRL+C to exit) - ') 17 | while True: 18 | try: 19 | token = input('Enter word: ') 20 | print(index.get_docs_for_token(token, count=5)) 21 | except KeyboardInterrupt: 22 | break 23 | 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /lab_2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from index import Index 4 | from ranking.tf_idf_ranker import TfIdfRanker 5 | 6 | 7 | def main(): 8 | file = sys.argv[1] 9 | print('Loading index from %s' % file) 10 | index = Index.from_file(file) 11 | while True: 12 | try: 13 | query = input('Enter query: ') 14 | documents = TfIdfRanker.search(index, query) 15 | for doc, score in documents: 16 | print('%s : %s' % (doc, score)) 17 | except KeyboardInterrupt: 18 | break 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /magazine_index/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from index import Index 4 | 5 | re_text = re.compile(r'(.*?)', re.DOTALL) 6 | 7 | 8 | def render(path): 9 | with open(path) as f: 10 | return path.split('/')[-1], re_text.findall(f.read())[0] 11 | 12 | 13 | def lazy_load_docs(path): 14 | docs = [path] 15 | while docs: 16 | top = docs.pop() 17 | if os.path.isdir(top): 18 | for i in os.listdir(top): 19 | abs_path = os.path.join(top, i) 20 | docs.append(abs_path) 21 | elif top.endswith('.utf8') or top.split('/')[-1].startswith('en.'): 22 | yield render(top) 23 | 24 | 25 | def create_dictionary_index(path): 26 | index = Index(verbose=True) 27 | index.bulk_index(lazy_load_docs(path), threads=16) 28 | return index 29 | -------------------------------------------------------------------------------- /ranking/__init__.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | from index import Index 4 | 5 | 6 | class Ranker(abc.ABC): 7 | 8 | def __init__(self): 9 | pass 10 | 11 | @staticmethod 12 | @abc.abstractclassmethod 13 | def get_top_docs(index, tokens): 14 | pass 15 | 16 | @classmethod 17 | def search(cls, index, query, count=10): 18 | assert isinstance(index, Index) 19 | tokens = Index.clean(query) 20 | top_docs = list(cls.get_top_docs(index, tokens)) 21 | return top_docs[:count] 22 | -------------------------------------------------------------------------------- /ranking/tf_idf_ranker.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | from ranking import Ranker 4 | 5 | 6 | class TfIdfRanker(Ranker): 7 | 8 | @staticmethod 9 | def get_top_docs(index, tokens): 10 | documents = {} 11 | for token in tokens: 12 | relevant_docs = index.get_docs_for_token(token) 13 | for doc_id, freq in relevant_docs: 14 | if doc_id not in documents: 15 | documents[doc_id] = 0. 16 | documents[doc_id] += freq * index.idf(token) 17 | return sorted(documents.items(), key=operator.itemgetter(1), reverse=True) 18 | --------------------------------------------------------------------------------