├── .gitignore
├── Pipfile
├── Pipfile.lock
├── README.md
├── data
    └── .gitinclude
├── example_1.py
├── index
    └── __init__.py
├── lab_1.py
├── lab_2.py
├── magazine_index
    └── __init__.py
└── ranking
    ├── __init__.py
    └── tf_idf_ranker.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Created by .ignore support plugin (hsz.mobi)
 2 | .idea/
 3 | __pycache__/
 4 | *.py[cod]
 5 | *$py.class
 6 | *.so
 7 | .Python
 8 | build/
 9 | develop-eggs/
10 | dist/
11 | downloads/
12 | eggs/
13 | .eggs/
14 | lib/
15 | lib64/
16 | parts/
17 | sdist/
18 | var/
19 | wheels/
20 | *.egg-info/
21 | .installed.cfg
22 | *.egg
23 | MANIFEST
24 | *.manifest
25 | *.spec
26 | pip-log.txt
27 | pip-delete-this-directory.txt
28 | htmlcov/
29 | .tox/
30 | .coverage
31 | .coverage.*
32 | .cache
33 | nosetests.xml
34 | coverage.xml
35 | *.cover
36 | .hypothesis/
37 | *.mo
38 | *.pot
39 | *.log
40 | local_settings.py
41 | instance/
42 | .webassets-cache
43 | .scrapy
44 | docs/_build/
45 | target/
46 | .ipynb_checkpoints
47 | .python-version
48 | celerybeat-schedule
49 | *.sage.py
50 | .env
51 | .venv
52 | env/
53 | venv/
54 | ENV/
55 | env.bak/
56 | venv.bak/
57 | .spyderproject
58 | .spyproject
59 | .ropeproject
60 | /site
61 | .mypy_cache/
62 | index.bin
63 | data
64 | 


--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.python.org/simple"
 3 | name = "pypi"
 4 | verify_ssl = true
 5 | 
 6 | [dev-packages]
 7 | 
 8 | [packages]
 9 | nltk = "*"
10 | halo = "*"
11 | 
12 | 
13 | [requires]
14 | python_version = "3.5"


--------------------------------------------------------------------------------
/Pipfile.lock:
--------------------------------------------------------------------------------
 1 | {
 2 |     "_meta": {
 3 |         "hash": {
 4 |             "sha256": "3e336dcded8f717d91613a8638e557e22f73a50d56d97af723a37ed2913ef231"
 5 |         },
 6 |         "host-environment-markers": {
 7 |             "implementation_name": "cpython",
 8 |             "implementation_version": "3.5.2",
 9 |             "os_name": "posix",
10 |             "platform_machine": "x86_64",
11 |             "platform_python_implementation": "CPython",
12 |             "platform_release": "4.10.0-35-generic",
13 |             "platform_system": "Linux",
14 |             "platform_version": "#39~16.04.1-Ubuntu SMP Wed Sep 13 09:02:42 UTC 2017",
15 |             "python_full_version": "3.5.2",
16 |             "python_version": "3.5",
17 |             "sys_platform": "linux"
18 |         },
19 |         "pipfile-spec": 6,
20 |         "requires": {
21 |             "python_version": "3.5"
22 |         },
23 |         "sources": [
24 |             {
25 |                 "name": "pypi",
26 |                 "url": "https://pypi.python.org/simple",
27 |                 "verify_ssl": true
28 |             }
29 |         ]
30 |     },
31 |     "default": {
32 |         "colorama": {
33 |             "hashes": [
34 |                 "sha256:463f8483208e921368c9f306094eb6f725c6ca42b0f97e313cb5d5512459feda",
35 |                 "sha256:48eb22f4f8461b1df5734a074b57042430fb06e1d61bd1e11b078c0fe6d7a1f1"
36 |             ],
37 |             "version": "==0.3.9"
38 |         },
39 |         "cursor": {
40 |             "hashes": [
41 |                 "sha256:61041d4362ce3a486f3bb2f412b9f6e492c90e0abfa54d0f69ac2e08984b6e6d"
42 |             ],
43 |             "version": "==1.1.0"
44 |         },
45 |         "enum34": {
46 |             "hashes": [
47 |                 "sha256:6bd0f6ad48ec2aa117d3d141940d484deccda84d4fcd884f5c3d93c23ecd8c79",
48 |                 "sha256:644837f692e5f550741432dd3f223bbb9852018674981b1664e5dc339387588a",
49 |                 "sha256:8ad8c4783bf61ded74527bffb48ed9b54166685e4230386a9ed9b1279e2df5b1",
50 |                 "sha256:2d81cbbe0e73112bdfe6ef8576f2238f2ba27dd0d55752a776c41d38b7da2850"
51 |             ],
52 |             "version": "==1.1.6"
53 |         },
54 |         "halo": {
55 |             "hashes": [
56 |                 "sha256:d0b5fb361c17ad31e5eea8220f9460997b80e2b05b5d98b1510f5506bb465d86"
57 |             ],
58 |             "version": "==0.0.6"
59 |         },
60 |         "log-symbols": {
61 |             "hashes": [
62 |                 "sha256:87be2f283cd6f455d89b76abcf2805fad5692ec9dcd8a31d10a4c975f51392eb"
63 |             ],
64 |             "version": "==0.0.11"
65 |         },
66 |         "nltk": {
67 |             "hashes": [
68 |                 "sha256:2661f9971d983db314bbebd51ba770811a362c6597fd0f303bb1d3beadcb4834",
69 |                 "sha256:8a3bad9ff7f67d2828a7915c2fab93e6ca910c2015ef40799e1805fccf2354e5"
70 |             ],
71 |             "version": "==3.2.5"
72 |         },
73 |         "six": {
74 |             "hashes": [
75 |                 "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb",
76 |                 "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9"
77 |             ],
78 |             "version": "==1.11.0"
79 |         },
80 |         "spinners": {
81 |             "hashes": [
82 |                 "sha256:f38891d1e21bf188e1adbe832fbe7e8e365ae255ae71dd230b25fc83ea6c98e2"
83 |             ],
84 |             "version": "==0.0.19"
85 |         },
86 |         "termcolor": {
87 |             "hashes": [
88 |                 "sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b"
89 |             ],
90 |             "version": "==1.1.0"
91 |         }
92 |     },
93 |     "develop": {}
94 | }
95 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Index Search Query
  2 | 
  3 | Inverted Index, Query Formulation and Ranking from Scratch in Python.
  4 | 
  5 | > Part of Information Retrieval Lab (Autumn 2017-18)
  6 | 
  7 | ## Part 1: The Inverted Index
  8 | 
  9 | ### Dataset
 10 | 
 11 | The dataset used for this purpose is taken from the `FIRE 2011` corpus. It can be downloaded from [here](http://www.isical.ac.in/~fire/data/docs/adhoc/en.docs.2011.tar.gpg). It contains articles from two different magazines. The methods for handling these files are present in the [`magazine_index`](magazine_index) package.
 12 | 
 13 | ### Usage
 14 | 
 15 | If you wish to index all the files recursively from a directory, use the following command -
 16 | 
 17 | ```bash
 18 | $ python lab_1.py path/to/files
 19 | ```
 20 | 
 21 | This will create an inverted index and save it to a file called `index.bin`. If that file already exists, you can load it directly by running the script without any argument -
 22 | 
 23 | ```bash
 24 | $ python lab_1.py
 25 | Loading index from "index.bin"
 26 | <Index documents=392577 words=105314026>
 27 | ...
 28 | ```
 29 | 
 30 | ### Using a pre-built index
 31 | 
 32 | Since indexing documents can take a lot of time, here are some already indexed files which can be renamed to `index.bin` and used directly -
 33 | 
 34 | | Name | Link | Size | Comments |
 35 | |------|------|------|----------|
 36 | | `index.bin` | [LINK](https://drive.google.com/open?id=0BxDMRh_L_8pOUzlZQ0JJMUtYd1E) | 478 MB | Full index, 392k documents |
 37 | | `index.bin.bak1` | [LINK](https://drive.google.com/file/d/0BxDMRh_L_8pObWU0ZkE1NHBTUUU/view?usp=sharing) | 374 MB | 303k documents |
 38 | | `index.bin.bak` | [LINK](https://drive.google.com/file/d/0BxDMRh_L_8pOYmRKU0I5MWJhbG8/view?usp=sharing) | 36 MB | 25.8k documents |
 39 | 
 40 | ### Example
 41 | 
 42 | ```bash
 43 | $ python lab_1.py
 44 | Loading index from "index.bin"
 45 | <Index documents=303290 words=83225120>
 46 | Please start entering words to get top 5 documents containing them (CTRL+C to exit) -
 47 | Enter word: market
 48 | [('1100110_calcutta_story_11965855.utf8', 58), ('1070603_calcutta_story_7858507.utf8', 31), ('1100326_opinion_story_12251777.utf8', 30), ('1050912_frontpage_story_5227346.utf8', 30), ('1040406_opinion_story_2948544.utf8', 29)]
 49 | Enter word: delhi
 50 | [('1080422_sports_ipl.utf8', 30), ('1031223_opinion_story_2710457.utf8', 28), ('1090225_sports_story_10587273.utf8', 22), ('1090812_sports_story_11351508.utf8', 21), ('1100223_sports_story_12140507.utf8', 21)]
 51 | Enter word: messi
 52 | [('1100612_sports_story_12557276.utf8', 27), ('1100527_sports_story_12492679.utf8', 17), ('1100619_sports_story_12582889.utf8', 17), ('1090529_calcutta_story_11031479.utf8', 16), ('1100613_frontpage_story_12560387.utf8', 12)]
 53 | ```
 54 | 
 55 | 
 56 | 
 57 | ## Part 2: Ranking of Documents
 58 | 
 59 | ### Usage
 60 | 
 61 | You can use the pre-built index here.
 62 | 
 63 | ```bash
 64 | $ python lab_2.py index.bin
 65 | Loading index from index.bin
 66 | Enter query: programming
 67 | en.15.66.21.2008.5.9 : 97.3490637742472
 68 | en.3.347.409.2010.2.2 : 79.64923399711134
 69 | en.3.373.142.2007.6.8 : 53.09948933140756
 70 | en.3.406.410.2007.11.18 : 48.6745318871236
 71 | en.2.296.350.2010.1.20 : 48.6745318871236
 72 | en.3.393.372.2007.9.23 : 44.24957444283963
 73 | en.3.321.344.2009.7.31 : 44.24957444283963
 74 | en.3.373.299.2007.6.11 : 44.24957444283963
 75 | en.15.109.486.2009.4.1 : 44.24957444283963
 76 | en.3.393.75.2007.9.24 : 44.24957444283963
 77 | ```
 78 | 
 79 | 
 80 | ---
 81 | 
 82 | ## Development
 83 | 
 84 | `pipenv` is used for this project - 
 85 | 
 86 | ```bash
 87 | $ sudo -H pip install pipenv
 88 | ```
 89 | 
 90 | To install dependencies, simply
 91 | 
 92 | ```bash
 93 | $ pipenv install
 94 | ```
 95 | 
 96 | To enter a virtualenv shell
 97 | 
 98 | ```bash
 99 | $ pipenv shell
100 | ```
101 | 
102 | This will spawn a new shell where all dependencies will be present.
103 | 


--------------------------------------------------------------------------------
/data/.gitinclude:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/singhpratyush/index-search-query/75d608f40ef98294475cced9a3003ab8fe9fd777/data/.gitinclude


--------------------------------------------------------------------------------
/example_1.py:
--------------------------------------------------------------------------------
1 | import index
2 | 
# Minimal walkthrough: index two short documents, then list the documents
# that contain the word "world" together with their occurrence counts.
documents = (
    ('1', 'hello, my world! going to work world! :('),
    ('2', 'bye bye world, going to sleep :)'),
)

my_index = index.Index()
for doc_id, text in documents:
    my_index.index(doc_id, text)

print(my_index.get_docs_for_token('world'))
8 | 


--------------------------------------------------------------------------------
/index/__init__.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import operator
  3 | import pickle
  4 | import queue
  5 | import threading
  6 | 
  7 | from nltk.corpus import stopwords as nltk_stopwords
  8 | from nltk.stem import PorterStemmer
  9 | from nltk.tokenize import RegexpTokenizer
 10 | 
 11 | 
 12 | class Index:
 13 | 
 14 |     tokenizer = RegexpTokenizer(r'\w+')
 15 |     stop_words = nltk_stopwords.words()
 16 |     stemmer = PorterStemmer()
 17 | 
 18 |     def __init__(self, verbose=False):
 19 |         self._doc_set = set()
 20 |         self._total_words = 0
 21 |         self._inverted_index = {}
 22 |         self._lock = threading.Lock()
 23 |         self._bulk_index_queue = queue.Queue()
 24 |         self._verbose = verbose
 25 | 
 26 |     def idf(self, token):
 27 |         if token not in self._inverted_index:
 28 |             return 0
 29 |         count = len(self._inverted_index[token]['frequency'])
 30 |         return math.log(self.doc_count() / count)
 31 | 
 32 |     def print(self, content):
 33 |         if self._verbose:
 34 |             print(content)
 35 | 
 36 |     def __str__(self):
 37 |         return '<Index documents=%s words=%s>' % (self.doc_count(), self.word_count())
 38 | 
 39 |     @staticmethod
 40 |     def clean(content):
 41 |         tokens = Index.tokenizer.tokenize(content)
 42 |         tokens = [Index.stemmer.stem(i) for i in tokens if i not in Index.stop_words]
 43 |         return tokens
 44 | 
 45 |     def repopulate_counts(self):
 46 |         self._total_words = sum([i[1]['count'] for i in self._inverted_index.items()])
 47 | 
 48 |     def word_count(self):
 49 |         return self._total_words
 50 | 
 51 |     def index(self, document_id, content, repopulate=True):
 52 |         tokens = Index.clean(content)
 53 |         token_set = set(tokens)
 54 |         for token in token_set:
 55 |             t_c = tokens.count(token)
 56 |             self._update_inverted_index(token, document_id, t_c)
 57 |         if repopulate:
 58 |             self.repopulate_counts()
 59 |         self._doc_set.add(document_id)
 60 |         return self.word_count()
 61 | 
 62 |     def _update_inverted_index(self, token, document, count):
 63 |         if token in self._inverted_index:
 64 |             with self._lock:
 65 |                 self._inverted_index[token]['frequency'][document] = count
 66 |                 self._inverted_index[token]['count'] += count
 67 |         else:
 68 |             with self._lock:
 69 |                 self._inverted_index[token] = {
 70 |                     'count': count,
 71 |                     'frequency': {document: count}
 72 |                 }
 73 | 
 74 |     def doc_count(self):
 75 |         return len(self._doc_set)
 76 | 
 77 |     def save(self, filename):
 78 |         with open(filename, 'wb') as f:
 79 |             pickle.dump(self._inverted_index, f)
 80 | 
 81 |     def load(self, filename):
 82 |         with open(filename, 'rb') as f:
 83 |             self._inverted_index = pickle.load(f)
 84 |         return self._populate_documents()
 85 | 
 86 |     def _populate_documents(self):
 87 |         self._doc_set = set()
 88 |         for token in self._inverted_index:
 89 |             frequencies = self._inverted_index[token]['frequency']
 90 |             for doc in frequencies:
 91 |                 self._doc_set.add(doc)
 92 |         self.repopulate_counts()
 93 |         return self.doc_count()
 94 | 
 95 |     @staticmethod
 96 |     def from_file(filename):
 97 |         index = Index()
 98 |         index.load(filename)
 99 |         return index
100 | 
101 |     def get_docs_for_token(self, token, count=None):
102 |         token = Index.clean(token)
103 |         if len(token) == 0:
104 |             return []
105 |         token = token[0]
106 |         docs = self._inverted_index.get(token, {'frequency': {}})['frequency']
107 |         sorted_docs = sorted(docs.items(), key=operator.itemgetter(1), reverse=True)
108 |         doc_list = list(sorted_docs)
109 |         return doc_list if count is None else doc_list[:count]
110 | 
111 |     def bulk_index(self, doc_list, threads=8):
112 |         for doc_item in doc_list:
113 |             self.print('Added doc %s to queue' % doc_item[0])
114 |             self._bulk_index_queue.put(doc_item)
115 |         thread_list = []
116 |         for i in range(threads):
117 |             th = threading.Thread(target=self._bulk_index_worker)
118 |             th.start()
119 |             thread_list.append(th)
120 |         for th in thread_list:
121 |             th.join()
122 |         self.repopulate_counts()
123 |         return self.doc_count()
124 | 
125 |     def _bulk_index_worker(self):
126 |         while True:
127 |             try:
128 |                 doc_id, content = self._bulk_index_queue.get(timeout=0.1)
129 |             except:
130 |                 return
131 |             self.index(doc_id, content, repopulate=False)
132 |             self.print('Remaining - %s. Indexed %s' % (self._bulk_index_queue.qsize(), doc_id))
133 | 


--------------------------------------------------------------------------------
/lab_1.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import index as ir_index
 3 | import magazine_index
 4 | 
 5 | 
 6 | def main():
 7 |     if len(sys.argv) == 2:
 8 |         path = sys.argv[1]
 9 |         print('Creating index from data at %s' % path)
10 |         index = magazine_index.create_dictionary_index(path)
11 |         print('Saving index to "index.bin"')
12 |         index.save('index.bin')
13 |     print('Loading index from "index.bin"')
14 |     index = ir_index.Index.from_file('index.bin')
15 |     print(index)
16 |     print('Please start entering words to get top 5 documents containing them (CTRL+C to exit) - ')
17 |     while True:
18 |         try:
19 |             token = input('Enter word: ')
20 |             print(index.get_docs_for_token(token, count=5))
21 |         except KeyboardInterrupt:
22 |             break
23 | 
24 | 
25 | if __name__ == '__main__':
26 |     main()
27 | 


--------------------------------------------------------------------------------
/lab_2.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | from index import Index
 4 | from ranking.tf_idf_ranker import TfIdfRanker
 5 | 
 6 | 
 7 | def main():
 8 |     file = sys.argv[1]
 9 |     print('Loading index from %s' % file)
10 |     index = Index.from_file(file)
11 |     while True:
12 |         try:
13 |             query = input('Enter query: ')
14 |             documents = TfIdfRanker.search(index, query)
15 |             for doc, score in documents:
16 |                 print('%s : %s' % (doc, score))
17 |         except KeyboardInterrupt:
18 |             break
19 | 
20 | 
21 | if __name__ == '__main__':
22 |     main()
23 | 


--------------------------------------------------------------------------------
/magazine_index/__init__.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | from index import Index
 4 | 
 5 | re_text = re.compile(r'<TEXT>(.*?)</TEXT>', re.DOTALL)
 6 | 
 7 | 
 8 | def render(path):
 9 |     with open(path) as f:
10 |         return path.split('/')[-1], re_text.findall(f.read())[0]
11 | 
12 | 
def lazy_load_docs(path):
    """Walk ``path`` and lazily yield every document found beneath it.

    Directories are expanded with an explicit stack. A non-directory entry
    counts as a document when its path ends with ``.utf8`` or its file name
    starts with ``en.``; every other entry is skipped.

    :param path: file or directory to start from.
    :return: generator of whatever ``render`` returns for each document.
    """
    pending = [path]
    while pending:
        top = pending.pop()
        if os.path.isdir(top):
            pending.extend(os.path.join(top, name) for name in os.listdir(top))
        # basename instead of splitting on '/' so the prefix check also
        # works with the platform's own path separator.
        elif top.endswith('.utf8') or os.path.basename(top).startswith('en.'):
            yield render(top)
23 | 
24 | 
def create_dictionary_index(path):
    """Build a verbose ``Index`` from every document found under ``path``.

    :param path: file or directory handed to ``lazy_load_docs``.
    :return: the populated ``Index``.
    """
    documents = lazy_load_docs(path)
    built = Index(verbose=True)
    built.bulk_index(documents, threads=16)
    return built
29 | 


--------------------------------------------------------------------------------
/ranking/__init__.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | 
 3 | from index import Index
 4 | 
 5 | 
 6 | class Ranker(abc.ABC):
 7 | 
 8 |     def __init__(self):
 9 |         pass
10 | 
11 |     @staticmethod
12 |     @abc.abstractclassmethod
13 |     def get_top_docs(index, tokens):
14 |         pass
15 | 
16 |     @classmethod
17 |     def search(cls, index, query, count=10):
18 |         assert isinstance(index, Index)
19 |         tokens = Index.clean(query)
20 |         top_docs = list(cls.get_top_docs(index, tokens))
21 |         return top_docs[:count]
22 | 


--------------------------------------------------------------------------------
/ranking/tf_idf_ranker.py:
--------------------------------------------------------------------------------
 1 | import operator
 2 | 
 3 | from ranking import Ranker
 4 | 
 5 | 
 6 | class TfIdfRanker(Ranker):
 7 | 
 8 |     @staticmethod
 9 |     def get_top_docs(index, tokens):
10 |         documents = {}
11 |         for token in tokens:
12 |             relevant_docs = index.get_docs_for_token(token)
13 |             for doc_id, freq in relevant_docs:
14 |                 if doc_id not in documents:
15 |                     documents[doc_id] = 0.
16 |                 documents[doc_id] += freq * index.idf(token)
17 |         return sorted(documents.items(), key=operator.itemgetter(1), reverse=True)
18 | 


--------------------------------------------------------------------------------