├── .gitignore
├── conda.txt
├── preprocess.py
├── vector_space_model.py
├── newsgroup.py
├── README.md
└── inverted_index.py

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
--------------------------------------------------------------------------------
/conda.txt:
--------------------------------------------------------------------------------
scikit-learn
nltk
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
import nltk
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()


def stem_tokens(tokens):
    '''
    Stem tokens using the Porter stemmer.

    :param tokens: list of tokens to stem
    :return: list of stemmed tokens
    '''
    return [stemmer.stem(token) for token in tokens]


def tokenize(text):
    '''
    Tokenize and stem the text received as input.

    :param text: text to tokenize
    :return: list of tokenized and stemmed tokens
    '''
    tokens = nltk.word_tokenize(text)
    return stem_tokens(tokens)
--------------------------------------------------------------------------------
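A quick sanity check of the pre-processing step (a minimal sketch, not part of
the repository; it assumes the NLTK tokenizer models have been downloaded):

```python
import nltk
nltk.download('punkt')  # one-time download of the NLTK tokenizer models

from preprocess import tokenize

# the Porter stemmer conflates inflected forms, e.g. 'Sciences' and
# 'science' both map to the stem 'scienc'
print(tokenize('Sciences and religion'))  # roughly ['scienc', 'and', 'religion']
```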
/vector_space_model.py:
--------------------------------------------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
from preprocess import tokenize
import numpy


def create_vectors(newsgroups):
    '''
    Create tf-idf vectors for the 20 newsgroups dataset.

    :param newsgroups: the 20 newsgroups corpus
    :return: the vectorizer object and the tf-idf vectors
    '''
    # the tf-idf vectorizer, using a custom tokenizer and a built-in list of
    # English stop-words
    vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english')

    # fit the vectorizer on the 20 newsgroups corpus
    tfidf_vectors = vectorizer.fit_transform(newsgroups.data)

    # return the vectorizer object and the vectors obtained by fitting the
    # corpus
    return vectorizer, tfidf_vectors


def rank_documents(query, k, vectorizer, tfidf_vectors):
    '''
    Rank documents for a query, using the tf-idf vectors.

    :param query: the query string
    :param k: the number of documents to return
    :param vectorizer: the vectorizer object
    :param tfidf_vectors: the tf-idf vectors fitted on the 20 newsgroups corpus
    :return: the top k ranked documents for the query
    '''
    assert k < tfidf_vectors.shape[0], 'k should be smaller than the number ' \
                                       'of documents'

    # transform the query into the same vector space as the documents
    result = vectorizer.transform([query])

    # if the query consists of multiple terms, aggregate their scores in this
    # accumulator (one score per document)
    accumulator = numpy.zeros(tfidf_vectors.shape[0])

    for col in result.nonzero()[1]:
        # the column corresponding to a query term, for now a sparse
        # representation
        column = tfidf_vectors[:, col]

        # convert the sparse representation to a dense matrix
        dense_column = column.todense()

        # reshape to a flat numpy array
        reshaped_dense_column = numpy.asarray(dense_column).reshape(-1)

        # add the per-document scores for this term to the accumulator
        accumulator += reshaped_dense_column

    # sort the documents by accumulated score (ascending)
    sorted_column_index = numpy.argsort(accumulator)

    # create a slice with only the top-k results
    result = sorted_column_index[-k:].tolist()

    # reverse so the best-scoring document comes first
    return result[::-1]
--------------------------------------------------------------------------------
/newsgroup.py:
--------------------------------------------------------------------------------
import inverted_index as ii
import vector_space_model as vsm

from sklearn.datasets import fetch_20newsgroups


def query_iindex(query, k, iindex, newsgroups):
    '''
    Helper function to query the inverted index.

    :param query: the query string
    :param k: number of results to display
    :param iindex: the inverted index
    :param newsgroups: the corpus
    :return: -
    '''
    result = ii.boolean_search(query, iindex)

    print('The result of the boolean query: {0}'.format(query))

    if result:
        print('Obtained {0} results'.format(len(result)))

        print('A subset of {0} results'.format(k))
        for r in result[:k]:
            print(newsgroups.filenames[r])
    else:
        print('No result for this query: {0}'.format(query))


def query_tfidf_vectors(query, k, vectorizer, tfidf_vectors, newsgroups):
    '''
    Helper function to query using the tf-idf vectors.

    :param query: the query string
    :param k: number of top ranked results
    :param vectorizer: the vectorizer object
    :param tfidf_vectors: the tf-idf weighted vectors
    :param newsgroups: the corpus
    :return: -
    '''
    result = vsm.rank_documents(query, k, vectorizer, tfidf_vectors)

    if result:
        print('Top {0} documents as result of the query: {1}'.format(k, query))

        for r in result:
            print(newsgroups.filenames[r])
    else:
        print('No result for this query: {0}'.format(query))


if __name__ == '__main__':

    # fetch the 20 newsgroups dataset
    newsgroups = fetch_20newsgroups(subset='all')

    # obtain the documents and the words in the 20 newsgroups corpus
    docs, words = ii.read_newsgroups(newsgroups.data)

    print('Statistics about the corpus')
    print('Number of documents {0} and number of words {1}'.format(len(docs),
                                                                   len(words)))

    print('Creating the inverted index...')
    iindex = ii.inverted_index(docs, words)

    # perform boolean queries
    query_iindex('science', 5, iindex, newsgroups)
    query_iindex('science and religion', 5, iindex, newsgroups)
    query_iindex('science or religion', 5, iindex, newsgroups)

    print('Creating the tfidf vectors...')
    vectorizer, tfidf_vectors = vsm.create_vectors(newsgroups)

    # perform a ranked query using the tf-idf vectors
    query_tfidf_vectors('science religion', 5, vectorizer, tfidf_vectors,
                        newsgroups)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# newsgroup-search

A simple search engine for the 20 newsgroups text dataset.

Support for boolean queries: AND, OR
------------------------------------

An implementation of the boolean retrieval model based on an inverted index.
The words in the documents of the 20 newsgroups corpus are the terms of the
inverted index. Each term is assigned a list of document ids, identifying the
documents in which the word appears. The inverted index is stored in memory
(both the terms and the list of document ids corresponding to each term).
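For illustration, the in-memory structure is just a dictionary mapping each
(lowercased) word to the ids of the documents that contain it; the document
ids below are made up:

```python
# a toy inverted index with hypothetical document ids
iindex = {
    'science': [3, 17, 42],
    'religion': [17, 90],
}

# an AND query is then a set intersection over the postings lists
print(set(iindex['science']) & set(iindex['religion']))  # {17}
```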
Another option (not explored here) would have been to use the newsgroup
assigned to each document as the terms of the inverted index, instead of the
words in the collection - it is unclear whether the search should be performed
over the newsgroups or over the words in the corpus.

To support boolean queries, each query is converted to a syntax tree (using
Python's ast package). The tree is traversed recursively to obtain the query
result, as sketched below.
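A query such as `science and religion` happens to be a valid Python
expression, so it parses to a `BoolOp` node whose operands are the query
terms:

```python
import ast

tree = ast.parse('science and religion', mode='eval').body

print(type(tree).__name__)                # BoolOp
print(type(tree.op).__name__)             # And
print([name.id for name in tree.values])  # ['science', 'religion']
```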
A tf-idf-based ranker for simple queries
----------------------------------------

The 20 newsgroups corpus is converted to a sparse tf-idf matrix where the rows
are the documents and the columns are the terms present in the corpus (using
scikit-learn's TfidfVectorizer).

A given query is converted to a sparse vector based on the tf-idf matrix
obtained previously. This vector is zero everywhere except at the terms that
also occur in the matrix. For each of these non-zero terms, the per-document
tf-idf weights are retrieved and summed, and the documents are ordered by the
accumulated weight. The top k documents are returned as the result of the
query.

Improvements
------------

* enhance the pre-processing steps: tokenization, normalization, handling of
punctuation

* for the inverted index, store the lists of document ids on disk (not
necessary in this example, as the size of the collection permits in-memory
storage)

* use a state-of-the-art search engine such as Elasticsearch (based on Lucene)

* account for the meaning of words in context: querying for 'kiwi bird'
should not yield results that are relevant to the fruit. This can be achieved
by replacing the current bag-of-words model with a model such as word2vec or
similar neural word representations, which can capture semantic and syntactic
word relationships

Usage
-----

`python newsgroup.py`

To create the conda environment

`conda create -p env --file conda.txt`

`source activate env/`
--------------------------------------------------------------------------------
/inverted_index.py:
--------------------------------------------------------------------------------
import ast


def read_newsgroups(newsgroups_data):
    '''
    Read the newsgroups corpus and return the documents it contains and the
    set of all words.

    :param newsgroups_data: the newsgroups corpus
    :return: the documents in the corpus and all words
    '''
    docs, words = {}, set()

    # use only lowercasing as a pre-processing step; this could be further
    # extended with tokenization, stemming and stop-word removal, e.g.
    # mirroring the pre-processing done for the TfidfVectorizer
    for idx, item in enumerate(newsgroups_data):
        txt = [t.lower() for t in item.split()]
        words |= set(txt)
        docs[idx] = txt
    return docs, words


def inverted_index(docs, words):
    '''
    Generate the inverted index from the documents and the words.

    :param docs: the documents in the collection, as a dictionary mapping
                 document ids to lists of words
    :param words: all words in the collection
    :return: the inverted index
    '''
    # initialize the inverted index for faster index generation
    iindex = {word: [] for word in words}

    for did, txt in docs.items():
        for word in txt:
            # a word may occur several times in the same document; documents
            # are processed one at a time, so duplicates of the same document
            # id would be consecutive, and checking the last entry is enough
            # to keep the postings list free of duplicates
            if not iindex[word] or iindex[word][-1] != did:
                iindex[word].append(did)

    return iindex


def traverse_syntax_tree(tree, iindex):
    '''
    Traverse the syntax tree representing a boolean query. By traversing the
    tree we evaluate the query.

    :param tree: the syntax tree
    :param iindex: the inverted index
    :return: set of document ids matching the query
    '''
    assert isinstance(tree, ast.BoolOp), 'Only boolean operators are allowed'

    # a chained query such as 'a and b and c' parses to a single BoolOp with
    # more than two operands, so collect the result of every operand
    doc_ids = []
    for value in tree.values:
        if isinstance(value, ast.Name):  # recursion ends
            doc_ids.append(set(iindex.get(value.id) or []))
        else:  # recursive call
            doc_ids.append(traverse_syntax_tree(value, iindex))

    # if it is an OR, perform set union
    if isinstance(tree.op, ast.Or):
        return set.union(*doc_ids)
    # if it is an AND, perform set intersection
    elif isinstance(tree.op, ast.And):
        return set.intersection(*doc_ids)
    else:
        raise ValueError('Not a supported boolean operator')


def boolean_search(query, iindex):
    '''
    Perform boolean search using arbitrary AND and OR operators.

    :param query: the query to perform
    :param iindex: the inverted index
    :return: a list of document ids matching the query
    '''
    if not query:
        print('No query received as input')
        return

    # convert the query to lowercase
    query = query.lower()

    # special case: the query contains a single term
    query_elems = query.split()
    if len(query_elems) <= 1:
        return list(iindex.get(query, []))

    # parse the query into a syntax tree
    tree = ast.parse(query, mode='eval')

    # traverse the syntax tree to obtain the result
    return list(traverse_syntax_tree(tree.body, iindex))
--------------------------------------------------------------------------------
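A minimal, self-contained sketch of how inverted_index.py can be exercised on
a toy corpus (the three 'documents' below are made up, standing in for the
real dataset loaded in newsgroup.py):

```python
import inverted_index as ii

# three tiny documents standing in for the 20 newsgroups corpus
corpus = ['Science and religion', 'religion only', 'science only']

docs, words = ii.read_newsgroups(corpus)
iindex = ii.inverted_index(docs, words)

print(ii.boolean_search('science', iindex))               # [0, 2]
print(ii.boolean_search('science and religion', iindex))  # [0]
print(ii.boolean_search('science or religion', iindex))   # 0, 1 and 2, in no particular order
```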