├── .gitignore
├── conda.txt
├── preprocess.py
├── vector_space_model.py
├── newsgroup.py
├── README.md
└── inverted_index.py

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
--------------------------------------------------------------------------------
/conda.txt:
--------------------------------------------------------------------------------
scikit-learn
nltk
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
import nltk
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()


def stem_tokens(tokens):
    '''
    Stem tokens using the Porter stemmer.

    :param tokens: list of tokens to stem
    :return: list of stemmed tokens
    '''
    return [stemmer.stem(token) for token in tokens]


def tokenize(text):
    '''
    Tokenize and stem the text received as input.

    :param text: text to tokenize
    :return: list of tokenized and stemmed tokens
    '''
    tokens = nltk.word_tokenize(text)
    return stem_tokens(tokens)
--------------------------------------------------------------------------------
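A quick sanity check of the pre-processing step (a minimal sketch, not part of
the repository; it assumes the NLTK tokenizer models have been downloaded):

```python
import nltk
nltk.download('punkt')  # one-time download of the NLTK tokenizer models

from preprocess import tokenize

# the Porter stemmer conflates inflected forms, e.g. 'Sciences' and
# 'science' both map to the stem 'scienc'
print(tokenize('Sciences and religion'))  # roughly ['scienc', 'and', 'religion']
```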
/vector_space_model.py:
--------------------------------------------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
from preprocess import tokenize
import numpy


def create_vectors(newsgroups):
    '''
    Create tf-idf vectors for the 20 newsgroups dataset.

    :param newsgroups: the 20 newsgroups corpus
    :return: the vectorizer object and the tf-idf vectors
    '''
    # the tf-idf vectorizer, using a custom tokenizer and a built-in list of
    # English stop-words
    vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english')

    # fit the vectorizer on the 20 newsgroups corpus
    tfidf_vectors = vectorizer.fit_transform(newsgroups.data)

    # return the vectorizer object and the vectors obtained by fitting the
    # corpus
    return vectorizer, tfidf_vectors


def rank_documents(query, k, vectorizer, tfidf_vectors):
    '''
    Rank documents for a query, using the tf-idf vectors.

    :param query: the query string
    :param k: the number of documents to return
    :param vectorizer: the vectorizer object
    :param tfidf_vectors: the tf-idf vectors fitted on the 20 newsgroups corpus
    :return: the top k ranked documents for the query
    '''
    assert k < tfidf_vectors.shape[0], 'k should be smaller than the number ' \
                                       'of documents'

    # transform the query into the same vector space as the documents
    result = vectorizer.transform([query])

    # if the query consists of multiple terms, aggregate their scores in this
    # accumulator (one score per document)
    accumulator = numpy.zeros(tfidf_vectors.shape[0])

    for col in result.nonzero()[1]:
        # the column corresponding to a query term, for now a sparse
        # representation
        column = tfidf_vectors[:, col]

        # convert the sparse representation to a dense matrix
        dense_column = column.todense()

        # reshape to a flat numpy array
        reshaped_dense_column = numpy.asarray(dense_column).reshape(-1)

        # add the per-document scores for this term to the accumulator
        accumulator += reshaped_dense_column

    # sort the documents by accumulated score (ascending)
    sorted_column_index = numpy.argsort(accumulator)

    # create a slice with only the top-k results
    result = sorted_column_index[-k:].tolist()

    # reverse so the best-scoring document comes first
    return result[::-1]
--------------------------------------------------------------------------------
/newsgroup.py:
--------------------------------------------------------------------------------
import inverted_index as ii
import vector_space_model as vsm

from sklearn.datasets import fetch_20newsgroups


def query_iindex(query, k, iindex, newsgroups):
    '''
    Helper function to query the inverted index.

    :param query: the query string
    :param k: number of results to display
    :param iindex: the inverted index
    :param newsgroups: the corpus
    :return: -
    '''
    result = ii.boolean_search(query, iindex)

    print('The result of the boolean query: {0}'.format(query))

    if result:
        print('Obtained {0} results'.format(len(result)))

        print('A subset of {0} results'.format(k))
        for r in result[:k]:
            print(newsgroups.filenames[r])
    else:
        print('No result for this query: {0}'.format(query))


def query_tfidf_vectors(query, k, vectorizer, tfidf_vectors, newsgroups):
    '''
    Helper function to query using the tf-idf vectors.

    :param query: the query string
    :param k: number of top ranked results
    :param vectorizer: the vectorizer object
    :param tfidf_vectors: the tf-idf weighted vectors
    :param newsgroups: the corpus
    :return: -
    '''
    result = vsm.rank_documents(query, k, vectorizer, tfidf_vectors)

    if result:
        print('Top {0} documents as result of the query: {1}'.format(k, query))

        for r in result:
            print(newsgroups.filenames[r])
    else:
        print('No result for this query: {0}'.format(query))


if __name__ == '__main__':

    # fetch the 20 newsgroups dataset
    newsgroups = fetch_20newsgroups(subset='all')

    # obtain the documents and the words in the 20 newsgroups corpus
    docs, words = ii.read_newsgroups(newsgroups.data)

    print('Statistics about the corpus')
    print('Number of documents {0} and number of words {1}'.format(len(docs),
                                                                   len(words)))

    print('Creating the inverted index...')
    iindex = ii.inverted_index(docs, words)

    # perform boolean queries
    query_iindex('science', 5, iindex, newsgroups)
    query_iindex('science and religion', 5, iindex, newsgroups)
    query_iindex('science or religion', 5, iindex, newsgroups)

    print('Creating the tfidf vectors...')
    vectorizer, tfidf_vectors = vsm.create_vectors(newsgroups)

    # perform a ranked query using the tf-idf vectors
    query_tfidf_vectors('science religion', 5, vectorizer, tfidf_vectors,
                        newsgroups)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# newsgroup-search

A simple search engine for the 20 newsgroups text dataset.

Support for boolean queries: AND, OR
------------------------------------

An implementation of the boolean retrieval model based on an inverted index.
The words in the documents of the 20 newsgroups corpus are the terms of the
inverted index. Each term is assigned a list of document ids, identifying the
documents in which the word appears. The inverted index is stored in memory
(both the terms and the list of document ids corresponding to each term).
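For illustration, the in-memory structure is just a dictionary mapping each
(lowercased) word to the ids of the documents that contain it; the document
ids below are made up:

```python
# a toy inverted index with hypothetical document ids
iindex = {
    'science': [3, 17, 42],
    'religion': [17, 90],
}

# an AND query is then a set intersection over the postings lists
print(set(iindex['science']) & set(iindex['religion']))  # {17}
```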
Another option (not explored here) would have been to use the newsgroup
assigned to each document as the terms of the inverted index, instead of the
words in the collection - it is unclear whether the search should be performed
over the newsgroups or over the words in the corpus.

To support boolean queries, each query is converted to a syntax tree (using
Python's ast package). The tree is traversed recursively to obtain the query
result, as sketched below.
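A query such as `science and religion` happens to be a valid Python
expression, so it parses to a `BoolOp` node whose operands are the query
terms:

```python
import ast

tree = ast.parse('science and religion', mode='eval').body

print(type(tree).__name__)                # BoolOp
print(type(tree.op).__name__)             # And
print([name.id for name in tree.values])  # ['science', 'religion']
```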
A tf-idf-based ranker for simple queries
----------------------------------------

The 20 newsgroups corpus is converted to a sparse tf-idf matrix where the rows
are the documents and the columns are the terms present in the corpus (using
scikit-learn's TfidfVectorizer).

A given query is converted to a sparse vector based on the tf-idf matrix
obtained previously. This vector is zero everywhere except at the terms that
also occur in the matrix. For each of these non-zero terms, the per-document
tf-idf weights are retrieved and summed, and the documents are ordered by the
accumulated weight. The top k documents are returned as the result of the
query.

Improvements
------------

* enhance the pre-processing steps: tokenization, normalization, handling of
punctuation

* for the inverted index, store the lists of document ids on disk (not
necessary in this example, as the size of the collection permits in-memory
storage)

* use a state-of-the-art search engine such as Elasticsearch (based on Lucene)

* account for the meaning of words in context: querying for 'kiwi bird'
should not yield results that are relevant to the fruit. This can be achieved
by replacing the current bag-of-words model with a model such as word2vec or
similar neural word representations, which can capture semantic and syntactic
word relationships

Usage
-----

`python newsgroup.py`

To create the conda environment

`conda create -p env --file conda.txt`

`source activate env/`
--------------------------------------------------------------------------------
/inverted_index.py:
--------------------------------------------------------------------------------
import ast


def read_newsgroups(newsgroups_data):
    '''
    Read the newsgroups corpus and return the documents it contains and the
    set of all words.

    :param newsgroups_data: the newsgroups corpus
    :return: the documents in the corpus and all words
    '''
    docs, words = {}, set()

    # use only lowercasing as a pre-processing step; this could be further
    # extended with tokenization, stemming and stop-word removal, e.g.
    # mirroring the pre-processing done for the TfidfVectorizer
    for idx, item in enumerate(newsgroups_data):
        txt = [t.lower() for t in item.split()]
        words |= set(txt)
        docs[idx] = txt
    return docs, words


def inverted_index(docs, words):
    '''
    Generate the inverted index from the documents and the words.

    :param docs: the documents in the collection, as a dictionary mapping
                 document ids to lists of words
    :param words: all words in the collection
    :return: the inverted index
    '''
    # initialize the inverted index for faster index generation
    iindex = {word: [] for word in words}

    for did, txt in docs.items():
        for word in txt:
            # a word may occur several times in the same document; documents
            # are processed one at a time, so duplicates of the same document
            # id would be consecutive, and checking the last entry is enough
            # to keep the postings list free of duplicates
            if not iindex[word] or iindex[word][-1] != did:
                iindex[word].append(did)

    return iindex


def traverse_syntax_tree(tree, iindex):
    '''
    Traverse the syntax tree representing a boolean query. By traversing the
    tree we evaluate the query.

    :param tree: the syntax tree
    :param iindex: the inverted index
    :return: set of document ids matching the query
    '''
    assert isinstance(tree, ast.BoolOp), 'Only boolean operators are allowed'

    # a chained query such as 'a and b and c' parses to a single BoolOp with
    # more than two operands, so collect the result of every operand
    doc_ids = []
    for value in tree.values:
        if isinstance(value, ast.Name):  # recursion ends
            doc_ids.append(set(iindex.get(value.id) or []))
        else:  # recursive call
            doc_ids.append(traverse_syntax_tree(value, iindex))

    # if it is an OR, perform set union
    if isinstance(tree.op, ast.Or):
        return set.union(*doc_ids)
    # if it is an AND, perform set intersection
    elif isinstance(tree.op, ast.And):
        return set.intersection(*doc_ids)
    else:
        raise ValueError('Not a supported boolean operator')


def boolean_search(query, iindex):
    '''
    Perform boolean search using arbitrary AND and OR operators.

    :param query: the query to perform
    :param iindex: the inverted index
    :return: a list of document ids matching the query
    '''
    if not query:
        print('No query received as input')
        return

    # convert the query to lowercase
    query = query.lower()

    # special case: the query contains a single term
    query_elems = query.split()
    if len(query_elems) <= 1:
        return list(iindex.get(query, []))

    # parse the query into a syntax tree
    tree = ast.parse(query, mode='eval')

    # traverse the syntax tree to obtain the result
    return list(traverse_syntax_tree(tree.body, iindex))
--------------------------------------------------------------------------------
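A minimal, self-contained sketch of how inverted_index.py can be exercised on
a toy corpus (the three 'documents' below are made up, standing in for the
real dataset loaded in newsgroup.py):

```python
import inverted_index as ii

# three tiny documents standing in for the 20 newsgroups corpus
corpus = ['Science and religion', 'religion only', 'science only']

docs, words = ii.read_newsgroups(corpus)
iindex = ii.inverted_index(docs, words)

print(ii.boolean_search('science', iindex))               # [0, 2]
print(ii.boolean_search('science and religion', iindex))  # [0]
print(ii.boolean_search('science or religion', iindex))   # 0, 1 and 2, in no particular order
```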