├── LICENSE
├── README.md
└── invertedindex
    ├── invertedindex.go
    └── invertedindex_test.go

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Elton SV

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Inverted-Index-Generator

An inverted index generator implemented in Go.

An inverted index is used to perform fast text searches across large collections of documents; it is the core data structure behind full-text search engines such as Elasticsearch. It works by building a table that maps each word to its frequency and the list of documents in which it appears.

Reference: [Stanford NLP Group](https://nlp.stanford.edu/IR-book/html/htmledition/a-first-take-at-building-an-inverted-index-1.html)
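
## Usage

A minimal usage sketch from a separate `main` package. The import path below is an assumption (it depends on where the module is hosted); `GenerateInvertedIndex` and `Find` are the functions defined in `invertedindex/invertedindex.go`.

```go
package main

import (
	"github.com/eltonsv/Inverted-Index-Generator/invertedindex" // assumed import path
)

func main() {
	docs := []string{
		"new home sales top forecasts",
		"home sales rise in july",
		"increase in home sales in july",
	}

	// Build the inverted index over the document list.
	index := invertedindex.GenerateInvertedIndex(docs)

	// Print the documents each term appears in.
	invertedindex.Find(index, "sales") // found in documents 0, 1 and 2
	invertedindex.Find(index, "boat")  // not found
}
```

`Find` prints its result directly; documents are identified by their zero-based position in the slice passed to `GenerateInvertedIndex`.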
--------------------------------------------------------------------------------
/invertedindex/invertedindex.go:
--------------------------------------------------------------------------------
package invertedindex

import (
	"fmt"
	"regexp"
	"strings"
)

// InvertedIndexEntry contains a term, the number of times it has
// appeared across all documents, and the list of documents it is
// present in.
type InvertedIndexEntry struct {
	Term            string
	Frequency       int
	DocumentListing []int
}

// InvertedIndex contains a hash map for quickly checking whether a
// term is present and a slice of the InvertedIndexEntry values.
type InvertedIndex struct {
	HashMap map[string]*InvertedIndexEntry
	Items   []*InvertedIndexEntry
}

// FindItem returns the position of the given term in the inverted
// index's Items slice. It panics if the term is not present, so
// callers should check HashMap first.
func (invertedIndex *InvertedIndex) FindItem(Term string) int {
	for index, item := range invertedIndex.Items {
		if item.Term == Term {
			return index
		}
	}
	panic("Not Found")
}

// AddItem first checks the hash map to see whether the given term is
// already present in the inverted index. If it is, the existing entry
// is updated by incrementing its frequency and appending the document
// it was found in. If it is not, a new entry is created and added to
// both the hash map and the Items slice.
func (invertedIndex *InvertedIndex) AddItem(Term string, Document int) {
	if invertedIndex.HashMap[Term] != nil {
		// log.Println("Index item", Term, "already exists :: updating existing item")

		foundItemPosition := invertedIndex.FindItem(Term)

		invertedIndex.Items[foundItemPosition].Frequency++
		invertedIndex.Items[foundItemPosition].DocumentListing = append(invertedIndex.Items[foundItemPosition].DocumentListing, Document)
	} else {
		// log.Println("Index item", Term, "does not exist :: creating new item")

		entry := &InvertedIndexEntry{
			Term:            Term,
			Frequency:       1,
			DocumentListing: []int{Document},
		}

		invertedIndex.HashMap[Term] = entry
		invertedIndex.Items = append(invertedIndex.Items, entry)
	}
}

// CreateInvertedIndex initializes an empty inverted index.
func CreateInvertedIndex() *InvertedIndex {
	invertedIndex := &InvertedIndex{
		HashMap: make(map[string]*InvertedIndexEntry),
		Items:   []*InvertedIndexEntry{},
	}
	return invertedIndex
}

// RemoveDuplicates filters out all duplicate words from a document's
// word list, keeping the first occurrence of each word.
func RemoveDuplicates(wordList []string) []string {
	keys := make(map[string]bool)
	uniqueWords := []string{}

	for _, entry := range wordList {
		if !keys[entry] {
			keys[entry] = true
			uniqueWords = append(uniqueWords, entry)
		}
	}

	return uniqueWords
}

// Preprocessing converts each word to lowercase.
// TODO: strip punctuation and other symbols from each word.
func Preprocessing(wordList []string) []string {
	ProcessedWordList := []string{}

	// Convert each string in wordList to lowercase
	// and add it to ProcessedWordList.
	for _, word := range wordList {
		ProcessedWordList = append(ProcessedWordList, strings.ToLower(word))
	}

	return ProcessedWordList
}

// Tokenize splits a document into individual words and returns
// the preprocessed, deduplicated word list.
func Tokenize(Doc string) []string {
	// The following regexp matches runs of
	// non-whitespace characters, i.e. words.
	r := regexp.MustCompile(`\S+`)
	wordList := r.FindAllString(Doc, -1)

	wordList = Preprocessing(wordList)
	wordList = RemoveDuplicates(wordList)

	return wordList
}

// GenerateDocMap creates a hash map of the unique
// words in a tokenized document.
func GenerateDocMap(token []string) map[string]bool {
	docMap := make(map[string]bool)

	for _, word := range token {
		docMap[word] = true
	}

	return docMap
}

// GenerateInvertedIndex tokenizes and preprocesses each document in
// the list, builds a hash map of the unique words in each one, and
// then uses those maps to build the inverted index of all words.
func GenerateInvertedIndex(DocList []string) InvertedIndex {
	globalDocMap := make([]map[string]bool, 0)

	for _, Doc := range DocList {
		token := Tokenize(Doc)
		docMap := GenerateDocMap(token)
		globalDocMap = append(globalDocMap, docMap)
	}

	// Create an empty inverted index.
	invertedIndex := CreateInvertedIndex()

	// Using the generated hash maps, add
	// each word to the inverted index.
	for DocMapIndex, DocMap := range globalDocMap {
		for DocEntry := range DocMap {
			invertedIndex.AddItem(DocEntry, DocMapIndex)
		}
	}
	return *invertedIndex
}

// Find checks whether the given search term exists in the inverted
// index and prints the documents the term appears in.
func Find(index InvertedIndex, searchTerm string) {
	Term := strings.ToLower(searchTerm)

	if index.HashMap[Term] != nil {
		itemPosition := index.FindItem(Term)
		item := index.Items[itemPosition]

		fmt.Println("Found:", searchTerm, "in documents:", item.DocumentListing)
	} else {
		fmt.Println("Not Found:", searchTerm)
	}
}
--------------------------------------------------------------------------------
/invertedindex/invertedindex_test.go:
--------------------------------------------------------------------------------
package invertedindex

import (
	"reflect"
	"testing"
)

var wordListTest = []string{
	"new", "HOme", "sales", "top", "forecasts",
	"home", "sales", "rise", "in", "July",
	"increase", "in", "home", "SALES", "in",
	"July", "new", "home", "sales", "rise", "July",
}

func TestPreprocessing(t *testing.T) {
	wordList := wordListTest

	expectedList := []string{
		"new", "home", "sales", "top", "forecasts",
		"home", "sales", "rise", "in", "july",
		"increase", "in", "home", "sales", "in",
		"july", "new", "home", "sales", "rise", "july",
	}

	actualList := Preprocessing(wordList)

	if !reflect.DeepEqual(expectedList, actualList) {
		t.Fatalf("\nExpected:%v \nGot:%v", expectedList, actualList)
	}
}

func TestPreprocessing_NoWordList(t *testing.T) {
	wordList := make([]string, 0)

	expectedList := make([]string, 0)

	actualList := Preprocessing(wordList)

	if !reflect.DeepEqual(expectedList, actualList) {
		t.Fatalf("\nExpected:%v \nGot:%v", expectedList, actualList)
	}
}

func TestRemoveDuplicates(t *testing.T) {
	wordList := Preprocessing(wordListTest)

	expectedList := []string{
		"new", "home", "sales", "top", "forecasts",
		"rise", "in", "july",
		"increase",
	}

	actualList := RemoveDuplicates(wordList)

	if !reflect.DeepEqual(expectedList, actualList) {
		t.Fatalf("\nExpected:%v \nGot:%v", expectedList, actualList)
	}
}

func TestRemoveDuplicates_NoWordList(t *testing.T) {
	wordList := make([]string, 0)

	expectedList := make([]string, 0)

	actualList := RemoveDuplicates(wordList)

	if !reflect.DeepEqual(expectedList, actualList) {
		t.Fatalf("\nExpected:%v \nGot:%v", expectedList, actualList)
	}
}

func TestTokenize(t *testing.T) {
	doc := "new home sales top forecasts NEW"

	expectedList := []string{
		"new", "home", "sales", "top", "forecasts",
	}

	actualList := Tokenize(doc)

	if !reflect.DeepEqual(expectedList, actualList) {
		t.Fatalf("\nExpected:%v \nGot:%v", expectedList, actualList)
	}
}

func TestTokenize_NoDoc(t *testing.T) {
	var doc string

	expectedList := []string{}

	actualList := Tokenize(doc)

	if !reflect.DeepEqual(expectedList, actualList) {
		t.Fatalf("\nExpected:%v \nGot:%v", expectedList, actualList)
	}
}
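
// The example below is an added sketch, not part of the original test suite.
// It exercises GenerateInvertedIndex and Find end to end using only the
// identifiers defined in invertedindex.go; documents are identified by their
// zero-based position in the slice passed to GenerateInvertedIndex.
func ExampleFind() {
	docs := []string{
		"new home sales top forecasts",
		"home sales rise in july",
	}

	// Build the index, then look up one term that exists and one that does not.
	index := GenerateInvertedIndex(docs)
	Find(index, "rise")
	Find(index, "boat")

	// Output:
	// Found: rise in documents: [1]
	// Not Found: boat
}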
--------------------------------------------------------------------------------