├── LICENSE
├── README.md
└── invertedindex
    ├── invertedindex.go
    └── invertedindex_test.go

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Elton SV

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Inverted-Index-Generator

An inverted index generator implemented in Go.

An inverted index is used to perform fast text searches across large collections of documents; it is the core data structure behind full-text search engines such as Elasticsearch. It works by building a table that maps each word to its frequency and the list of documents in which it appears.

Reference: [Stanford NLP Group](https://nlp.stanford.edu/IR-book/html/htmledition/a-first-take-at-building-an-inverted-index-1.html)
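
## Usage

A minimal usage sketch from a separate `main` package. The import path below is an assumption (it depends on where the module is hosted); `GenerateInvertedIndex` and `Find` are the functions defined in `invertedindex/invertedindex.go`.

```go
package main

import (
	"github.com/eltonsv/Inverted-Index-Generator/invertedindex" // assumed import path
)

func main() {
	docs := []string{
		"new home sales top forecasts",
		"home sales rise in july",
		"increase in home sales in july",
	}

	// Build the inverted index over the document list.
	index := invertedindex.GenerateInvertedIndex(docs)

	// Print the documents each term appears in.
	invertedindex.Find(index, "sales") // found in documents 0, 1 and 2
	invertedindex.Find(index, "boat")  // not found
}
```

`Find` prints its result directly; documents are identified by their zero-based position in the slice passed to `GenerateInvertedIndex`.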
--------------------------------------------------------------------------------
/invertedindex/invertedindex.go:
--------------------------------------------------------------------------------
package invertedindex

import (
	"fmt"
	"regexp"
	"strings"
)

// InvertedIndexEntry contains a term, the number of times it has
// appeared across all documents, and the list of documents it is
// present in.
type InvertedIndexEntry struct {
	Term            string
	Frequency       int
	DocumentListing []int
}

// InvertedIndex contains a hash map for quickly checking whether a
// term is present and a slice of the InvertedIndexEntry values.
type InvertedIndex struct {
	HashMap map[string]*InvertedIndexEntry
	Items   []*InvertedIndexEntry
}

// FindItem returns the position of the given term in the inverted
// index's Items slice. It panics if the term is not present, so
// callers should check HashMap first.
func (invertedIndex *InvertedIndex) FindItem(Term string) int {
	for index, item := range invertedIndex.Items {
		if item.Term == Term {
			return index
		}
	}
	panic("Not Found")
}

// AddItem first checks the hash map to see whether the given term is
// already present in the inverted index. If it is, the existing entry
// is updated by incrementing its frequency and appending the document
// it was found in. If it is not, a new entry is created and added to
// both the hash map and the Items slice.
func (invertedIndex *InvertedIndex) AddItem(Term string, Document int) {
	if invertedIndex.HashMap[Term] != nil {
		// log.Println("Index item", Term, "already exists :: updating existing item")

		foundItemPosition := invertedIndex.FindItem(Term)

		invertedIndex.Items[foundItemPosition].Frequency++
		invertedIndex.Items[foundItemPosition].DocumentListing = append(invertedIndex.Items[foundItemPosition].DocumentListing, Document)
	} else {
		// log.Println("Index item", Term, "does not exist :: creating new item")

		entry := &InvertedIndexEntry{
			Term:            Term,
			Frequency:       1,
			DocumentListing: []int{Document},
		}

		invertedIndex.HashMap[Term] = entry
		invertedIndex.Items = append(invertedIndex.Items, entry)
	}
}

// CreateInvertedIndex initializes an empty inverted index.
func CreateInvertedIndex() *InvertedIndex {
	invertedIndex := &InvertedIndex{
		HashMap: make(map[string]*InvertedIndexEntry),
		Items:   []*InvertedIndexEntry{},
	}
	return invertedIndex
}

// RemoveDuplicates filters out all duplicate words from a document's
// word list, keeping the first occurrence of each word.
func RemoveDuplicates(wordList []string) []string {
	keys := make(map[string]bool)
	uniqueWords := []string{}

	for _, entry := range wordList {
		if !keys[entry] {
			keys[entry] = true
			uniqueWords = append(uniqueWords, entry)
		}
	}

	return uniqueWords
}

// Preprocessing converts each word to lowercase.
// TODO: strip punctuation and other symbols from each word.
func Preprocessing(wordList []string) []string {
	ProcessedWordList := []string{}

	// Convert each string in wordList to lowercase
	// and add it to ProcessedWordList.
	for _, word := range wordList {
		ProcessedWordList = append(ProcessedWordList, strings.ToLower(word))
	}

	return ProcessedWordList
}

// Tokenize splits a document into individual words and returns
// the preprocessed, deduplicated word list.
func Tokenize(Doc string) []string {
	// The following regexp matches runs of
	// non-whitespace characters, i.e. words.
	r := regexp.MustCompile(`\S+`)
	wordList := r.FindAllString(Doc, -1)

	wordList = Preprocessing(wordList)
	wordList = RemoveDuplicates(wordList)

	return wordList
}

// GenerateDocMap creates a hash map of the unique
// words in a tokenized document.
func GenerateDocMap(token []string) map[string]bool {
	docMap := make(map[string]bool)

	for _, word := range token {
		docMap[word] = true
	}

	return docMap
}

// GenerateInvertedIndex tokenizes and preprocesses each document in
// the list, builds a hash map of the unique words in each one, and
// then uses those maps to build the inverted index of all words.
func GenerateInvertedIndex(DocList []string) InvertedIndex {
	globalDocMap := make([]map[string]bool, 0)

	for _, Doc := range DocList {
		token := Tokenize(Doc)
		docMap := GenerateDocMap(token)
		globalDocMap = append(globalDocMap, docMap)
	}

	// Create an empty inverted index.
	invertedIndex := CreateInvertedIndex()

	// Using the generated hash maps, add
	// each word to the inverted index.
	for DocMapIndex, DocMap := range globalDocMap {
		for DocEntry := range DocMap {
			invertedIndex.AddItem(DocEntry, DocMapIndex)
		}
	}
	return *invertedIndex
}

// Find checks whether the given search term exists in the inverted
// index and prints the documents the term appears in.
func Find(index InvertedIndex, searchTerm string) {
	Term := strings.ToLower(searchTerm)

	if index.HashMap[Term] != nil {
		itemPosition := index.FindItem(Term)
		item := index.Items[itemPosition]

		fmt.Println("Found:", searchTerm, "in documents:", item.DocumentListing)
	} else {
		fmt.Println("Not Found:", searchTerm)
	}
}
--------------------------------------------------------------------------------
/invertedindex/invertedindex_test.go:
--------------------------------------------------------------------------------
package invertedindex

import (
	"reflect"
	"testing"
)

var wordListTest = []string{
	"new", "HOme", "sales", "top", "forecasts",
	"home", "sales", "rise", "in", "July",
	"increase", "in", "home", "SALES", "in",
	"July", "new", "home", "sales", "rise", "July",
}

func TestPreprocessing(t *testing.T) {
	wordList := wordListTest

	expectedList := []string{
		"new", "home", "sales", "top", "forecasts",
		"home", "sales", "rise", "in", "july",
		"increase", "in", "home", "sales", "in",
		"july", "new", "home", "sales", "rise", "july",
	}

	actualList := Preprocessing(wordList)

	if !reflect.DeepEqual(expectedList, actualList) {
		t.Fatalf("\nExpected:%v \nGot:%v", expectedList, actualList)
	}
}

func TestPreprocessing_NoWordList(t *testing.T) {
	wordList := make([]string, 0)

	expectedList := make([]string, 0)

	actualList := Preprocessing(wordList)

	if !reflect.DeepEqual(expectedList, actualList) {
		t.Fatalf("\nExpected:%v \nGot:%v", expectedList, actualList)
	}
}

func TestRemoveDuplicates(t *testing.T) {
	wordList := Preprocessing(wordListTest)

	expectedList := []string{
		"new", "home", "sales", "top", "forecasts",
		"rise", "in", "july",
		"increase",
	}

	actualList := RemoveDuplicates(wordList)

	if !reflect.DeepEqual(expectedList, actualList) {
		t.Fatalf("\nExpected:%v \nGot:%v", expectedList, actualList)
	}
}

func TestRemoveDuplicates_NoWordList(t *testing.T) {
	wordList := make([]string, 0)

	expectedList := make([]string, 0)

	actualList := RemoveDuplicates(wordList)

	if !reflect.DeepEqual(expectedList, actualList) {
		t.Fatalf("\nExpected:%v \nGot:%v", expectedList, actualList)
	}
}

func TestTokenize(t *testing.T) {
	doc := "new home sales top forecasts NEW"

	expectedList := []string{
		"new", "home", "sales", "top", "forecasts",
	}

	actualList := Tokenize(doc)

	if !reflect.DeepEqual(expectedList, actualList) {
		t.Fatalf("\nExpected:%v \nGot:%v", expectedList, actualList)
	}
}

func TestTokenize_NoDoc(t *testing.T) {
	var doc string

	expectedList := []string{}

	actualList := Tokenize(doc)

	if !reflect.DeepEqual(expectedList, actualList) {
		t.Fatalf("\nExpected:%v \nGot:%v", expectedList, actualList)
	}
}
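
// The example below is an added sketch, not part of the original test suite.
// It exercises GenerateInvertedIndex and Find end to end using only the
// identifiers defined in invertedindex.go; documents are identified by their
// zero-based position in the slice passed to GenerateInvertedIndex.
func ExampleFind() {
	docs := []string{
		"new home sales top forecasts",
		"home sales rise in july",
	}

	// Build the index, then look up one term that exists and one that does not.
	index := GenerateInvertedIndex(docs)
	Find(index, "rise")
	Find(index, "boat")

	// Output:
	// Found: rise in documents: [1]
	// Not Found: boat
}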
--------------------------------------------------------------------------------