├── 1.csv ├── 2.csv ├── 2.json ├── .travis.yml ├── go.mod ├── .gitignore ├── 1.json ├── go.sum ├── LICENSE ├── example.go ├── README.md ├── fehrist_test.go └── fehrist └── fehrist.go /1.csv: -------------------------------------------------------------------------------- 1 | Name, Age, Profession 2 | Jhon Doe,24,Computer Engineer 3 | -------------------------------------------------------------------------------- /2.csv: -------------------------------------------------------------------------------- 1 | Name, Tax_2005, Tax_2006 2 | Adnan Siddiqi,12399, 29393 3 | -------------------------------------------------------------------------------- /2.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | { 4 | "car": "Toyota", 5 | "color": "Gray" 6 | }, 7 | { 8 | "car": "Honda", 9 | "color": "Green" 10 | } 11 | ] -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: stable 4 | 5 | env: 6 | global: 7 | - GIMME_DEBUG=1 8 | 9 | install: skip 10 | 11 | notifications: 12 | email: false 13 | 14 | script: 15 | - go test -v -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/kadnan/fehrist 2 | 3 | go 1.14 4 | 5 | require ( 6 | github.com/bbalet/stopwords v1.0.0 7 | github.com/reiver/go-porterstemmer v1.0.1 8 | github.com/vmihailenco/msgpack v4.0.4+incompatible 9 | golang.org/x/text v0.3.2 // indirect 10 | ) 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # 
Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | .vscode/* 15 | .DS_Store 16 | -------------------------------------------------------------------------------- /1.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | { 4 | "fruit": "Mango Sindh", 5 | "size": "Small", 6 | "color": "Yellow", 7 | "Etc":["A","B"] 8 | }, 9 | { 10 | "fruit": "Apple Tasty", 11 | "size": "Small", 12 | "color": "Red", 13 | "Etc":["D","E"] 14 | } 15 | ] -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/bbalet/stopwords v1.0.0 h1:0TnGycCtY0zZi4ltKoOGRFIlZHv0WqpoIGUsObjztfo= 2 | github.com/bbalet/stopwords v1.0.0/go.mod h1:sAWrQoDMfqARGIn4s6dp7OW7ISrshUD8IP2q3KoqPjc= 3 | github.com/reiver/go-porterstemmer v1.0.1 h1:WyERBkASXgoXrTwq/IQ6wyNj/YG7j/ZURvTuMCoud5w= 4 | github.com/reiver/go-porterstemmer v1.0.1/go.mod h1:Z8uL/f/7UEwaeAJNwx1sO8kbqXiEuQieNuD735hLrSU= 5 | github.com/vmihailenco/msgpack v4.0.4+incompatible h1:dSLoQfGFAo3F6OoNhwUmLwVgaUXK79GlxNBwueZn0xI= 6 | github.com/vmihailenco/msgpack v4.0.4+incompatible/go.mod h1:fy3FlTQTDXWkZ7Bh6AcGMlsjHatGryHQYUTf1ShIgkk= 7 | golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= 8 | golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= 9 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Adnan Siddiqi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, 
including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /example.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "strconv" 7 | 8 | "github.com/kadnan/fehrist/fehrist" 9 | ) 10 | 11 | func main() { 12 | path, _ := os.Getwd() 13 | 14 | //Indexing CSV Files 15 | CSVDocument := &fehrist.CSV{IndexName: "local"} 16 | for i := 1; i < 3; i++ { 17 | fileName := path + "/" + strconv.Itoa(i) + ".csv" 18 | fmt.Println("Indexing CSV data from the file,", fileName, ". Please wait...") 19 | 20 | indexCount, err := CSVDocument.Index(fileName) 21 | if err != nil { 22 | fmt.Println(err) 23 | } else { 24 | fmt.Println("Total Words indexed", indexCount) 25 | } 26 | } 27 | 28 | //Indexing JSON files 29 | JSONDocument := &fehrist.JSON{IndexName: "local"} 30 | for i := 1; i < 3; i++ { 31 | fileName := path + "/" + strconv.Itoa(i) + ".json" 32 | fmt.Println("Indexing CSV data from the file,", fileName, ". 
Please wait...") 33 | 34 | indexCount, err := JSONDocument.Index(fileName) 35 | if err != nil { 36 | fmt.Println(err) 37 | } else { 38 | fmt.Println("Total Words indexed", indexCount) 39 | } 40 | } 41 | 42 | /* Searching Documents */ 43 | 44 | CSVDocument.Init() 45 | result, _, err := CSVDocument.Search("siddiqi") 46 | if err != nil { 47 | fmt.Println(err) 48 | } 49 | fmt.Println("Printing the text present in CSV Document") 50 | fmt.Println(result) 51 | 52 | JSONDocument.Init() 53 | result, searchCount, err := JSONDocument.Search("mango") 54 | fmt.Println(searchCount) 55 | 56 | if err != nil { 57 | fmt.Println(err) 58 | } 59 | fmt.Println("Printing the text present in JSON Document") 60 | fmt.Println(result) 61 | 62 | } 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fehrist 2 | [![Build Status](https://api.travis-ci.org/kadnan/fehrist.svg)](https://travis-ci.org/kadnan/fehrist) 3 | 4 | _Fehrist_ is a pure Go library for indexing different types of documents. Currently it supports only CSV and JSON, but its flexible architecture gives you the liberty to add more document types. Fehrist (فہرست) is an Urdu word for **Index**. Similar terms are used in Arabic (فھرس) and Farsi (فہرست) as well. 5 | 6 | Fehrist is based on the [Inverted Index](https://en.wikipedia.org/wiki/Inverted_index) data structure. 7 | 8 | ## Examples 9 | ### For indexing 10 | ``` 11 | import ( 12 | "fmt" 13 | "os" 14 | "strconv" 15 | 16 | "github.com/kadnan/fehrist/fehrist" 17 | ) 18 | func main() { 19 | path, _ := os.Getwd() 20 | 21 | //Indexing CSV Files 22 | CSVDocument := &fehrist.CSV{IndexName: "local"} 23 | for i := 1; i < 3; i++ { 24 | fileName := path + "/" + strconv.Itoa(i) + ".csv" 25 | fmt.Println("Indexing CSV data from the file,", fileName, ". 
Please wait...") 26 | 27 | indexCount, err := CSVDocument.Index(fileName) 28 | if err != nil { 29 | fmt.Println(err) 30 | } else { 31 | fmt.Println("Total Words indexed", indexCount) 32 | } 33 | } 34 | 35 | //Indexing JSON files 36 | JSONDocument := &fehrist.JSON{IndexName: "local"} 37 | for i := 1; i < 3; i++ { 38 | fileName := path + "/" + strconv.Itoa(i) + ".json" 39 | fmt.Println("Indexing CSV data from the file,", fileName, ". Please wait...") 40 | 41 | indexCount, err := JSONDocument.Index(fileName) 42 | if err != nil { 43 | fmt.Println(err) 44 | } else { 45 | fmt.Println("Total Words indexed", indexCount) 46 | } 47 | } 48 | } 49 | ``` 50 | 51 | ### For Searching 52 | ``` 53 | /* Searching Documents */ 54 | 55 | CSVDocument.Init() 56 | result, _, err := CSVDocument.Search("siddiqi") 57 | if err != nil { 58 | fmt.Println(err) 59 | } 60 | fmt.Println("Printing the text present in CSV Document") 61 | fmt.Println(result) 62 | ``` 63 | If you want to learn how this all works, visit the [blog post](http://blog.adnansiddiqi.me/fehrist-document-indexing-library-in-go/). 64 | -------------------------------------------------------------------------------- /fehrist_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "os" 7 | "testing" 8 | 9 | "github.com/kadnan/fehrist/fehrist" 10 | ) 11 | 12 | func TestDocumentFileNotFoundCSV(t *testing.T) { 13 | path, _ := os.Getwd() 14 | fileName := path + "/" + "LOL.csv" 15 | CSVDocument := &fehrist.CSV{IndexName: "local"} 16 | indexCount, _ := CSVDocument.Index(fileName) 17 | 18 | if indexCount != -1 { 19 | t.Errorf("Test DocumentFileNotFound Failed") 20 | } 21 | } 22 | 23 | func TestDocumentIndexedCSV(t *testing.T) { 24 | path, _ := os.Getwd() 25 | fileName := path + "/" + "1.csv" 26 | CSVDocument := &fehrist.CSV{IndexName: "local"} 27 | indexCount, err := CSVDocument.Index(fileName) 28 | fmt.Println(indexCount) 29 | 
fmt.Println(err) 30 | 31 | if indexCount < 1 { 32 | t.Errorf("Test Document Index Failed CSV") 33 | } 34 | } 35 | 36 | func TestDocumentFileNotFoundJSON(t *testing.T) { 37 | path, _ := os.Getwd() 38 | fileName := path + "/" + "LOL.json" 39 | CSVDocument := &fehrist.CSV{IndexName: "local"} 40 | indexCount, _ := CSVDocument.Index(fileName) 41 | if indexCount != -1 { 42 | t.Errorf("Test Document Indexed Failed") 43 | } 44 | } 45 | 46 | func TestDocumentIndexedJSON(t *testing.T) { 47 | path, _ := os.Getwd() 48 | fileName := path + "/" + "1.json" 49 | JSONDocument := &fehrist.JSON{IndexName: "local"} 50 | indexCount, _ := JSONDocument.Index(fileName) 51 | if indexCount < 1 { 52 | t.Errorf("Test Document Index Failed JSON") 53 | } 54 | } 55 | func TestInitCSV(t *testing.T) { 56 | Document := &fehrist.CSV{IndexName: "local"} 57 | result := Document.Init() 58 | if result != 1 { 59 | t.Errorf("Test Initialization failed for CSV Document") 60 | } 61 | } 62 | func TestInitJSON(t *testing.T) { 63 | Document := &fehrist.JSON{IndexName: "local"} 64 | result := Document.Init() 65 | if result != 1 { 66 | t.Errorf("Test Initialization failed for JSON Document") 67 | } 68 | } 69 | 70 | func TestSearchCSVKWExist(t *testing.T) { 71 | var input map[string]interface{} 72 | 73 | Document := &fehrist.CSV{IndexName: "local"} 74 | result := Document.Init() 75 | 76 | if result == 1 { 77 | out, _, _ := Document.Search("mango") 78 | err := json.Unmarshal([]byte(out), &input) 79 | if err != nil { 80 | fmt.Println(err) 81 | } 82 | _, ok := input["Total"] 83 | if !ok { 84 | t.Errorf("Test TestSearchCSVKWExist failed for CSV Document") 85 | } 86 | } 87 | } 88 | 89 | func TestSearchJSONKWExist(t *testing.T) { 90 | var input map[string]interface{} 91 | 92 | Document := &fehrist.JSON{IndexName: "local"} 93 | result := Document.Init() 94 | 95 | if result == 1 { 96 | out, _, _ := Document.Search("mango") 97 | err := json.Unmarshal([]byte(out), &input) 98 | if err != nil { 99 | fmt.Println(err) 100 | } 
101 | _, ok := input["Total"] 102 | if !ok { 103 | t.Errorf("Test TestSearchCSVKWExist failed for JSON Document") 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /fehrist/fehrist.go: -------------------------------------------------------------------------------- 1 | // Package fehrist implements routines related to different kind of indexing 2 | 3 | package fehrist 4 | 5 | import ( 6 | "bytes" 7 | "crypto/sha1" 8 | "encoding/csv" 9 | "encoding/hex" 10 | "encoding/json" 11 | "errors" 12 | "fmt" 13 | "io/ioutil" 14 | "os" 15 | "path/filepath" 16 | "strconv" 17 | "strings" 18 | "time" 19 | 20 | "github.com/vmihailenco/msgpack" 21 | ) 22 | 23 | var entries = make(map[string][2]string) //to store values 24 | var tokenized = make(map[string]string) 25 | var mergedIndexMap = make(map[string]string) 26 | var mergedMap = make(map[string]string) 27 | var mergedMapDocuments = make(map[string][2]string) 28 | var idx = 0 29 | var isLoaded = false 30 | 31 | //All Constants 32 | const success int = 1 33 | const failure int = -1 34 | 35 | /* unmarshalData removes all the field that are not required and returns a flat Map 36 | Author: Peter Hellberg (https://gophers.slack.com/) - Thanks Peter!! 37 | */ 38 | func unmarshalData(data []byte) ([]map[string]interface{}, error) { 39 | var input []map[string]interface{} 40 | 41 | if err := json.Unmarshal(data, &input); err != nil { 42 | return nil, err 43 | } 44 | 45 | var resp []map[string]interface{} 46 | 47 | for _, in := range input { 48 | d := map[string]interface{}{} 49 | for k, v := range in { 50 | switch v.(type) { 51 | case string, float64: 52 | d[k] = v 53 | } 54 | } 55 | resp = append(resp, d) 56 | } 57 | 58 | return resp, nil 59 | } 60 | 61 | // Save function saves the value in the in a file. 
62 | func save(content map[string][2]string, t map[string]string, fileHandleDocument *os.File, fileHandleIndex *os.File) (int, error) { 63 | 64 | defer fileHandleDocument.Close() 65 | defer fileHandleIndex.Close() 66 | 67 | // Saving Content Data after Marshalizing 68 | if len(content) > 0 { 69 | b, err := msgpack.Marshal(content) 70 | 71 | if err != nil { 72 | return 0, errors.New("Decoding Failed for Document") 73 | } 74 | 75 | fileHandleDocument.Write(b) 76 | } 77 | 78 | // Saving Index Data after Marshalizing 79 | if len(t) > 0 { 80 | b, err := msgpack.Marshal(t) 81 | 82 | if err != nil { 83 | return 0, errors.New("Decoding Failed for Index") 84 | } 85 | 86 | fileHandleIndex.Write(b) 87 | 88 | } 89 | 90 | return success, nil 91 | } 92 | 93 | /* saveIndex saves all index and document related info on disk. It is responsible for: 94 | - Create a Folder of Index Name 95 | - For each document index it creates a file with numeric sequence name with extension .idx 96 | - It stores original document along with assignedID 97 | */ 98 | func saveIndex(indexName string, path string, documentFileName string) { 99 | fileSquence := 0 100 | indexPath := indexName + "/" 101 | pattern := filepath.Join(indexPath, "*.idx") 102 | 103 | if _, err := os.Stat(indexName); os.IsNotExist(err) { 104 | os.Mkdir(indexName, 0700) //Write from the same program 105 | } 106 | 107 | // Folder Created. 
Now we have to check the next available sequence of file 108 | 109 | existingIndexFiles, err := filepath.Glob(pattern) 110 | 111 | fileSquence = len(existingIndexFiles) 112 | 113 | if err != nil { 114 | fmt.Println(err.Error()) 115 | } 116 | 117 | if len(existingIndexFiles) > 0 { 118 | fileSquence = len(existingIndexFiles) 119 | } 120 | 121 | indexFileName := indexPath + strconv.Itoa(fileSquence) + ".idx" 122 | docFileName := indexPath + documentFileName + ".document" 123 | 124 | // Save the Document File with .document extension 125 | docFile, err := os.OpenFile(docFileName, os.O_CREATE|os.O_RDWR, 0644) 126 | if err != nil { 127 | fmt.Println(err.Error()) 128 | } 129 | 130 | indexFile, err := os.OpenFile(indexFileName, os.O_CREATE|os.O_RDWR, 0644) 131 | if err != nil { 132 | fmt.Println(err.Error()) 133 | } 134 | 135 | if err != nil { 136 | fmt.Println(err.Error()) 137 | } 138 | 139 | _, err = save(entries, tokenized, docFile, indexFile) 140 | 141 | if err != nil { 142 | fmt.Println(err.Error()) 143 | } 144 | } 145 | 146 | // generateDocID generates a random DocID 147 | func generateDocID(text string) string { 148 | algorithm := sha1.New() 149 | algorithm.Write([]byte(text)) 150 | idx++ 151 | return hex.EncodeToString(algorithm.Sum(nil)) //ALERT: Implement hex based ID 152 | //return strconv.Itoa(idx) 153 | } 154 | 155 | // Indexer is the interface that implements important stuff 156 | type Indexer interface { 157 | Index(fileName string) 158 | assignDocID(entry string) 159 | tokenizeDocument() map[string]string 160 | } 161 | 162 | // A CSV represents a CSV Doccument 163 | type CSV struct { 164 | IndexName string 165 | } 166 | 167 | // A JSON represents a CSV Doccument 168 | type JSON struct { 169 | IndexName string 170 | } 171 | 172 | //DocumentList holds the return searched doc structure 173 | type DocumentList struct { 174 | FileName string 175 | DocText string 176 | } 177 | 178 | //SearchResult implements Search JSON 179 | type SearchResult struct { 180 | 
Total int 181 | Result []DocumentList 182 | } 183 | 184 | func msgPack2MapIndex(marshalled string) map[string]string { 185 | var tempTokenizedMap = make(map[string]string) 186 | msgpack.Unmarshal([]byte(marshalled), &tempTokenizedMap) 187 | return tempTokenizedMap 188 | } 189 | func msgPack2MapDocument(marshalled string) map[string][2]string { 190 | var tempDocMap = make(map[string][2]string) 191 | msgpack.Unmarshal([]byte(marshalled), &tempDocMap) 192 | return tempDocMap 193 | } 194 | 195 | //generateJSONArray checks whether the JSON is array of object or not, if no then make it one 196 | func generateJSONArray(data string) []byte { 197 | //jsonWithoutSpace := strings.ReplaceAll(string(data), " ", "") 198 | jsonWithoutSpace := strings.TrimSpace(data) 199 | if string(jsonWithoutSpace[0]) != "[" && string(jsonWithoutSpace[len(jsonWithoutSpace)-1]) != "]" { 200 | jsonWithoutSpace = "[" + jsonWithoutSpace + "]" 201 | } 202 | 203 | return []byte(jsonWithoutSpace) 204 | } 205 | 206 | //Index is used to index JSON documents after assigning Document ID 207 | func (c *JSON) Index(fileName string) (int, error) { 208 | _, fileNameOnly := filepath.Split(fileName) 209 | // Read the file 210 | file, _ := os.Open(fileName) 211 | defer file.Close() 212 | 213 | byteValue, _ := ioutil.ReadAll(file) 214 | s := string(byteValue) 215 | fixedJSON := generateJSONArray(s) 216 | 217 | cleanJSON, err := unmarshalData(fixedJSON) 218 | for _, v := range cleanJSON { 219 | b, _ := json.Marshal(v) 220 | c.assignDocID(string(b), fileNameOnly) 221 | } 222 | 223 | c.tokenizeDocument() 224 | // Index is created now save the Index files and original mapped document in files 225 | saveIndex(c.IndexName, ".", fileNameOnly) 226 | 227 | if err != nil { 228 | return failure, nil 229 | } 230 | return len(tokenized), nil 231 | } 232 | 233 | func (c *JSON) assignDocID(entry string, documentFile string) { 234 | var rec [2]string 235 | rec[0] = documentFile 236 | rec[1] = entry 237 | now := time.Now() 238 | 
docID := "f_" + generateDocID(now.String()) 239 | entries[docID] = rec 240 | } 241 | 242 | //tokenzeDocument tokenize the document into words and store them into Array. 243 | func (c *JSON) tokenizeDocument() { 244 | 245 | rec := make(map[string]string) 246 | for key, entry := range entries { 247 | key = strings.TrimSpace(key) 248 | 249 | json.Unmarshal([]byte(entry[1]), &rec) 250 | for _, v := range rec { 251 | 252 | val := strings.ToLower(v) 253 | words := strings.Fields(val) 254 | for _, word := range words { 255 | _, ok := tokenized[word] 256 | 257 | if ok { 258 | tokenized[word] = tokenized[word] + "|" + key 259 | } else { 260 | tokenized[word] = key 261 | } 262 | } 263 | } 264 | } 265 | 266 | } 267 | 268 | //Init initializes the index and document related maps of the given index for JSON Documents 269 | func (c *JSON) Init() int { 270 | var tempTokenizedMap = make(map[string]string) 271 | var tempDocmentMap = make(map[string][2]string) 272 | path, err := os.Getwd() 273 | 274 | //Fetching all index files and merge their maps into a single map 275 | indexPath := c.IndexName + "/" 276 | pattern := filepath.Join(indexPath, "*.idx") 277 | existingIndexFiles, err := filepath.Glob(pattern) 278 | if err != nil { 279 | fmt.Println(err.Error()) 280 | } 281 | pattern = filepath.Join(indexPath, "*.document") 282 | existingDocumentFiles, err := filepath.Glob(pattern) 283 | 284 | if err != nil { 285 | fmt.Println(err.Error()) 286 | } 287 | 288 | // Iterating Index files and map merging. 
289 | for _, z := range existingIndexFiles { 290 | file, _ := os.Open(filepath.Join(path, z)) 291 | buf := new(bytes.Buffer) 292 | buf.ReadFrom(file) 293 | contents := buf.String() 294 | file.Close() // Close the file immediately once done 295 | tempTokenizedMap = msgPack2MapIndex(contents) 296 | 297 | if len(tempTokenizedMap) > 0 { 298 | for key, value := range tempTokenizedMap { 299 | 300 | if _, found := mergedMap[key]; found { 301 | mergedMap[key] = mergedMap[key] + "|" + value 302 | } else { 303 | mergedMap[key] = value 304 | } 305 | 306 | } 307 | } 308 | } 309 | 310 | //Iterating Document files and merge them 311 | for _, z := range existingDocumentFiles { 312 | file, _ := os.Open(filepath.Join(path, z)) 313 | 314 | buf := new(bytes.Buffer) 315 | buf.ReadFrom(file) 316 | contents := buf.String() 317 | file.Close() // Close the file immediately once done 318 | tempDocmentMap = msgPack2MapDocument(contents) 319 | 320 | if len(tempDocmentMap) > 0 { 321 | for key, value := range tempDocmentMap { 322 | mergedMapDocuments[key] = value 323 | } 324 | } 325 | } 326 | if err != nil { 327 | return failure 328 | } 329 | return success 330 | } 331 | 332 | // Search returns the result against the keyword being provided. 333 | func (c *JSON) Search(keyword string) (string, int, error) { 334 | var documents []DocumentList 335 | // var result SearchResult 336 | //var docs []string 337 | 338 | if len(mergedMap) == 0 && len(mergedMapDocuments) == 0 { 339 | return "", -1, errors.New("No data was found. 
Did you call Init function?") 340 | } 341 | 342 | //Check the index map first 343 | v, found := mergedMap[strings.ToLower(keyword)] 344 | keys := strings.Split(v, "|") 345 | 346 | //result = SearchResult{Total:len(keys),Result: } 347 | 348 | //fmt.Println(mergedMapDocuments[keys[0]]) 349 | 350 | for _, documentID := range keys { 351 | entry := mergedMapDocuments[documentID] 352 | 353 | if len(entry) == 2 { 354 | documents = append(documents, DocumentList{FileName: entry[0], DocText: entry[1]}) 355 | } 356 | } 357 | x := SearchResult{Total: len(keys), Result: documents} 358 | jsonData, err := json.Marshal(x) 359 | if err != nil { 360 | return "", failure, errors.New("Could not decode") 361 | } 362 | 363 | if found { 364 | return string(jsonData), 1, nil 365 | } 366 | return "", failure, nil 367 | } 368 | 369 | //Init initializes the index and document related maps of the given index for CSV Documents 370 | func (c *CSV) Init() int { 371 | var tempTokenizedMap = make(map[string]string) 372 | var tempDocmentMap = make(map[string][2]string) 373 | path, _ := os.Getwd() 374 | 375 | //Fetching all index files and merge their maps into a single map 376 | indexPath := c.IndexName + "/" 377 | pattern := filepath.Join(indexPath, "*.idx") 378 | existingIndexFiles, err := filepath.Glob(pattern) 379 | if err != nil { 380 | fmt.Println(err.Error()) 381 | } 382 | pattern = filepath.Join(indexPath, "*.document") 383 | existingDocumentFiles, err := filepath.Glob(pattern) 384 | 385 | if err != nil { 386 | fmt.Println(err.Error()) 387 | } 388 | 389 | // Iterating Index files and map merging. 
390 | for _, z := range existingIndexFiles { 391 | file, _ := os.Open(filepath.Join(path, z)) 392 | buf := new(bytes.Buffer) 393 | buf.ReadFrom(file) 394 | contents := buf.String() 395 | file.Close() // Close the file immediately once done 396 | tempTokenizedMap = msgPack2MapIndex(contents) 397 | 398 | if len(tempTokenizedMap) > 0 { 399 | for key, value := range tempTokenizedMap { 400 | 401 | if _, found := mergedMap[key]; found { 402 | mergedMap[key] = mergedMap[key] + "|" + value 403 | } else { 404 | mergedMap[key] = value 405 | } 406 | 407 | } 408 | } 409 | } 410 | 411 | //Iterating Document files and merge them 412 | for _, z := range existingDocumentFiles { 413 | file, _ := os.Open(filepath.Join(path, z)) 414 | 415 | buf := new(bytes.Buffer) 416 | buf.ReadFrom(file) 417 | contents := buf.String() 418 | file.Close() // Close the file immediately once done 419 | tempDocmentMap = msgPack2MapDocument(contents) 420 | 421 | if len(tempDocmentMap) > 0 { 422 | for key, value := range tempDocmentMap { 423 | mergedMapDocuments[key] = value 424 | } 425 | } 426 | } 427 | if err != nil { 428 | return failure 429 | } 430 | return success 431 | 432 | } 433 | 434 | // Search returns the result against the keyword being provided. 435 | func (c *CSV) Search(keyword string) (string, int, error) { 436 | var documents []DocumentList 437 | // var result SearchResult 438 | //var docs []string 439 | 440 | if len(mergedMap) == 0 && len(mergedMapDocuments) == 0 { 441 | return "", failure, errors.New("No data was found. 
Did you call Init function?") 442 | } 443 | 444 | //Check the index map first 445 | v, found := mergedMap[strings.ToLower(keyword)] 446 | keys := strings.Split(v, "|") 447 | 448 | for _, documentID := range keys { 449 | entry := mergedMapDocuments[documentID] 450 | 451 | if len(entry) == 2 { 452 | documents = append(documents, DocumentList{FileName: entry[0], DocText: entry[1]}) 453 | } 454 | } 455 | x := SearchResult{Total: len(keys), Result: documents} 456 | jsonData, err := json.Marshal(x) 457 | if err != nil { 458 | return "", failure, errors.New("Could not decode") 459 | } 460 | 461 | if found { 462 | return string(jsonData), 1, nil 463 | } 464 | return "", success, nil 465 | } 466 | 467 | func (c *CSV) assignDocID(entry string, documentFile string) { 468 | var rec [2]string 469 | rec[0] = documentFile 470 | rec[1] = entry 471 | now := time.Now() 472 | docID := "f_" + generateDocID(now.String()) 473 | entries[docID] = rec 474 | } 475 | 476 | //tokenzeDocument tokenize the document into words and store them into Array. 
477 | func (c *CSV) tokenizeDocument() { 478 | 479 | for key, entry := range entries { 480 | key = strings.TrimSpace(key) 481 | line := strings.Replace(entry[1], ",", " ", 3) 482 | line = strings.ToLower(line) 483 | words := strings.Fields(line) 484 | 485 | for _, word := range words { 486 | _, ok := tokenized[word] 487 | if ok { 488 | tokenized[word] = tokenized[word] + "|" + key 489 | } else { 490 | tokenized[word] = key 491 | } 492 | } 493 | } 494 | } 495 | 496 | //Index indexes the document 497 | func (c *CSV) Index(fileName string) (int, error) { 498 | 499 | _, fileNameOnly := filepath.Split(fileName) 500 | // Read the file 501 | file, _ := os.Open(fileName) 502 | defer file.Close() 503 | 504 | parser := csv.NewReader(file) 505 | parser.FieldsPerRecord = -1 506 | 507 | if _, err := parser.Read(); err != nil { 508 | return failure, errors.New("File not found") 509 | } 510 | 511 | records, err := parser.ReadAll() 512 | 513 | if err != nil { 514 | return failure, errors.New("Could not read the CSV file") 515 | } 516 | 517 | // Assign Document ID to each record 518 | for _, record := range records { 519 | rec := strings.Join(record, ",") 520 | c.assignDocID(rec, fileNameOnly) 521 | } 522 | if len(entries) > 0 { 523 | c.tokenizeDocument() 524 | // Index is created now save the Index files and original mapped document in files 525 | saveIndex(c.IndexName, ".", fileNameOnly) 526 | } 527 | 528 | return len(tokenized), nil 529 | } 530 | --------------------------------------------------------------------------------