├── 1.csv
├── 2.csv
├── 2.json
├── .travis.yml
├── go.mod
├── .gitignore
├── 1.json
├── go.sum
├── LICENSE
├── example.go
├── README.md
├── fehrist_test.go
└── fehrist
    └── fehrist.go


/1.csv:
--------------------------------------------------------------------------------
1 | Name, Age, Profession
2 | Jhon Doe,24,Computer Engineer
3 | 


--------------------------------------------------------------------------------
/2.csv:
--------------------------------------------------------------------------------
1 | Name, Tax_2005, Tax_2006
2 | Adnan Siddiqi,12399, 29393
3 | 


--------------------------------------------------------------------------------
/2.json:
--------------------------------------------------------------------------------
 1 | [
 2 | 
 3 |     {
 4 |         "car": "Toyota",
 5 |         "color": "Gray"
 6 |     },
 7 |     {
 8 |         "car": "Honda",
 9 |         "color": "Green"
10 |     }
11 | ]


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: go
 2 | 
 3 | go: stable
 4 | 
 5 | env:
 6 |   global:
 7 |     - GIMME_DEBUG=1
 8 | 
 9 | install: skip
10 | 
11 | notifications:
12 |   email: false
13 | 
14 | script:
15 |     - go test -v


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/kadnan/fehrist
 2 | 
 3 | go 1.14
 4 | 
 5 | require (
 6 | 	github.com/bbalet/stopwords v1.0.0
 7 | 	github.com/reiver/go-porterstemmer v1.0.1
 8 | 	github.com/vmihailenco/msgpack v4.0.4+incompatible
 9 | 	golang.org/x/text v0.3.2 // indirect
10 | )
11 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Binaries for programs and plugins
 2 | *.exe
 3 | *.exe~
 4 | *.dll
 5 | *.so
 6 | *.dylib
 7 | 
 8 | # Test binary, built with `go test -c`
 9 | *.test
10 | 
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 | 
14 | .vscode/*
15 | .DS_Store
16 | 


--------------------------------------------------------------------------------
/1.json:
--------------------------------------------------------------------------------
 1 | [
 2 | 
 3 |     {
 4 |         "fruit": "Mango Sindh",
 5 |         "size": "Small",
 6 |         "color": "Yellow",
 7 |         "Etc":["A","B"]
 8 |     },
 9 |     {
10 |         "fruit": "Apple Tasty",
11 |         "size": "Small",
12 |         "color": "Red",
13 |         "Etc":["D","E"]
14 |     }
15 | ]


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/bbalet/stopwords v1.0.0 h1:0TnGycCtY0zZi4ltKoOGRFIlZHv0WqpoIGUsObjztfo=
 2 | github.com/bbalet/stopwords v1.0.0/go.mod h1:sAWrQoDMfqARGIn4s6dp7OW7ISrshUD8IP2q3KoqPjc=
 3 | github.com/reiver/go-porterstemmer v1.0.1 h1:WyERBkASXgoXrTwq/IQ6wyNj/YG7j/ZURvTuMCoud5w=
 4 | github.com/reiver/go-porterstemmer v1.0.1/go.mod h1:Z8uL/f/7UEwaeAJNwx1sO8kbqXiEuQieNuD735hLrSU=
 5 | github.com/vmihailenco/msgpack v4.0.4+incompatible h1:dSLoQfGFAo3F6OoNhwUmLwVgaUXK79GlxNBwueZn0xI=
 6 | github.com/vmihailenco/msgpack v4.0.4+incompatible/go.mod h1:fy3FlTQTDXWkZ7Bh6AcGMlsjHatGryHQYUTf1ShIgkk=
 7 | golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
 8 | golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
 9 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Adnan Siddiqi
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/example.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"os"
 6 | 	"strconv"
 7 | 
 8 | 	"github.com/kadnan/fehrist/fehrist"
 9 | )
10 | 
11 | func main() {
12 | 	path, _ := os.Getwd()
13 | 
14 | 	//Indexing CSV Files
15 | 	CSVDocument := &fehrist.CSV{IndexName: "local"}
16 | 	for i := 1; i < 3; i++ {
17 | 		fileName := path + "/" + strconv.Itoa(i) + ".csv"
18 | 		fmt.Println("Indexing CSV data from the file,", fileName, ". Please wait...")
19 | 
20 | 		indexCount, err := CSVDocument.Index(fileName)
21 | 		if err != nil {
22 | 			fmt.Println(err)
23 | 		} else {
24 | 			fmt.Println("Total Words indexed", indexCount)
25 | 		}
26 | 	}
27 | 
28 | 	//Indexing JSON files
29 | 	JSONDocument := &fehrist.JSON{IndexName: "local"}
30 | 	for i := 1; i < 3; i++ {
31 | 		fileName := path + "/" + strconv.Itoa(i) + ".json"
32 | 		fmt.Println("Indexing CSV data from the file,", fileName, ". Please wait...")
33 | 
34 | 		indexCount, err := JSONDocument.Index(fileName)
35 | 		if err != nil {
36 | 			fmt.Println(err)
37 | 		} else {
38 | 			fmt.Println("Total Words indexed", indexCount)
39 | 		}
40 | 	}
41 | 
42 | 	/* Searching Documents */
43 | 
44 | 	CSVDocument.Init()
45 | 	result, _, err := CSVDocument.Search("siddiqi")
46 | 	if err != nil {
47 | 		fmt.Println(err)
48 | 	}
49 | 	fmt.Println("Printing the text present in CSV Document")
50 | 	fmt.Println(result)
51 | 
52 | 	JSONDocument.Init()
53 | 	result, searchCount, err := JSONDocument.Search("mango")
54 | 	fmt.Println(searchCount)
55 | 
56 | 	if err != nil {
57 | 		fmt.Println(err)
58 | 	}
59 | 	fmt.Println("Printing the text present in JSON Document")
60 | 	fmt.Println(result)
61 | 
62 | }
63 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Fehrist
 2 | [![Build Status](https://api.travis-ci.org/kadnan/fehrist.svg)](https://travis-ci.org/kadnan/fehrist)
 3 | 
 4 | _Fehrist_ is a pure Go library for indexing different types of documents. Currently it supports only CSV and JSON, but its flexible architecture lets you add support for more document types. Fehrist (فہرست) is the Urdu word for **Index**. Similar terms are used in Arabic (فھرس) and Farsi (فہرست) as well.
 5 | 
 6 | Fehrist uses the [Inverted Index](https://en.wikipedia.org/wiki/Inverted_index) data structure for indexing.
 7 | 
 8 | ## Examples
 9 | ### For indexing
10 | ```
11 | import (
12 | 	"fmt"
13 | 	"os"
14 | 	"strconv"
15 | 
16 | 	"github.com/kadnan/fehrist/fehrist"
17 | )
18 | func main() {
19 | 	path, _ := os.Getwd()
20 |     
21 |     //Indexing CSV Files
22 |     CSVDocument := &fehrist.CSV{IndexName: "local"}
23 | 	for i := 1; i < 3; i++ {
24 | 		fileName := path + "/" + strconv.Itoa(i) + ".csv"
25 | 		fmt.Println("Indexing CSV data from the file,", fileName, ". Please wait...")
26 | 
27 | 		indexCount, err := CSVDocument.Index(fileName)
28 | 		if err != nil {
29 | 			fmt.Println(err)
30 | 		} else {
31 | 			fmt.Println("Total Words indexed", indexCount)
32 | 		}
33 | 	}
34 | 
35 |     //Indexing JSON files
36 | 	JSONDocument := &fehrist.JSON{IndexName: "local"}
37 | 	for i := 1; i < 3; i++ {
38 | 		fileName := path + "/" + strconv.Itoa(i) + ".json"
39 | 		fmt.Println("Indexing CSV data from the file,", fileName, ". Please wait...")
40 | 
41 | 		indexCount, err := JSONDocument.Index(fileName)
42 | 		if err != nil {
43 | 			fmt.Println(err)
44 | 		} else {
45 | 			fmt.Println("Total Words indexed", indexCount)
46 | 		}
47 | 	}
48 | }
49 | ```
50 | 
51 | ### For Searching
52 | ```
53 | /* Searching Documents */
54 | 
55 | 	CSVDocument.Init()
56 | 	result, _, err := CSVDocument.Search("siddiqi")
57 | 	if err != nil {
58 | 		fmt.Println(err)
59 | 	}
60 | 	fmt.Println("Printing the text present in CSV Document")
61 | 	fmt.Println(result)
62 | ```
63 | If you want to learn how all of this works, visit the [blog post](http://blog.adnansiddiqi.me/fehrist-document-indexing-library-in-go/).
64 | 


--------------------------------------------------------------------------------
/fehrist_test.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"encoding/json"
  5 | 	"fmt"
  6 | 	"os"
  7 | 	"testing"
  8 | 
  9 | 	"github.com/kadnan/fehrist/fehrist"
 10 | )
 11 | 
 12 | func TestDocumentFileNotFoundCSV(t *testing.T) {
 13 | 	path, _ := os.Getwd()
 14 | 	fileName := path + "/" + "LOL.csv"
 15 | 	CSVDocument := &fehrist.CSV{IndexName: "local"}
 16 | 	indexCount, _ := CSVDocument.Index(fileName)
 17 | 
 18 | 	if indexCount != -1 {
 19 | 		t.Errorf("Test DocumentFileNotFound Failed")
 20 | 	}
 21 | }
 22 | 
 23 | func TestDocumentIndexedCSV(t *testing.T) {
 24 | 	path, _ := os.Getwd()
 25 | 	fileName := path + "/" + "1.csv"
 26 | 	CSVDocument := &fehrist.CSV{IndexName: "local"}
 27 | 	indexCount, err := CSVDocument.Index(fileName)
 28 | 	fmt.Println(indexCount)
 29 | 	fmt.Println(err)
 30 | 
 31 | 	if indexCount < 1 {
 32 | 		t.Errorf("Test Document Index Failed CSV")
 33 | 	}
 34 | }
 35 | 
 36 | func TestDocumentFileNotFoundJSON(t *testing.T) {
 37 | 	path, _ := os.Getwd()
 38 | 	fileName := path + "/" + "LOL.json"
 39 | 	CSVDocument := &fehrist.CSV{IndexName: "local"}
 40 | 	indexCount, _ := CSVDocument.Index(fileName)
 41 | 	if indexCount != -1 {
 42 | 		t.Errorf("Test Document Indexed Failed")
 43 | 	}
 44 | }
 45 | 
 46 | func TestDocumentIndexedJSON(t *testing.T) {
 47 | 	path, _ := os.Getwd()
 48 | 	fileName := path + "/" + "1.json"
 49 | 	JSONDocument := &fehrist.JSON{IndexName: "local"}
 50 | 	indexCount, _ := JSONDocument.Index(fileName)
 51 | 	if indexCount < 1 {
 52 | 		t.Errorf("Test Document Index Failed JSON")
 53 | 	}
 54 | }
 55 | func TestInitCSV(t *testing.T) {
 56 | 	Document := &fehrist.CSV{IndexName: "local"}
 57 | 	result := Document.Init()
 58 | 	if result != 1 {
 59 | 		t.Errorf("Test Initialization failed for CSV Document")
 60 | 	}
 61 | }
 62 | func TestInitJSON(t *testing.T) {
 63 | 	Document := &fehrist.JSON{IndexName: "local"}
 64 | 	result := Document.Init()
 65 | 	if result != 1 {
 66 | 		t.Errorf("Test Initialization failed for JSON Document")
 67 | 	}
 68 | }
 69 | 
 70 | func TestSearchCSVKWExist(t *testing.T) {
 71 | 	var input map[string]interface{}
 72 | 
 73 | 	Document := &fehrist.CSV{IndexName: "local"}
 74 | 	result := Document.Init()
 75 | 
 76 | 	if result == 1 {
 77 | 		out, _, _ := Document.Search("mango")
 78 | 		err := json.Unmarshal([]byte(out), &input)
 79 | 		if err != nil {
 80 | 			fmt.Println(err)
 81 | 		}
 82 | 		_, ok := input["Total"]
 83 | 		if !ok {
 84 | 			t.Errorf("Test TestSearchCSVKWExist failed for CSV Document")
 85 | 		}
 86 | 	}
 87 | }
 88 | 
 89 | func TestSearchJSONKWExist(t *testing.T) {
 90 | 	var input map[string]interface{}
 91 | 
 92 | 	Document := &fehrist.JSON{IndexName: "local"}
 93 | 	result := Document.Init()
 94 | 
 95 | 	if result == 1 {
 96 | 		out, _, _ := Document.Search("mango")
 97 | 		err := json.Unmarshal([]byte(out), &input)
 98 | 		if err != nil {
 99 | 			fmt.Println(err)
100 | 		}
101 | 		_, ok := input["Total"]
102 | 		if !ok {
103 | 			t.Errorf("Test TestSearchCSVKWExist failed for JSON  Document")
104 | 		}
105 | 	}
106 | }
107 | 


--------------------------------------------------------------------------------
/fehrist/fehrist.go:
--------------------------------------------------------------------------------
  1 | // Package fehrist implements routines related to different kind of indexing
  2 | 
  3 | package fehrist
  4 | 
  5 | import (
  6 | 	"bytes"
  7 | 	"crypto/sha1"
  8 | 	"encoding/csv"
  9 | 	"encoding/hex"
 10 | 	"encoding/json"
 11 | 	"errors"
 12 | 	"fmt"
 13 | 	"io/ioutil"
 14 | 	"os"
 15 | 	"path/filepath"
 16 | 	"strconv"
 17 | 	"strings"
 18 | 	"time"
 19 | 
 20 | 	"github.com/vmihailenco/msgpack"
 21 | )
 22 | 
 23 | var entries = make(map[string][2]string) //to store values
 24 | var tokenized = make(map[string]string)
 25 | var mergedIndexMap = make(map[string]string)
 26 | var mergedMap = make(map[string]string)
 27 | var mergedMapDocuments = make(map[string][2]string)
 28 | var idx = 0
 29 | var isLoaded = false
 30 | 
 31 | //All Constants
 32 | const success int = 1
 33 | const failure int = -1
 34 | 
 35 | /* unmarshalData removes all the field that are not required and returns a flat Map
 36 | Author: Peter Hellberg (https://gophers.slack.com/) - Thanks Peter!!
 37 | */
 38 | func unmarshalData(data []byte) ([]map[string]interface{}, error) {
 39 | 	var input []map[string]interface{}
 40 | 
 41 | 	if err := json.Unmarshal(data, &input); err != nil {
 42 | 		return nil, err
 43 | 	}
 44 | 
 45 | 	var resp []map[string]interface{}
 46 | 
 47 | 	for _, in := range input {
 48 | 		d := map[string]interface{}{}
 49 | 		for k, v := range in {
 50 | 			switch v.(type) {
 51 | 			case string, float64:
 52 | 				d[k] = v
 53 | 			}
 54 | 		}
 55 | 		resp = append(resp, d)
 56 | 	}
 57 | 
 58 | 	return resp, nil
 59 | }
 60 | 
 61 | // Save function saves the value in the in a file.
 62 | func save(content map[string][2]string, t map[string]string, fileHandleDocument *os.File, fileHandleIndex *os.File) (int, error) {
 63 | 
 64 | 	defer fileHandleDocument.Close()
 65 | 	defer fileHandleIndex.Close()
 66 | 
 67 | 	// Saving Content Data after Marshalizing
 68 | 	if len(content) > 0 {
 69 | 		b, err := msgpack.Marshal(content)
 70 | 
 71 | 		if err != nil {
 72 | 			return 0, errors.New("Decoding Failed for Document")
 73 | 		}
 74 | 
 75 | 		fileHandleDocument.Write(b)
 76 | 	}
 77 | 
 78 | 	// Saving Index Data after Marshalizing
 79 | 	if len(t) > 0 {
 80 | 		b, err := msgpack.Marshal(t)
 81 | 
 82 | 		if err != nil {
 83 | 			return 0, errors.New("Decoding Failed for Index")
 84 | 		}
 85 | 
 86 | 		fileHandleIndex.Write(b)
 87 | 
 88 | 	}
 89 | 
 90 | 	return success, nil
 91 | }
 92 | 
 93 | /* saveIndex saves all index and document related info on disk. It is responsible for:
 94 | - Create a Folder of Index Name
 95 | - For each document index it creates a file with numeric sequence name with extension .idx
 96 | - It stores original document along with assignedID
 97 | */
 98 | func saveIndex(indexName string, path string, documentFileName string) {
 99 | 	fileSquence := 0
100 | 	indexPath := indexName + "/"
101 | 	pattern := filepath.Join(indexPath, "*.idx")
102 | 
103 | 	if _, err := os.Stat(indexName); os.IsNotExist(err) {
104 | 		os.Mkdir(indexName, 0700) //Write from the same program
105 | 	}
106 | 
107 | 	// Folder Created. Now we have to check the next available sequence of file
108 | 
109 | 	existingIndexFiles, err := filepath.Glob(pattern)
110 | 
111 | 	fileSquence = len(existingIndexFiles)
112 | 
113 | 	if err != nil {
114 | 		fmt.Println(err.Error())
115 | 	}
116 | 
117 | 	if len(existingIndexFiles) > 0 {
118 | 		fileSquence = len(existingIndexFiles)
119 | 	}
120 | 
121 | 	indexFileName := indexPath + strconv.Itoa(fileSquence) + ".idx"
122 | 	docFileName := indexPath + documentFileName + ".document"
123 | 
124 | 	// Save the Document File with .document extension
125 | 	docFile, err := os.OpenFile(docFileName, os.O_CREATE|os.O_RDWR, 0644)
126 | 	if err != nil {
127 | 		fmt.Println(err.Error())
128 | 	}
129 | 
130 | 	indexFile, err := os.OpenFile(indexFileName, os.O_CREATE|os.O_RDWR, 0644)
131 | 	if err != nil {
132 | 		fmt.Println(err.Error())
133 | 	}
134 | 
135 | 	if err != nil {
136 | 		fmt.Println(err.Error())
137 | 	}
138 | 
139 | 	_, err = save(entries, tokenized, docFile, indexFile)
140 | 
141 | 	if err != nil {
142 | 		fmt.Println(err.Error())
143 | 	}
144 | }
145 | 
146 | // generateDocID generates a random DocID
147 | func generateDocID(text string) string {
148 | 	algorithm := sha1.New()
149 | 	algorithm.Write([]byte(text))
150 | 	idx++
151 | 	return hex.EncodeToString(algorithm.Sum(nil)) //ALERT: Implement hex based ID
152 | 	//return strconv.Itoa(idx)
153 | }
154 | 
// Indexer describes the indexing operations shared by the document types in
// this package. The method signatures now match the methods CSV and JSON
// define; the previous signatures matched neither type.
type Indexer interface {
	// Index indexes the document stored in fileName and returns a word
	// count, or the failure code together with an error.
	Index(fileName string) (int, error)
	// assignDocID stores entry, read from documentFile, under a new document ID.
	assignDocID(entry string, documentFile string)
	// tokenizeDocument splits the stored entries into words and records
	// which document IDs contain each word.
	tokenizeDocument()
}
161 | 
// A CSV represents a CSV document that is indexed into, and searched from,
// the index named IndexName.
type CSV struct {
	// IndexName is the folder in which the index and document files are kept.
	IndexName string
}
166 | 
// A JSON represents a JSON document that is indexed into, and searched from,
// the index named IndexName.
type JSON struct {
	// IndexName is the folder in which the index and document files are kept.
	IndexName string
}
171 | 
// DocumentList holds one document returned by a search.
type DocumentList struct {
	// FileName is the name of the file the document was indexed from.
	FileName string
	// DocText is the stored text of the document.
	DocText  string
}
177 | 
// SearchResult is the structure the Search methods encode as JSON.
type SearchResult struct {
	// Total is the number of document IDs found for the keyword.
	Total  int
	// Result lists the documents found for the keyword.
	Result []DocumentList
}
183 | 
184 | func msgPack2MapIndex(marshalled string) map[string]string {
185 | 	var tempTokenizedMap = make(map[string]string)
186 | 	msgpack.Unmarshal([]byte(marshalled), &tempTokenizedMap)
187 | 	return tempTokenizedMap
188 | }
189 | func msgPack2MapDocument(marshalled string) map[string][2]string {
190 | 	var tempDocMap = make(map[string][2]string)
191 | 	msgpack.Unmarshal([]byte(marshalled), &tempDocMap)
192 | 	return tempDocMap
193 | }
194 | 
// generateJSONArray trims surrounding whitespace from data and, when the
// text neither starts with "[" nor ends with "]", wraps it in brackets so a
// single JSON object becomes a one-element array.
//
// Empty or whitespace-only input is returned as an empty slice. It used to
// panic with an index-out-of-range error; returning it unwrapped lets the
// caller's JSON decoder report the problem.
func generateJSONArray(data string) []byte {
	trimmed := strings.TrimSpace(data)
	if trimmed == "" {
		return []byte(trimmed)
	}

	if !strings.HasPrefix(trimmed, "[") && !strings.HasSuffix(trimmed, "]") {
		trimmed = "[" + trimmed + "]"
	}

	return []byte(trimmed)
}
205 | 
206 | //Index is used to index JSON documents after assigning Document ID
207 | func (c *JSON) Index(fileName string) (int, error) {
208 | 	_, fileNameOnly := filepath.Split(fileName)
209 | 	// Read the file
210 | 	file, _ := os.Open(fileName)
211 | 	defer file.Close()
212 | 
213 | 	byteValue, _ := ioutil.ReadAll(file)
214 | 	s := string(byteValue)
215 | 	fixedJSON := generateJSONArray(s)
216 | 
217 | 	cleanJSON, err := unmarshalData(fixedJSON)
218 | 	for _, v := range cleanJSON {
219 | 		b, _ := json.Marshal(v)
220 | 		c.assignDocID(string(b), fileNameOnly)
221 | 	}
222 | 
223 | 	c.tokenizeDocument()
224 | 	// Index is created now save the Index files and original mapped document in files
225 | 	saveIndex(c.IndexName, ".", fileNameOnly)
226 | 
227 | 	if err != nil {
228 | 		return failure, nil
229 | 	}
230 | 	return len(tokenized), nil
231 | }
232 | 
233 | func (c *JSON) assignDocID(entry string, documentFile string) {
234 | 	var rec [2]string
235 | 	rec[0] = documentFile
236 | 	rec[1] = entry
237 | 	now := time.Now()
238 | 	docID := "f_" + generateDocID(now.String())
239 | 	entries[docID] = rec
240 | }
241 | 
242 | //tokenzeDocument tokenize the document into words and store them into Array.
243 | func (c *JSON) tokenizeDocument() {
244 | 
245 | 	rec := make(map[string]string)
246 | 	for key, entry := range entries {
247 | 		key = strings.TrimSpace(key)
248 | 
249 | 		json.Unmarshal([]byte(entry[1]), &rec)
250 | 		for _, v := range rec {
251 | 
252 | 			val := strings.ToLower(v)
253 | 			words := strings.Fields(val)
254 | 			for _, word := range words {
255 | 				_, ok := tokenized[word]
256 | 
257 | 				if ok {
258 | 					tokenized[word] = tokenized[word] + "|" + key
259 | 				} else {
260 | 					tokenized[word] = key
261 | 				}
262 | 			}
263 | 		}
264 | 	}
265 | 
266 | }
267 | 
268 | //Init initializes the index and document related maps of the given index for JSON Documents
269 | func (c *JSON) Init() int {
270 | 	var tempTokenizedMap = make(map[string]string)
271 | 	var tempDocmentMap = make(map[string][2]string)
272 | 	path, err := os.Getwd()
273 | 
274 | 	//Fetching all index files and merge their maps into a single map
275 | 	indexPath := c.IndexName + "/"
276 | 	pattern := filepath.Join(indexPath, "*.idx")
277 | 	existingIndexFiles, err := filepath.Glob(pattern)
278 | 	if err != nil {
279 | 		fmt.Println(err.Error())
280 | 	}
281 | 	pattern = filepath.Join(indexPath, "*.document")
282 | 	existingDocumentFiles, err := filepath.Glob(pattern)
283 | 
284 | 	if err != nil {
285 | 		fmt.Println(err.Error())
286 | 	}
287 | 
288 | 	// Iterating Index files and map merging.
289 | 	for _, z := range existingIndexFiles {
290 | 		file, _ := os.Open(filepath.Join(path, z))
291 | 		buf := new(bytes.Buffer)
292 | 		buf.ReadFrom(file)
293 | 		contents := buf.String()
294 | 		file.Close() // Close the file immediately once done
295 | 		tempTokenizedMap = msgPack2MapIndex(contents)
296 | 
297 | 		if len(tempTokenizedMap) > 0 {
298 | 			for key, value := range tempTokenizedMap {
299 | 
300 | 				if _, found := mergedMap[key]; found {
301 | 					mergedMap[key] = mergedMap[key] + "|" + value
302 | 				} else {
303 | 					mergedMap[key] = value
304 | 				}
305 | 
306 | 			}
307 | 		}
308 | 	}
309 | 
310 | 	//Iterating Document files and merge them
311 | 	for _, z := range existingDocumentFiles {
312 | 		file, _ := os.Open(filepath.Join(path, z))
313 | 
314 | 		buf := new(bytes.Buffer)
315 | 		buf.ReadFrom(file)
316 | 		contents := buf.String()
317 | 		file.Close() // Close the file immediately once done
318 | 		tempDocmentMap = msgPack2MapDocument(contents)
319 | 
320 | 		if len(tempDocmentMap) > 0 {
321 | 			for key, value := range tempDocmentMap {
322 | 				mergedMapDocuments[key] = value
323 | 			}
324 | 		}
325 | 	}
326 | 	if err != nil {
327 | 		return failure
328 | 	}
329 | 	return success
330 | }
331 | 
332 | // Search returns the result against the keyword being provided.
333 | func (c *JSON) Search(keyword string) (string, int, error) {
334 | 	var documents []DocumentList
335 | 	// var result SearchResult
336 | 	//var docs []string
337 | 
338 | 	if len(mergedMap) == 0 && len(mergedMapDocuments) == 0 {
339 | 		return "", -1, errors.New("No data was found. Did you call Init function?")
340 | 	}
341 | 
342 | 	//Check the index map first
343 | 	v, found := mergedMap[strings.ToLower(keyword)]
344 | 	keys := strings.Split(v, "|")
345 | 
346 | 	//result = SearchResult{Total:len(keys),Result: }
347 | 
348 | 	//fmt.Println(mergedMapDocuments[keys[0]])
349 | 
350 | 	for _, documentID := range keys {
351 | 		entry := mergedMapDocuments[documentID]
352 | 
353 | 		if len(entry) == 2 {
354 | 			documents = append(documents, DocumentList{FileName: entry[0], DocText: entry[1]})
355 | 		}
356 | 	}
357 | 	x := SearchResult{Total: len(keys), Result: documents}
358 | 	jsonData, err := json.Marshal(x)
359 | 	if err != nil {
360 | 		return "", failure, errors.New("Could not decode")
361 | 	}
362 | 
363 | 	if found {
364 | 		return string(jsonData), 1, nil
365 | 	}
366 | 	return "", failure, nil
367 | }
368 | 
369 | //Init initializes the index and document related maps of the given index for CSV Documents
370 | func (c *CSV) Init() int {
371 | 	var tempTokenizedMap = make(map[string]string)
372 | 	var tempDocmentMap = make(map[string][2]string)
373 | 	path, _ := os.Getwd()
374 | 
375 | 	//Fetching all index files and merge their maps into a single map
376 | 	indexPath := c.IndexName + "/"
377 | 	pattern := filepath.Join(indexPath, "*.idx")
378 | 	existingIndexFiles, err := filepath.Glob(pattern)
379 | 	if err != nil {
380 | 		fmt.Println(err.Error())
381 | 	}
382 | 	pattern = filepath.Join(indexPath, "*.document")
383 | 	existingDocumentFiles, err := filepath.Glob(pattern)
384 | 
385 | 	if err != nil {
386 | 		fmt.Println(err.Error())
387 | 	}
388 | 
389 | 	// Iterating Index files and map merging.
390 | 	for _, z := range existingIndexFiles {
391 | 		file, _ := os.Open(filepath.Join(path, z))
392 | 		buf := new(bytes.Buffer)
393 | 		buf.ReadFrom(file)
394 | 		contents := buf.String()
395 | 		file.Close() // Close the file immediately once done
396 | 		tempTokenizedMap = msgPack2MapIndex(contents)
397 | 
398 | 		if len(tempTokenizedMap) > 0 {
399 | 			for key, value := range tempTokenizedMap {
400 | 
401 | 				if _, found := mergedMap[key]; found {
402 | 					mergedMap[key] = mergedMap[key] + "|" + value
403 | 				} else {
404 | 					mergedMap[key] = value
405 | 				}
406 | 
407 | 			}
408 | 		}
409 | 	}
410 | 
411 | 	//Iterating Document files and merge them
412 | 	for _, z := range existingDocumentFiles {
413 | 		file, _ := os.Open(filepath.Join(path, z))
414 | 
415 | 		buf := new(bytes.Buffer)
416 | 		buf.ReadFrom(file)
417 | 		contents := buf.String()
418 | 		file.Close() // Close the file immediately once done
419 | 		tempDocmentMap = msgPack2MapDocument(contents)
420 | 
421 | 		if len(tempDocmentMap) > 0 {
422 | 			for key, value := range tempDocmentMap {
423 | 				mergedMapDocuments[key] = value
424 | 			}
425 | 		}
426 | 	}
427 | 	if err != nil {
428 | 		return failure
429 | 	}
430 | 	return success
431 | 
432 | }
433 | 
434 | // Search returns the result against the keyword being provided.
435 | func (c *CSV) Search(keyword string) (string, int, error) {
436 | 	var documents []DocumentList
437 | 	// var result SearchResult
438 | 	//var docs []string
439 | 
440 | 	if len(mergedMap) == 0 && len(mergedMapDocuments) == 0 {
441 | 		return "", failure, errors.New("No data was found. Did you call Init function?")
442 | 	}
443 | 
444 | 	//Check the index map first
445 | 	v, found := mergedMap[strings.ToLower(keyword)]
446 | 	keys := strings.Split(v, "|")
447 | 
448 | 	for _, documentID := range keys {
449 | 		entry := mergedMapDocuments[documentID]
450 | 
451 | 		if len(entry) == 2 {
452 | 			documents = append(documents, DocumentList{FileName: entry[0], DocText: entry[1]})
453 | 		}
454 | 	}
455 | 	x := SearchResult{Total: len(keys), Result: documents}
456 | 	jsonData, err := json.Marshal(x)
457 | 	if err != nil {
458 | 		return "", failure, errors.New("Could not decode")
459 | 	}
460 | 
461 | 	if found {
462 | 		return string(jsonData), 1, nil
463 | 	}
464 | 	return "", success, nil
465 | }
466 | 
467 | func (c *CSV) assignDocID(entry string, documentFile string) {
468 | 	var rec [2]string
469 | 	rec[0] = documentFile
470 | 	rec[1] = entry
471 | 	now := time.Now()
472 | 	docID := "f_" + generateDocID(now.String())
473 | 	entries[docID] = rec
474 | }
475 | 
476 | //tokenzeDocument tokenize the document into words and store them into Array.
477 | func (c *CSV) tokenizeDocument() {
478 | 
479 | 	for key, entry := range entries {
480 | 		key = strings.TrimSpace(key)
481 | 		line := strings.Replace(entry[1], ",", " ", 3)
482 | 		line = strings.ToLower(line)
483 | 		words := strings.Fields(line)
484 | 
485 | 		for _, word := range words {
486 | 			_, ok := tokenized[word]
487 | 			if ok {
488 | 				tokenized[word] = tokenized[word] + "|" + key
489 | 			} else {
490 | 				tokenized[word] = key
491 | 			}
492 | 		}
493 | 	}
494 | }
495 | 
496 | //Index indexes the document
497 | func (c *CSV) Index(fileName string) (int, error) {
498 | 
499 | 	_, fileNameOnly := filepath.Split(fileName)
500 | 	// Read the file
501 | 	file, _ := os.Open(fileName)
502 | 	defer file.Close()
503 | 
504 | 	parser := csv.NewReader(file)
505 | 	parser.FieldsPerRecord = -1
506 | 
507 | 	if _, err := parser.Read(); err != nil {
508 | 		return failure, errors.New("File not found")
509 | 	}
510 | 
511 | 	records, err := parser.ReadAll()
512 | 
513 | 	if err != nil {
514 | 		return failure, errors.New("Could not read the CSV file")
515 | 	}
516 | 
517 | 	// Assign Document ID to each record
518 | 	for _, record := range records {
519 | 		rec := strings.Join(record, ",")
520 | 		c.assignDocID(rec, fileNameOnly)
521 | 	}
522 | 	if len(entries) > 0 {
523 | 		c.tokenizeDocument()
524 | 		// Index is created now save the Index files and original mapped document in files
525 | 		saveIndex(c.IndexName, ".", fileNameOnly)
526 | 	}
527 | 
528 | 	return len(tokenized), nil
529 | }
530 | 


--------------------------------------------------------------------------------