├── .gitignore ├── illustration.png ├── go.mod ├── Makefile ├── go.sum ├── README.md ├── search_index_test.go └── search_index.go /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .vscode 3 | vendor 4 | coverage.out 5 | -------------------------------------------------------------------------------- /illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twelvedata/searchindex/HEAD/illustration.png -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/twelvedata/searchindex 2 | 3 | go 1.12 4 | 5 | require ( 6 | github.com/iancoleman/orderedmap v0.0.0-20190318233801-ac98e3ecb4b0 7 | golang.org/x/text v0.3.3 8 | ) 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PATH_THIS:=$(realpath $(dir $(lastword ${MAKEFILE_LIST}))) 2 | DIR:=$(PATH_THIS) 3 | 4 | 5 | help: 6 | @echo " test" 7 | @echo " Run tests" 8 | 9 | 10 | .PHONY: test 11 | test: 12 | @cd $(DIR) \ 13 | && go test ./... 
14 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/iancoleman/orderedmap v0.0.0-20190318233801-ac98e3ecb4b0 h1:i462o439ZjprVSFSZLZxcsoAe592sZB1rci2Z8j4wdk= 2 | github.com/iancoleman/orderedmap v0.0.0-20190318233801-ac98e3ecb4b0/go.mod h1:N0Wam8K1arqPXNWjMo21EXnBPOPp36vB07FNRdD2geA= 3 | golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k= 4 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 5 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # In-memory search index 2 | 3 | ![](illustration.png) 4 | 5 | ## Features 6 | 7 | - Indexing with a simple tree 8 | - Search by beginning of string 9 | - Exact search 10 | - Suitable for short strings 11 | 12 | More details in the [article on Medium](https://medium.com/twelve-data/in-memory-text-search-index-for-quotes-on-go-5243adc62c26) 13 | 14 | ## How to use 15 | 16 | ```go 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "github.com/twelvedata/searchindex" 22 | ) 23 | 24 | type SymbolInfo struct { 25 | Symbol string 26 | Exchange string 27 | Instrument string 28 | } 29 | 30 | func main() { 31 | // Values to index 32 | searchList := searchindex.SearchList{ 33 | &searchindex.SearchItem{ 34 | Key: "AAPL", 35 | Data: &SymbolInfo{Symbol: "AAPL", Exchange: "NASDAQ", Instrument: "Apple Inc"}, 36 | }, 37 | &searchindex.SearchItem{ 38 | Key: "AMZN", 39 | Data: &SymbolInfo{Symbol: "AMZN", Exchange: "NASDAQ", Instrument: "Amazon.com Inc"}, 40 | }, 41 | } 42 | 43 | // Fill index 44 | searchIndex := searchindex.NewSearchIndex(searchList, 10, nil, nil, true, nil, 1) 45 | 46 | // Search 47 | result := 
searchIndex.Search(searchindex.SearchParams{ 48 | Text: "aa", 49 | OutputSize: 10, 50 | Matching: searchindex.Beginning, 51 | }) 52 | 53 | fmt.Println(result[0]) 54 | } 55 | ``` 56 | 57 | Run tests: 58 | 59 | ```bash 60 | make test 61 | ``` 62 | -------------------------------------------------------------------------------- /search_index_test.go: -------------------------------------------------------------------------------- 1 | package searchindex 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "testing" 7 | ) 8 | 9 | type SymbolInfo struct { 10 | Symbol string 11 | Exchange string 12 | Instrument string 13 | } 14 | 15 | func sortFunc(i, j int, data interface{}) bool { 16 | if data.(SearchList)[i].Key == data.(SearchList)[j].Key { 17 | return data.(SearchList)[i].Data.(*SymbolInfo).Exchange < data.(SearchList)[j].Data.(*SymbolInfo).Exchange 18 | } 19 | return data.(SearchList)[i].Key < data.(SearchList)[j].Key 20 | } 21 | 22 | func TestSearchIndex(t *testing.T) { 23 | 24 | a_1 := SearchItem{Key: "A", Data: &SymbolInfo{Symbol: "A", Exchange: "1", Instrument: "A company"}} 25 | a_2 := SearchItem{Key: "A", Data: &SymbolInfo{Symbol: "A", Exchange: "2", Instrument: "A company"}} 26 | a_3 := SearchItem{Key: "A", Data: &SymbolInfo{Symbol: "A", Exchange: "3", Instrument: "A company"}} 27 | an := SearchItem{Key: "AN", Data: &SymbolInfo{Symbol: "AN", Exchange: "3", Instrument: "A company"}} 28 | ap := SearchItem{Key: "AP", Data: &SymbolInfo{Symbol: "AP", Exchange: "3", Instrument: "APPROVE company"}} 29 | ag := SearchItem{Key: "AG", Data: &SymbolInfo{Symbol: "AG", Exchange: "3", Instrument: "AG company"}} 30 | az := SearchItem{Key: "AZ", Data: &SymbolInfo{Symbol: "AZ", Exchange: "3"}} 31 | b1 := SearchItem{Key: "B", Data: &SymbolInfo{Symbol: "B", Exchange: "2", Instrument: "Company Betta"}} 32 | aa_1 := SearchItem{Key: "AA", Data: &SymbolInfo{Symbol: "AA", Instrument: "HA"}} 33 | a2 := SearchItem{Key: "A2", Data: &SymbolInfo{Symbol: "A2"}} 34 | gc_gd := SearchItem{Key: 
"GC/GD", Data: &SymbolInfo{Symbol: "GC/GD"}} 35 | 36 | var searchList SearchList 37 | searchList = append(searchList, &ag) 38 | searchList = append(searchList, &an) 39 | searchList = append(searchList, &ap) 40 | searchList = append(searchList, &az) 41 | searchList = append(searchList, &a_1) 42 | searchList = append(searchList, &b1) 43 | searchList = append(searchList, &a_3) 44 | searchList = append(searchList, &a_2) 45 | searchList = append(searchList, &aa_1) 46 | searchList = append(searchList, &a2) 47 | searchList = append(searchList, &gc_gd) 48 | 49 | type TestData struct { 50 | Search string 51 | Result []SearchData 52 | Limit int 53 | PageSize int 54 | Sort func(i, j int, data interface{}) bool 55 | } 56 | 57 | data := []*TestData{ 58 | { 59 | Search: "AA", 60 | Result: []SearchData{aa_1.Data}, 61 | Limit: 100, 62 | PageSize: 10, 63 | Sort: sortFunc, 64 | }, 65 | { 66 | Search: "B", 67 | Result: []SearchData{b1.Data}, 68 | Limit: 100, 69 | PageSize: 10, 70 | Sort: nil, 71 | }, 72 | { 73 | Search: "A", 74 | Result: []SearchData{a_1.Data, a_2.Data, a_3.Data, a2.Data, aa_1.Data, ag.Data, an.Data, ap.Data, az.Data}, 75 | Limit: 100, 76 | PageSize: 10, 77 | Sort: sortFunc, 78 | }, 79 | { 80 | Search: "A", 81 | Result: []SearchData{a_1.Data, a_2.Data}, 82 | Limit: 100, 83 | PageSize: 2, 84 | Sort: sortFunc, 85 | }, 86 | { 87 | Search: "A", 88 | Result: []SearchData{a_1.Data, a_2.Data, a_3.Data, a2.Data, aa_1.Data, ag.Data, an.Data, ap.Data}, 89 | Limit: 100, 90 | PageSize: 8, 91 | Sort: sortFunc, 92 | }, 93 | { 94 | Search: "B", 95 | Result: []SearchData{b1.Data}, 96 | Limit: 100, 97 | PageSize: 2, 98 | Sort: sortFunc, 99 | }, 100 | { 101 | Search: "GC-GD", 102 | Result: []SearchData{gc_gd.Data}, 103 | Limit: 100, 104 | PageSize: 2, 105 | Sort: sortFunc, 106 | }, 107 | { 108 | Search: "GCGD", 109 | Result: []SearchData{gc_gd.Data}, 110 | Limit: 100, 111 | PageSize: 2, 112 | Sort: sortFunc, 113 | }, 114 | } 115 | 116 | for index, item := range data { 117 | 
searchIndex := NewSearchIndex(searchList, item.Limit, item.Sort, nil, true, nil, 10) 118 | 119 | result := searchIndex.Search(SearchParams{Text: item.Search, OutputSize: item.PageSize, Matching: Beginning}) 120 | 121 | if !reflect.DeepEqual(item.Result, result) { 122 | expected := "" 123 | for _, elem := range item.Result { 124 | expected += fmt.Sprintf("%v ", elem) 125 | } 126 | 127 | actual := "" 128 | for _, elem := range result { 129 | actual += fmt.Sprintf("%v ", elem) 130 | } 131 | t.Errorf("Test %d failed (TestSearchIndex).\nExpected: %v\nActual: %v", index+1, expected, actual) 132 | } 133 | } 134 | 135 | } 136 | -------------------------------------------------------------------------------- /search_index.go: -------------------------------------------------------------------------------- 1 | package searchindex 2 | 3 | import ( 4 | "reflect" 5 | "regexp" 6 | s "sort" 7 | "strings" 8 | "sync" 9 | "unicode" 10 | 11 | "github.com/iancoleman/orderedmap" 12 | "golang.org/x/text/runes" 13 | "golang.org/x/text/transform" 14 | "golang.org/x/text/unicode/norm" 15 | ) 16 | 17 | type SearchIndexInterface interface { 18 | AppendData(data SearchList) 19 | Search(params SearchParams) []SearchData 20 | } 21 | 22 | type SearchIndex struct { 23 | SearchIndexInterface 24 | index Index 25 | limit int 26 | preprocessFunc func(key string, stopWords map[string]bool) []string 27 | sortFunc func(i, j int, data interface{}) bool 28 | indexParts bool 29 | stopWords map[string]bool 30 | appendThreadsCount uint 31 | } 32 | 33 | type Index struct { 34 | children *orderedmap.OrderedMap 35 | key string 36 | data SearchList 37 | } 38 | 39 | const ( 40 | Strict = iota 41 | Beginning = iota 42 | ) 43 | 44 | type SearchParams struct { 45 | Text string 46 | OutputSize int 47 | Matching int 48 | StartValues []SearchData 49 | } 50 | 51 | type SearchData interface{} 52 | 53 | type SearchItem struct { 54 | Key string 55 | Data SearchData 56 | } 57 | type SearchList []*SearchItem 58 | 59 | func 
defaultSortFunc(i, j int, data interface{}) bool {
	// Default ordering used when NewSearchIndex receives no sort function:
	// ascending by Key. data must hold a SearchList.
	list := data.(SearchList)
	return list[i].Key < list[j].Key
}

// Regular expressions used by defaultPreprocessFunc. They are compiled once
// here instead of on every call.
var (
	// punctuationRe matches the punctuation characters stripped from keys.
	punctuationRe = regexp.MustCompile("[`'\".,:;\\?!+\\-–*=<>_~@#№$%^&()|/\\\\]")
	// whitespaceRe matches any run of whitespace.
	whitespaceRe = regexp.MustCompile("\\s+")
)

// defaultPreprocessFunc normalises key and splits it into words.
//
// Steps, in order: punctuation is removed, whitespace runs are collapsed to a
// single space, leading/trailing spaces are trimmed, the text is lower-cased,
// combining marks are stripped (so "são" becomes "sao"), the text is split on
// spaces, and every word present in stopWords is dropped.
//
// An input that normalises to the empty string yields a single empty word
// (unless "" is itself a stop word), because strings.Split("", " ") returns
// one empty element.
func defaultPreprocessFunc(key string, stopWords map[string]bool) []string {
	// Punctuation is removed rather than replaced by a space, so that the
	// searches "BTCUSD" and "BTC-USD" both resolve to the key "BTC/USD".
	processed := punctuationRe.ReplaceAllString(key, "")

	processed = whitespaceRe.ReplaceAllString(processed, " ")
	processed = strings.Trim(processed, " ")
	processed = strings.ToLower(processed)

	// Decompose, drop non-spacing marks, recompose. A new chain is built per
	// call because this function is invoked from several goroutines.
	// If the transformation fails, the un-folded text is kept.
	t := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
	if folded, _, err := transform.String(t, processed); err == nil {
		processed = folded
	}

	parts := strings.Split(processed, " ")

	// Presence in the map is what marks a stop word; the bool value is not read.
	var result []string
	for _, part := range parts {
		if _, ok := stopWords[part]; !ok {
			result = append(result, part)
		}
	}

	return result
}

// NewSearchIndex builds an index and fills it with data.
//
//   - limit is the largest number of results Search returns; it is also used
//     when SearchParams.OutputSize is not positive.
//   - sort orders the preprocessed items before insertion; nil selects
//     ordering by key.
//   - preprocess turns a key into words; nil selects defaultPreprocessFunc.
//   - indexParts, when true, additionally indexes every word-suffix of a key
//     ("a b c", "b c", "c"); when false only the full key is indexed.
//   - stopWords are run through the preprocess function themselves and the
//     resulting words are excluded from keys and queries.
//   - appendThreadsCount is the number of goroutines used to preprocess keys
//     in AppendData; 0 is treated as 1.
func NewSearchIndex(
	data SearchList,
	limit int,
	sort func(i, j int, data interface{}) bool,
	preprocess func(key string, stopWords map[string]bool) []string,
	indexParts bool,
	stopWords []string,
	appendThreadsCount uint,
) SearchIndexInterface {
	preprocessFunc := preprocess
	if preprocessFunc == nil {
		preprocessFunc = defaultPreprocessFunc
	}

	sortFunc := defaultSortFunc
	if sort != nil {
		sortFunc = sort
	}

	// Normalise stop words with the same function as keys so they can be
	// compared against preprocessed words.
	sw := make(map[string]bool)
	for _, word := range stopWords {
		for _, part := range preprocessFunc(word, make(map[string]bool)) {
			sw[part] = true
		}
	}

	searchIndex := &SearchIndex{
		index: Index{
			children: orderedmap.New(),
		},
		limit:              limit,
		preprocessFunc:     preprocessFunc,
		sortFunc:           sortFunc,
		indexParts:         indexParts,
		stopWords:          sw,
		appendThreadsCount: appendThreadsCount,
	}
	searchIndex.AppendData(data)

	return searchIndex
}

// AppendData preprocesses, sorts and inserts data into the index.
//
// Items are copied before their keys are rewritten, so the caller's
// SearchItem values are not modified; nil entries are skipped. Items whose
// key already exists in the index are appended after the items stored
// earlier. No locking is performed here.
func (c *SearchIndex) AppendData(data SearchList) {
	copied := copyOriginalData(data)
	if len(copied) == 0 {
		return
	}

	// Never start more workers than there are items; 0 means one worker.
	workers := len(copied)
	if c.appendThreadsCount == 0 {
		workers = 1
	} else if c.appendThreadsCount < uint(workers) {
		workers = int(c.appendThreadsCount)
	}

	// Preprocess keys in parallel. Each worker writes only to the slot of the
	// item it is handling, so the flattened result keeps the input order no
	// matter which goroutine finishes first.
	expanded := make([]SearchList, len(copied))
	jobs := make(chan int, len(copied))
	for i := range copied {
		jobs <- i
	}
	close(jobs)

	var wg sync.WaitGroup
	for w := 0; w < workers; w++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for i := range jobs {
				expanded[i] = c.expandKeys(copied[i])
			}
		}()
	}
	wg.Wait()

	var preprocessed SearchList
	for _, items := range expanded {
		preprocessed = append(preprocessed, items...)
	}

	// Stable sort: items that compare equal keep their input order.
	s.SliceStable(preprocessed, func(i, j int) bool {
		return c.sortFunc(i, j, preprocessed)
	})

	// Group by key, keeping the order in which keys first appear after sorting.
	groups := orderedmap.New()
	for _, item := range preprocessed {
		if current, ok := groups.Get(item.Key); ok {
			groups.Set(item.Key, append(current.(SearchList), item))
		} else {
			groups.Set(item.Key, SearchList{item})
		}
	}

	for _, key := range groups.Keys() {
		group, _ := groups.Get(key)
		addToIndex(&c.index, key, key, group.(SearchList))
	}
}

// expandKeys runs the preprocess function on item.Key and returns one copy of
// item per key to index: only the full preprocessed key when indexParts is
// false, otherwise every word-suffix of it.
func (c *SearchIndex) expandKeys(item *SearchItem) SearchList {
	parts := c.preprocessFunc(item.Key, c.stopWords)

	var expanded SearchList
	for j := range parts {
		clone := *item
		clone.Key = strings.Join(parts[j:], " ")
		expanded = append(expanded, &clone)
		if !c.indexParts {
			break
		}
	}
	return expanded
}

// copyOriginalData returns shallow copies of the non-nil items in data
// (Key is copied, Data still refers to the same value).
func copyOriginalData(data SearchList) SearchList {
	copied := make(SearchList, 0, len(data))
	for _, item := range data {
		if item == nil {
			continue
		}
		clone := *item
		copied = append(copied, &clone)
	}
	return copied
}

// addToIndex walks (creating nodes as needed) the path spelled by keyTail
// below index and stores data on the final node under key.
//
// The path is keyed by single bytes of keyTail, not by runes. Data is appended
// to whatever the node already holds, so repeated insertions under the same
// key accumulate instead of replacing each other.
func addToIndex(index *Index, keyTail string, key string, data SearchList) {
	node := index
	for i := 0; i < len(keyTail); i++ {
		step := keyTail[i : i+1]
		child, ok := node.children.Get(step)
		if !ok {
			child = &Index{
				children: orderedmap.New(),
			}
			node.children.Set(step, child)
		}
		node = child.(*Index)
	}
	node.key = key
	node.data = append(node.data, data...)
}

// Search looks up params.Text and returns the Data of the matching items.
//
// The result starts with params.StartValues, followed by matches that are not
// already among them. The total length is capped by params.OutputSize, or by
// the index limit when OutputSize is not positive or exceeds the limit. With
// Matching == Beginning every key starting with the preprocessed text matches;
// otherwise only the exact key does.
func (c *SearchIndex) Search(params SearchParams) []SearchData {
	outputSize := params.OutputSize
	if outputSize <= 0 || outputSize > c.limit {
		outputSize = c.limit
	}

	start := make(map[uintptr]bool, len(params.StartValues))
	for _, item := range params.StartValues {
		if ptr, ok := dataPointer(item); ok {
			start[ptr] = true
		}
	}

	// Room left after the start values. It must not drop below zero: a
	// negative size is treated by searchList as "no cap". A negative
	// outputSize (negative index limit) is passed through unchanged.
	remaining := outputSize
	if outputSize >= 0 {
		remaining = outputSize - len(params.StartValues)
		if remaining < 0 {
			remaining = 0
		}
	}

	data := c.searchInIndex(
		&c.index,
		strings.Join(c.preprocessFunc(params.Text, c.stopWords), " "),
		params.Matching,
		remaining,
		start,
	)

	result := make([]SearchData, 0, len(params.StartValues)+len(data))
	result = append(result, params.StartValues...)
	result = append(result, data...)

	return result
}

// searchInIndex descends from index along the bytes of key and collects the
// data found at (and, for Beginning matching, below) the node reached. It
// returns an empty, non-nil slice when the path does not exist.
func (c *SearchIndex) searchInIndex(index *Index, key string, matching int, outputSize int, start map[uintptr]bool) []SearchData {
	node := index
	for i := 0; i < len(key); i++ {
		child, ok := node.children.Get(key[i : i+1])
		if !ok {
			return make([]SearchData, 0)
		}
		node = child.(*Index)
	}

	found := make(map[uintptr]bool)
	searched := c.searchList(node, make(SearchList, 0), matching, outputSize, found, start)
	return c.getData(searched)
}

// searchList appends to items the entries stored on index and, when matching
// is Beginning, on all of its descendants in insertion order.
//
// outputSize == 0 collects nothing, a positive outputSize caps len(items), and
// a negative outputSize means no cap. Entries whose Data pointer is already in
// found or start are skipped; Data values that have no pointer identity are
// always added.
func (c *SearchIndex) searchList(index *Index, items SearchList, matching int, outputSize int, found map[uintptr]bool, start map[uintptr]bool) SearchList {
	if outputSize == 0 || (outputSize > 0 && len(items) >= outputSize) {
		return items
	}

	for _, item := range index.data {
		if ptr, ok := dataPointer(item.Data); ok {
			// The same Data can be stored under several keys; report it once.
			if found[ptr] || start[ptr] {
				continue
			}
			found[ptr] = true
		}
		items = append(items, item)
		if outputSize > 0 && len(items) >= outputSize {
			return items
		}
	}

	if matching != Beginning {
		return items
	}

	for _, key := range index.children.Keys() {
		child, _ := index.children.Get(key)
		items = c.searchList(child.(*Index), items, matching, outputSize, found, start)
		if outputSize > 0 && len(items) >= outputSize {
			break
		}
	}
	return items
}

// dataPointer returns the address behind v for de-duplication. ok is false
// when v is nil or of a kind for which reflect.Value.Pointer would panic.
func dataPointer(v SearchData) (ptr uintptr, ok bool) {
	rv := reflect.ValueOf(v)
	switch rv.Kind() {
	case reflect.Chan, reflect.Func, reflect.Map, reflect.Ptr, reflect.Slice, reflect.UnsafePointer:
		return rv.Pointer(), true
	}
	return 0, false
}

// getData extracts the Data field of every item, preserving order.
func (c *SearchIndex) getData(data SearchList) []SearchData {
	result := make([]SearchData, len(data))
	for i, item := range data {
		result[i] = item.Data
	}
	return result
}
--------------------------------------------------------------------------------