├── README.md ├── flash.go ├── flash_test.go ├── ico.png └── trie └── trie.go /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Flash 3 | 4 | ![](https://raw.githubusercontent.com/dav009/flash/master/ico.png) 5 | 6 | Fast keyword extraction using the [Aho–Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm) and tries. 7 | 8 | Flash is a Golang reimplementation of [Flashtext](https://github.com/vi3k6i5/flashtext). 9 | 10 | This is meant to be used when you have a large number of words that you want to: 11 | - extract from text 12 | - search and replace 13 | 14 | Flash is meant as a replacement for regular expressions, which in such cases can be extremely slow. 15 | 16 | ## Usage 17 | 18 | ```go 19 | 20 | import "github.com/dav009/flash" 21 | 22 | words := flash.NewKeywords() 23 | words.Add("New York") 24 | words.Add("Hello") 25 | words.Add("Tokyo") 26 | foundKeywords := words.Extract("New York and Tokyo are Cities") 27 | fmt.Println(foundKeywords) 28 | // [New York, Tokyo] 29 | ``` 30 | 31 | 32 | ## Benchmarks 33 | 34 | As a reference, running go-flash with 10K keywords over a 1000-sentence text took 7.3ms, 35 | while the same search using regexes took 1 minute 37s. 36 | 37 | 38 | | Sentences | Keywords | String.Contains | Regex | Go-Flash | 39 | |-----------|----------|-----------------|----------|----------| 40 | | 1000 | 10K | 1.0035s | 1min 37s | 2.72ms | 41 | 42 | 43 | ## Warning 44 | 45 | This is a toy project for me to get more familiar with Golang. 46 | Please be aware of potential issues. 
47 | 48 | 49 | -------------------------------------------------------------------------------- /flash.go: -------------------------------------------------------------------------------- 1 | package flash 2 | 3 | import "github.com/dav009/flash/trie" 4 | 5 | func extractKeywords(t *trie.Trie, sentence string) []string { 6 | matches := make([]string, 0) 7 | currentTrie := t 8 | //sequence_end_pos := 0 9 | idx := 0 10 | 11 | sentenceLen := len(sentence) 12 | 13 | for idx < sentenceLen { 14 | 15 | char := string(sentence[idx]) 16 | //fmt.Println(string(char)) 17 | // it is a boundary char (i.e: space) 18 | if isWordBoundarie(trie.Character(char)) { 19 | 20 | idx2, longestSequenceFound := checkIfMatch(currentTrie, sentence, idx) 21 | 22 | idx = idx2 23 | if longestSequenceFound != "" { 24 | 25 | matches = append(matches, longestSequenceFound) 26 | 27 | } 28 | currentTrie = t 29 | 30 | } else if insideTrie, _ := currentTrie.Retrieve(trie.Character(char)); insideTrie != nil { 31 | 32 | // if it is indexed in the current trie 33 | 34 | currentTrie = insideTrie 35 | 36 | } else { 37 | 38 | // if it is not index in the currrent trie 39 | 40 | currentTrie = t 41 | 42 | idy := idx + 1 43 | 44 | for idy < sentenceLen { 45 | 46 | char := sentence[idy] 47 | if isWordBoundarie(trie.Character(char)) { 48 | break 49 | } 50 | idy += 1 51 | } 52 | idx = idy 53 | } 54 | if idx+1 >= sentenceLen { 55 | if currentTrie.IsKeyword() { 56 | matches = append(matches, currentTrie.IndexedWord) 57 | } 58 | } 59 | 60 | idx += 1 61 | } 62 | 63 | return matches 64 | 65 | } 66 | 67 | type Keywords struct { 68 | t *trie.Trie 69 | } 70 | 71 | func NewKeywords() Keywords { 72 | return Keywords{trie.NewTrie()} 73 | } 74 | 75 | func (x Keywords) Extract(sentence string) []string { 76 | return extractKeywords(x.t, sentence) 77 | } 78 | 79 | func (x Keywords) Add(w string) { 80 | x.t.Index(trie.Keyword(w)) 81 | } 82 | 83 | func isWordBoundarie(c trie.Character) bool { 84 | return c == "" || c == " " || c 
== "\t" || c == "\n" 85 | } 86 | 87 | func checkIfMatch(t *trie.Trie, sentence string, idx int) (int, string) { 88 | char := sentence[idx] 89 | sequenceFound := "" 90 | longestSequenceFound := "" 91 | if t.IsKeyword() { 92 | sequenceFound = t.IndexedWord 93 | longestSequenceFound = t.IndexedWord 94 | } 95 | if t.IsCharIn(trie.Character(char)) { 96 | // look for longest sequence from here 97 | nextTrie, _ := t.Retrieve(trie.Character(char)) 98 | seqFound, sequenceEndpos := searchLongest(sentence, idx, nextTrie) 99 | longestSequenceFound = seqFound 100 | 101 | if longestSequenceFound != "" && sequenceFound != longestSequenceFound { 102 | idx = sequenceEndpos 103 | 104 | } 105 | } 106 | return idx, longestSequenceFound 107 | } 108 | 109 | func searchLongest(sentence string, idx int, t *trie.Trie) (string, int) { 110 | longestSequenceFound := "" 111 | sequenceEndpos := -1 112 | sentenceLen := len(sentence) 113 | idx = idx + 1 114 | currentTrie := t 115 | for idx < sentenceLen { 116 | char := sentence[idx] 117 | if (isWordBoundarie(trie.Character(char))) && currentTrie.IsKeyword() { 118 | longestSequenceFound = currentTrie.IndexedWord 119 | sequenceEndpos = idx 120 | } 121 | if trie, _ := currentTrie.Retrieve(trie.Character(char)); trie != nil { 122 | // if it is indexed in the current trie 123 | 124 | currentTrie = trie 125 | } else { 126 | break 127 | } 128 | idx = idx + 1 129 | } 130 | 131 | if idx >= sentenceLen { 132 | 133 | if currentTrie.IsKeyword() { 134 | longestSequenceFound = currentTrie.IndexedWord 135 | sequenceEndpos = idx 136 | } 137 | } 138 | 139 | return longestSequenceFound, sequenceEndpos 140 | 141 | } 142 | -------------------------------------------------------------------------------- /flash_test.go: -------------------------------------------------------------------------------- 1 | package flash 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestDummy(t *testing.T) { 8 | words := NewKeywords() 9 | words.Add("New York") 10 | words.Add("Hello") 
// trie/trie.go (package trie): a character trie used to index keywords.

// Character is the label of a single trie edge. Index creates one edge per
// rune of the keyword.
type Character string

// Keyword is a word or phrase stored in a Trie.
type Keyword string

// ErrNotFound is returned by Retrieve when a node has no child for the
// requested character. It is a shared value so that a failed lookup does not
// allocate.
var ErrNotFound = errors.New("no item in trie")

// Trie is a node of a character trie.
//
// IndexedWord holds the complete keyword that ends at this node, or "" when
// no keyword ends here.
type Trie struct {
	s           map[Character]*Trie
	IndexedWord string
}

// NewTrie returns an empty trie node.
func NewTrie() *Trie {
	return &Trie{s: make(map[Character]*Trie)}
}

// IsKeyword reports whether a keyword ends at t.
func (t *Trie) IsKeyword() bool {
	return t.IndexedWord != ""
}

// Retrieve returns the child of t reached through c. When there is no such
// child it returns nil and ErrNotFound.
func (t *Trie) Retrieve(c Character) (*Trie, error) {
	if child, ok := t.s[c]; ok {
		return child, nil
	}
	return nil, ErrNotFound
}

// Index stores word in the trie rooted at t, creating one node per rune as
// needed, and records word on the final node. It currently always returns
// nil.
func (t *Trie) Index(word Keyword) error {
	node := t
	for _, r := range word {
		c := Character(r)
		child, ok := node.s[c]
		if !ok {
			// A zero-value Trie has no map yet; create it on first use.
			if node.s == nil {
				node.s = make(map[Character]*Trie)
			}
			child = NewTrie()
			node.s[c] = child
		}
		node = child
	}
	node.IndexedWord = string(word)
	return nil
}

// IsCharIn reports whether t has a child for c.
func (t *Trie) IsCharIn(c Character) bool {
	_, ok := t.s[c]
	return ok
}