├── .travis.yml ├── README.md ├── LICENSE └── gtrie ├── gtrie_test.go └── gtrie.go /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | About 2 | ===== 3 | 4 | This library implements tries, also known as prefix trees, using minimal acyclic 5 | finite-state automata for the Go programming language (http://golang.org/). 6 | 7 | The implementation is based on [Jan Daciuk, Stoyan Mihov, Bruce W. Watson, 8 | Richard E. Watson (2000)](http://goo.gl/0XLPo). "Incremental Construction of 9 | Minimal Acyclic Finite-State Automata". Computational Linguistics: March 2000, 10 | Vol. 26, No. 1, Pages 3-16. 11 | 12 | The javascript equivalent of this library can be found at 13 | [MathieuTurcotte/node-trie](https://github.com/MathieuTurcotte/node-trie). 14 | 15 | Installing 16 | ========== 17 | 18 | $ go get github.com/MathieuTurcotte/go-trie/gtrie 19 | 20 | Documentation 21 | ============= 22 | 23 | Read it [online](http://go.pkgdoc.org/github.com/MathieuTurcotte/go-trie/gtrie) or run 24 | 25 | $ go doc github.com/MathieuTurcotte/go-trie/gtrie 26 | 27 | License 28 | ======= 29 | 30 | This code is free to use under the terms of the [MIT license](http://mturcotte.mit-license.org/). 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2013 Mathieu Turcotte 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /gtrie/gtrie_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013 Mathieu Turcotte 2 | // Licensed under the MIT license. 3 | 4 | package gtrie_test 5 | 6 | import ( 7 | "bufio" 8 | "github.com/MathieuTurcotte/go-trie/gtrie" 9 | "io" 10 | "log" 11 | "os" 12 | "strings" 13 | "testing" 14 | ) 15 | 16 | func TestCreateUnsortedWords(t *testing.T) { 17 | _, err := gtrie.Create([]string{"ab", "ef", "cd"}) 18 | 19 | if err == nil { 20 | t.Errorf("expected error when creating trie from unsorted words") 21 | } 22 | } 23 | 24 | func TestTrie(t *testing.T) { 25 | words := []string{"abfg", "acfg", "adfg"} 26 | missings := []string{"", "foo", "été", "adfgg", "adf"} 27 | 28 | trie, err := gtrie.Create(words) 29 | if err != nil { 30 | log.Fatal(err) 31 | } else if trie == nil { 32 | log.Fatal("returned trie was nil") 33 | } 34 | 35 | // Ensure that stored words are accepted. 36 | for _, word := range words { 37 | if !trie.Accepts(word) { 38 | t.Errorf("expected %s to be accepted", word) 39 | } 40 | } 41 | 42 | // Ensure that missings words aren't accepted. 43 | for _, word := range missings { 44 | if trie.Accepts(word) { 45 | t.Errorf("expected %s to be rejected", word) 46 | } 47 | } 48 | 49 | // Ensure that the graph is minimal by counting the number of nodes. 50 | size := gtrie.Size(trie) 51 | if size != 5 { 52 | t.Errorf("expected size of 5 but got %s", size) 53 | } 54 | } 55 | 56 | // Test behavior with a large dictionary. 57 | func TestAccept(t *testing.T) { 58 | words := readWords("words.txt") 59 | trie, err := gtrie.Create(readWords("words.txt")) 60 | if err != nil { 61 | log.Fatal(err) 62 | } 63 | 64 | for _, word := range words { 65 | if !trie.Accepts(word) { 66 | t.Error(word) 67 | } 68 | } 69 | } 70 | 71 | func BenchmarkAccepts(b *testing.B) { 72 | b.StopTimer() 73 | 74 | words := []string{"evropenescului", "simulantilor", "zburdalniciilor"} 75 | trie, err := gtrie.Create(readWords("words.txt")) 76 | if err != nil { 77 | b.Fatal(err) 78 | } 79 | 80 | b.StartTimer() 81 | 82 | for i := 0; i < b.N; i++ { 83 | for _, word := range words { 84 | if !trie.Accepts(word) { 85 | b.Fatal(word) 86 | } 87 | } 88 | } 89 | } 90 | 91 | func readWords(filename string) (words []string) { 92 | file, err := os.Open(filename) 93 | if err != nil { 94 | log.Fatal(err) 95 | } 96 | defer file.Close() 97 | 98 | reader := bufio.NewReader(file) 99 | 100 | for { 101 | word, rerr := reader.ReadString('\n') 102 | if rerr != nil { 103 | if rerr == io.EOF { 104 | break 105 | } else { 106 | log.Fatal(err) 107 | } 108 | } 109 | words = append(words, strings.TrimSpace(word)) 110 | } 111 | return 112 | } 113 | -------------------------------------------------------------------------------- /gtrie/gtrie.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013 Mathieu Turcotte 2 | // Licensed under the MIT license. 3 | 4 | // Package gtrie provides a trie implementation based on a minimal acyclic 5 | // finite-state automaton. 6 | package gtrie 7 | 8 | import ( 9 | "errors" 10 | "sort" 11 | "strconv" 12 | "strings" 13 | ) 14 | 15 | type nodeId int 16 | 17 | type nodeIdGen struct { 18 | id nodeId 19 | } 20 | 21 | func (g *nodeIdGen) next() (next nodeId) { 22 | next = g.id 23 | g.id++ 24 | return 25 | } 26 | 27 | // Represents a transition in the acyclic finite-state automaton. Each 28 | // transition has one label and leads to one node. 29 | type Transition struct { 30 | Child *Node 31 | Label rune 32 | } 33 | 34 | // Represents a node in the acyclic finite-state automaton. 35 | type Node struct { 36 | id nodeId 37 | Terminal bool 38 | Transitions []Transition 39 | } 40 | 41 | // Checks whether the node has children. 42 | func (n *Node) HasChildren() bool { 43 | return len(n.Transitions) > 0 44 | } 45 | 46 | // Checks whether the node has a child for the given letter. 47 | func (n *Node) HasChild(letter rune) bool { 48 | return n.GetChild(letter) != nil 49 | } 50 | 51 | // Retrieves the child for the given letter. Returns nil if there is no child 52 | // for this letter. 53 | func (n *Node) GetChild(letter rune) (child *Node) { 54 | transitions := n.Transitions 55 | finder := func(i int) bool { return transitions[i].Label >= letter } 56 | // It is possible to use a binary search here because we know, by 57 | // construction, that the transitions are sorted by their labels. 58 | index := sort.Search(len(transitions), finder) 59 | if index < len(transitions) && transitions[index].Label == letter { 60 | child = transitions[index].Child 61 | } 62 | return 63 | } 64 | 65 | // Whether the node recognizes the given suffix. A suffix is accepted if there 66 | // exists a path from the current node to a final node labeled with the suffix 67 | // elements. 68 | func (n *Node) Accepts(suffix string) bool { 69 | letters := []rune(suffix) 70 | current := n 71 | for i := 0; current != nil && i < len(letters); i++ { 72 | current = current.GetChild(letters[i]) 73 | } 74 | return current != nil && current.Terminal 75 | } 76 | 77 | // Gets the number of nodes in the given automaton. 78 | func Size(node *Node) int { 79 | ids := make(map[nodeId]bool) 80 | queue := []*Node{node} 81 | for len(queue) > 0 { 82 | node = queue[0] 83 | queue = queue[1:] 84 | ids[node.id] = true 85 | for _, t := range node.Transitions { 86 | queue = append(queue, t.Child) 87 | } 88 | } 89 | return len(ids) 90 | } 91 | 92 | func newNode(idGen *nodeIdGen) *Node { 93 | return &Node{id: idGen.next()} 94 | } 95 | 96 | func addTransition(node *Node, child *Node, letter rune) { 97 | node.Transitions = append(node.Transitions, Transition{child, letter}) 98 | } 99 | 100 | func addChild(node *Node, letter rune, idGen *nodeIdGen) (child *Node) { 101 | child = node.GetChild(letter) 102 | if child == nil { 103 | child = newNode(idGen) 104 | addTransition(node, child, letter) 105 | } 106 | return 107 | } 108 | 109 | func getLastChild(node *Node) *Node { 110 | t := node.Transitions 111 | return t[len(t)-1].Child 112 | } 113 | 114 | func setLastChild(node *Node, last *Node) { 115 | t := node.Transitions 116 | t[len(t)-1].Child = last 117 | } 118 | 119 | type eqClass struct { 120 | terminal bool 121 | children string 122 | } 123 | 124 | // Obtains the equivalence class for this node, knowing that two nodes p and 125 | // q belongs to the same class if and only if: 126 | // 1. they are either both final or both nonfinal; and 127 | // 2. they have the same number of outgoing transitions; and 128 | // 3. corresponding outgoing transitions have the same labels; and 129 | // 4. corresponding transitions lead to the same states. 130 | func getEquivalenceClass(node *Node) (class eqClass) { 131 | children := []string{} 132 | for _, t := range node.Transitions { 133 | child := string(t.Label) + ":" + strconv.Itoa(int(t.Child.id)) 134 | children = append(children, child) 135 | } 136 | class.children = strings.Join(children, ";") 137 | class.terminal = node.Terminal 138 | return 139 | } 140 | 141 | type registry struct { 142 | // Mapping from equivalence class to node. 143 | eqv map[eqClass]*Node 144 | // Set of nodes that are registered. 145 | nodes map[*Node]bool 146 | } 147 | 148 | func newRegistery() (reg *registry) { 149 | reg = new(registry) 150 | reg.eqv = make(map[eqClass]*Node) 151 | reg.nodes = make(map[*Node]bool) 152 | return 153 | } 154 | 155 | func (r *registry) find(class eqClass) *Node { 156 | return r.eqv[class] 157 | } 158 | 159 | func (r *registry) register(class eqClass, node *Node) { 160 | r.eqv[class] = node 161 | r.nodes[node] = true 162 | } 163 | 164 | func (r *registry) registered(node *Node) bool { 165 | return r.nodes[node] 166 | } 167 | 168 | // Creates an acyclic finite-state automaton from a sorted list of words and 169 | // returns the root node. Words can contain any unicode chararcters. An error 170 | // will be returned if the list of words is not lexicographically sorted. 171 | func Create(words []string) (automaton *Node, err error) { 172 | reg := newRegistery() 173 | idGen := new(nodeIdGen) 174 | automaton = newNode(idGen) 175 | 176 | if !sort.StringsAreSorted(words) { 177 | err = errors.New("the words are not sorted") 178 | return 179 | } 180 | 181 | for _, word := range words { 182 | insertWord(word, automaton, reg, idGen) 183 | } 184 | 185 | replaceOrRegister(automaton, reg) 186 | return 187 | } 188 | 189 | func insertWord(word string, automaton *Node, reg *registry, idGen *nodeIdGen) { 190 | letters := []rune(word) 191 | var last *Node 192 | 193 | if len(letters) == 0 { 194 | return 195 | } 196 | 197 | // Find last common state. 198 | for current := automaton; current != nil && len(letters) > 0; { 199 | last = current 200 | current = last.GetChild(letters[0]) 201 | if current != nil { 202 | letters = letters[1:] 203 | } 204 | } 205 | 206 | // Minimize. 207 | if last.HasChildren() { 208 | replaceOrRegister(last, reg) 209 | } 210 | 211 | // Add suffix. 212 | for len(letters) > 0 { 213 | last = addChild(last, letters[0], idGen) 214 | letters = letters[1:] 215 | } 216 | 217 | last.Terminal = true 218 | } 219 | 220 | func replaceOrRegister(node *Node, reg *registry) { 221 | var child = getLastChild(node) 222 | 223 | if reg.registered(child) { 224 | return 225 | } 226 | 227 | if child.HasChildren() { 228 | replaceOrRegister(child, reg) 229 | } 230 | 231 | class := getEquivalenceClass(child) 232 | 233 | if eq := reg.find(class); eq != nil { 234 | setLastChild(node, eq) 235 | } else { 236 | reg.register(class, child) 237 | } 238 | } 239 | --------------------------------------------------------------------------------