├── testfiles └── empty ├── init.go ├── .travis.yml ├── .gitignore ├── doc.go ├── LICENSE ├── README.md ├── trie.go ├── branch.go └── trie_test.go /testfiles/empty: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /init.go: -------------------------------------------------------------------------------- 1 | package trie 2 | 3 | const PADDING_CHAR = "-" 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: 4 | - 1.0 5 | - 1.1 6 | - 1.2 7 | 8 | install: 9 | - go get github.com/fvbock/uds-go/set -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | 24 | # tmp 25 | *# 26 | .#* -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Trie is a prefix index package for golang. 3 | 4 | Terminology used 5 | 6 | Trie - Contains the Root of the Index - which is a Branch 7 | 8 | Branch - A Branch might have a LeafValue or not and might have other Branches splitting off. It has a flag `End` that marks the end of a term that has been inserted. 9 | 10 | Entry - an entry refers to a _complete_ term that is inserted, removed from, or matched in the index. 
It requires `End` on the Branch to be set to `true`, which makes it different from a 11 | 12 | Prefix - which does not require the Branch to have End set to `true` to match. 13 | */ 14 | package trie 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Florian von Bock 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | trie 2 | ==== 3 | 4 | A Trie (Prefix Index) implementation in golang. It works fine with all unicode characters. 5 | 6 | Documentation can be found [at godoc.org](http://godoc.org/github.com/fvbock/trie). 
 7 | 8 | Entries are reference counted: If you `Add("foo")` twice and `Delete("foo")` once it will still be found. 9 | 10 | [![Build Status](https://travis-ci.org/fvbock/trie.png)](https://travis-ci.org/fvbock/trie) 11 | 12 | Example 13 | ======= 14 | 15 | t := trie.NewTrie() 16 | t.Add("foo") 17 | t.Add("bar") 18 | t.PrintDump() 19 | 20 | // output: 21 | // I:f (-) 22 | // - V:oo (1) 23 | // --- $ 24 | // I:b (-) 25 | // - V:ar (1) 26 | // --- $ 27 | 28 | t.Add("foo") 29 | t.PrintDump() 30 | 31 | // output: 32 | // I:f (-) 33 | // - V:oo (2) 34 | // --- $ 35 | // I:b (-) 36 | // - V:ar (1) 37 | // --- $ 38 | 39 | fmt.Println(t.Has("foo")) 40 | // output: true 41 | 42 | fmt.Println(t.HasCount("foo")) 43 | // output: true 2 44 | 45 | fmt.Println(t.Has("foobar")) 46 | // output: false 47 | 48 | fmt.Println(t.Members()) 49 | // output: [foo(2) bar(1)] 50 | 51 | t.Add("food") 52 | t.Add("foobar") 53 | t.Add("foot") 54 | fmt.Println(t.HasPrefix("foo")) 55 | // output: true 56 | 57 | fmt.Println(t.PrefixMembers("foo")) 58 | // output: [foo(2) food(1) foobar(1) foot(1)] 59 | 60 | 61 | A `Trie` can be dumped into a file with 62 | 63 | t.DumpToFile("/tmp/trie_foo") 64 | 65 | And loaded with 66 | 67 | t2, _ := trie.LoadFromFile("/tmp/trie_foo") 68 | fmt.Println(t2.Members()) 69 | // output: [foo(2) food(1) foobar(1) foot(1) bar(1)] 70 | 71 | An existing `Trie` can be merged with a stored one with 72 | 73 | t3 := trie.NewTrie() 74 | t3.Add("フー") 75 | t3.Add("バー") 76 | t3.Add("日本語") 77 | fmt.Println(t3.Members()) 78 | // output: [フー(1) バー(1) 日本語(1)] 79 | 80 | t3.MergeFromFile("/tmp/trie_foo") 81 | fmt.Println(t3.Members()) 82 | // output: [フー(1) バー(1) 日本語(1) foo(2) food(1) foobar(1) foot(1) bar(1)] 83 | -------------------------------------------------------------------------------- /trie.go: -------------------------------------------------------------------------------- 1 | package trie 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "encoding/gob" 7 | "errors" 8 | "fmt" 9 | 
"io" 10 | "log" 11 | "os" 12 | "time" 13 | ) 14 | 15 | type Trie struct { 16 | Root *Branch 17 | } 18 | 19 | /* 20 | NewTrie returns the pointer to a new Trie with an initialized root Branch 21 | */ 22 | func NewTrie() *Trie { 23 | t := &Trie{ 24 | Root: &Branch{ 25 | Branches: make(map[byte]*Branch), 26 | }, 27 | } 28 | return t 29 | } 30 | 31 | /* 32 | Add adds an entry to the trie and returns the branch node that the insertion 33 | was made at - or rather where the end of the entry was marked. 34 | */ 35 | func (t *Trie) Add(entry string) *Branch { 36 | t.Root.Lock() 37 | b := t.Root.add([]byte(entry)) 38 | t.Root.Unlock() 39 | return b 40 | } 41 | 42 | /* 43 | Delete decrements the count of an existing entry by one. If the count equals 44 | zero it removes an the entry from the trie. Returns true if the entry existed, 45 | false otherwise. Note that the return value says something about the previous 46 | existence of the entry - not whether it has been completely removed or just 47 | its count decremented. 48 | */ 49 | func (t *Trie) Delete(entry string) bool { 50 | if len(entry) == 0 { 51 | return false 52 | } 53 | t.Root.Lock() 54 | deleted := t.Root.delete([]byte(entry)) 55 | t.Root.Unlock() 56 | return deleted 57 | } 58 | 59 | /* 60 | GetBranch returns the branch end if the `entry` exists in the `Trie` 61 | */ 62 | func (t *Trie) GetBranch(entry string) *Branch { 63 | return t.Root.getBranch([]byte(entry)) 64 | } 65 | 66 | /* 67 | Has returns true if the `entry` exists in the `Trie` 68 | */ 69 | func (t *Trie) Has(entry string) bool { 70 | return t.Root.has([]byte(entry)) 71 | } 72 | 73 | /* 74 | HasCount returns true if the `entry` exists in the `Trie`. The second returned 75 | value is the count how often the entry has been set. 
76 | */ 77 | func (t *Trie) HasCount(entry string) (exists bool, count int64) { 78 | return t.Root.hasCount([]byte(entry)) 79 | } 80 | 81 | /* 82 | HasPrefix returns true if the the `Trie` contains entries with the given prefix 83 | */ 84 | func (t *Trie) HasPrefix(prefix string) bool { 85 | return t.Root.hasPrefix([]byte(prefix)) 86 | } 87 | 88 | /* 89 | HasPrefixCount returns true if the the `Trie` contains entries with the given 90 | prefix. The second returned value is the count how often the entry has been set. 91 | */ 92 | func (t *Trie) HasPrefixCount(prefix string) (exists bool, count int64) { 93 | return t.Root.hasPrefixCount([]byte(prefix)) 94 | } 95 | 96 | /* 97 | Members returns all entries of the Trie with their counts as MemberInfo 98 | */ 99 | func (t *Trie) Members() []*MemberInfo { 100 | return t.Root.members([]byte{}) 101 | } 102 | 103 | /* 104 | Members returns a Slice of all entries of the Trie 105 | */ 106 | func (t *Trie) MembersList() (members []string) { 107 | for _, mi := range t.Root.members([]byte{}) { 108 | members = append(members, mi.Value) 109 | } 110 | return 111 | } 112 | 113 | /* 114 | PrefixMembers returns all entries of the Trie that have the given prefix 115 | with their counts as MemberInfo 116 | */ 117 | func (t *Trie) PrefixMembers(prefix string) []*MemberInfo { 118 | return t.Root.prefixMembers([]byte{}, []byte(prefix)) 119 | } 120 | 121 | /* 122 | PrefixMembers returns a List of all entries of the Trie that have the 123 | given prefix 124 | */ 125 | func (t *Trie) PrefixMembersList(prefix string) (members []string) { 126 | for _, mi := range t.Root.prefixMembers([]byte{}, []byte(prefix)) { 127 | members = append(members, mi.Value) 128 | } 129 | return 130 | } 131 | 132 | /* 133 | Dump returns a string representation of the `Trie` 134 | */ 135 | func (t *Trie) Dump() string { 136 | return t.Root.Dump(0) 137 | } 138 | 139 | /* 140 | */ 141 | func (t *Trie) PrintDump() { 142 | t.Root.PrintDump() 143 | } 144 | 145 | /* 146 | 
DumpToFile dumps all values into a slice of strings and writes that to a file 147 | using encoding/gob. 148 | 149 | The Trie itself can currently not be encoded directly because gob does not 150 | directly support structs with a sync.Mutex on them. 151 | */ 152 | func (t *Trie) DumpToFile(fname string) (err error) { 153 | t.Root.Lock() 154 | entries := t.Members() 155 | t.Root.Unlock() 156 | 157 | buf := new(bytes.Buffer) 158 | enc := gob.NewEncoder(buf) 159 | if err = enc.Encode(entries); err != nil { 160 | err = errors.New(fmt.Sprintf("Could encode Trie entries for dump file: %v", err)) 161 | return 162 | } 163 | 164 | f, err := os.Create(fname) 165 | if err != nil { 166 | err = errors.New(fmt.Sprintf("Could not save dump file: %v", err)) 167 | return 168 | } 169 | defer f.Close() 170 | 171 | w := bufio.NewWriter(f) 172 | _, err = w.Write(buf.Bytes()) 173 | if err != nil { 174 | err = errors.New(fmt.Sprintf("Error writing to dump file: %v", err)) 175 | return 176 | } 177 | // log.Printf("wrote %d bytes to dumpfile %s\n", bl, fname) 178 | w.Flush() 179 | return 180 | } 181 | 182 | /* 183 | MergeFromFile loads a gib encoded wordlist from a file and Add() them to the `Trie`. 184 | */ 185 | // TODO: write tests for merge 186 | func (t *Trie) MergeFromFile(fname string) (err error) { 187 | entries, err := loadTrieFile(fname) 188 | if err != nil { 189 | return 190 | } 191 | log.Printf("Got %v entries\n", len(entries)) 192 | startTime := time.Now() 193 | for _, mi := range entries { 194 | b := t.GetBranch(mi.Value) 195 | if b != nil { 196 | b.Lock() 197 | b.Count += mi.Count 198 | b.Unlock() 199 | } else { 200 | b := t.Add(mi.Value) 201 | b.Lock() 202 | b.Count = mi.Count 203 | b.Unlock() 204 | } 205 | } 206 | log.Printf("merging words to index took: %v\n", time.Since(startTime)) 207 | return 208 | } 209 | 210 | /* 211 | LoadFromFile loads a gib encoded wordlist from a file and creates a new Trie 212 | by Add()ing all of them. 
213 | */ 214 | func LoadFromFile(fname string) (tr *Trie, err error) { 215 | tr = NewTrie() 216 | entries, err := loadTrieFile(fname) 217 | if err != nil { 218 | return 219 | } 220 | log.Printf("Got %v entries\n", len(entries)) 221 | startTime := time.Now() 222 | for _, mi := range entries { 223 | b := tr.Add(mi.Value) 224 | b.Count = mi.Count 225 | } 226 | log.Printf("adding words to index took: %v\n", time.Since(startTime)) 227 | 228 | return 229 | } 230 | 231 | func loadTrieFile(fname string) (entries []*MemberInfo, err error) { 232 | log.Println("Load trie from", fname) 233 | f, err := os.Open(fname) 234 | if err != nil { 235 | err = errors.New(fmt.Sprintf("Could not open Trie file: %v", err)) 236 | } else { 237 | defer f.Close() 238 | 239 | buf := bufio.NewReader(f) 240 | dec := gob.NewDecoder(buf) 241 | if err = dec.Decode(&entries); err != nil { 242 | if err == io.EOF && entries == nil { 243 | log.Println("Nothing to decode. Seems the file is empty.") 244 | err = nil 245 | } else { 246 | err = errors.New(fmt.Sprintf("Decoding error: %v", err)) 247 | return 248 | } 249 | } 250 | } 251 | 252 | return 253 | } 254 | -------------------------------------------------------------------------------- /branch.go: -------------------------------------------------------------------------------- 1 | package trie 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | "sync" 7 | ) 8 | 9 | type MemberInfo struct { 10 | Value string 11 | Count int64 12 | } 13 | 14 | func (m *MemberInfo) String() string { 15 | return fmt.Sprintf("%s(%v)", m.Value, m.Count) 16 | } 17 | 18 | type Branch struct { 19 | sync.RWMutex 20 | Branches map[byte]*Branch 21 | LeafValue []byte 22 | End bool 23 | Count int64 24 | } 25 | 26 | /* 27 | NewBranch returns a new initialezed *Branch 28 | */ 29 | func (b *Branch) NewBranch() *Branch { 30 | return &Branch{ 31 | Branches: make(map[byte]*Branch), 32 | Count: 0, 33 | } 34 | } 35 | 36 | /* 37 | Add adds an entry to the Branch 38 | */ 39 | func (b *Branch) 
add(entry []byte) (addedBranch *Branch) { 40 | if b.LeafValue == nil && len(b.Branches) == 0 { 41 | if len(entry) > 0 { 42 | b.LeafValue = entry 43 | } else { 44 | // something came in but we already have branches for it 45 | // so the tail was the current branches index but no value 46 | // to push. just mark the current idx position as End 47 | } 48 | b.setEnd(true) 49 | addedBranch = b 50 | return 51 | } 52 | 53 | // check the overlap between the current LeafValue and the new entry 54 | newLeaf := func(LeafValue, newEntry []byte) (leaf []byte) { 55 | for li, b := range LeafValue { 56 | if li > len(newEntry)-1 { 57 | break 58 | } 59 | if b == newEntry[li] { 60 | leaf = append(leaf, b) 61 | } else { 62 | break 63 | } 64 | } 65 | return 66 | }(b.LeafValue, entry) 67 | 68 | newLeafLen := len(newLeaf) 69 | 70 | // the new leaf is smaller than the current leaf. 71 | // we will push the old leaf down the branch 72 | if newLeafLen < len(b.LeafValue) { 73 | tail := b.LeafValue[newLeafLen:] 74 | idx := tail[0] 75 | newBranch := b.NewBranch() 76 | newBranch.LeafValue = tail[1:] 77 | 78 | b.LeafValue = newLeaf 79 | newBranch.Branches, b.Branches = b.Branches, newBranch.Branches 80 | newBranch.End, b.End = b.End, newBranch.End 81 | if newBranch.End { 82 | if b.Count > 0 { 83 | newBranch.Count = b.Count 84 | } else { 85 | newBranch.Count = 1 86 | } 87 | } else { 88 | newBranch.Count = 0 89 | } 90 | if b.End { 91 | b.Count = 1 92 | } else { 93 | b.Count = 0 94 | } 95 | b.Branches[idx] = newBranch 96 | } 97 | 98 | // new leaf is smaller than the entry, which means there will be more stuff 99 | // that we need to push down 100 | if newLeafLen < len(entry) { 101 | tail := entry[newLeafLen:] 102 | idx := tail[0] 103 | 104 | // create new branch at idx if it does not exists yet 105 | if _, notPresent := b.Branches[idx]; !notPresent { 106 | b.Branches[idx] = b.NewBranch() 107 | } 108 | // check whether the idx itself marks an End $. 
if so add a new idx 109 | addedBranch = b.Branches[idx].add(tail[1:]) 110 | } else { 111 | // if there is nothing else to be pushed down we just have to mark the 112 | // current branch as an end. this happens when you add a value that already 113 | // is covered by the index but this particular end had not been marked. 114 | // eg. you already have 'food' and 'foot' (shared LeafValue of 'foo') in 115 | // your index and now add 'foo'. 116 | b.setEnd(true) 117 | addedBranch = b 118 | } 119 | return addedBranch 120 | } 121 | 122 | /* 123 | Members returns slice of all Members of the Branch prepended with `branchPrefix` 124 | */ 125 | func (b *Branch) members(branchPrefix []byte) (members []*MemberInfo) { 126 | if b.End { 127 | members = append(members, &MemberInfo{string(append(branchPrefix, b.LeafValue...)), b.Count}) 128 | } 129 | for idx, br := range b.Branches { 130 | newPrefix := append(append(branchPrefix, b.LeafValue...), idx) 131 | members = append(members, br.members(newPrefix)...) 132 | } 133 | return 134 | } 135 | 136 | /* 137 | prefixMembers returns a slice of all Members of the Branch matching the given prefix. The values returned are prepended with `branchPrefix` 138 | */ 139 | func (b *Branch) prefixMembers(branchPrefix []byte, searchPrefix []byte) (members []*MemberInfo) { 140 | exists, br, matchedPrefix := b.hasPrefixBranch(searchPrefix) 141 | if exists { 142 | members = br.members(matchedPrefix) 143 | } 144 | return 145 | } 146 | 147 | // func (b *Branch) prefixMembers(branchPrefix []byte, searchPrefix []byte) (members []*MemberInfo) { 148 | // leafLen := len(b.LeafValue) 149 | // searchPrefixLen := len(searchPrefix) 150 | 151 | // // if the searchPrefix is empty we want all members 152 | // if searchPrefixLen == 0 { 153 | // members = append(members, b.members(branchPrefix)...) 
154 | // return 155 | // } 156 | 157 | // // if the searchPrefix is shorter than the leaf we will add the LeafValue 158 | // // if it is an End and a the searchPrefix matches 159 | // // if searchPrefixLen < leafLen { 160 | // if searchPrefixLen > leafLen { 161 | // for idx, br := range b.Branches { 162 | // // does it match the next byte? 163 | // if idx == searchPrefix[leafLen] { 164 | // newSearchPrefix := searchPrefix[leafLen+1:] 165 | // members = append(members, br.prefixMembers(append(append(branchPrefix, b.LeafValue...), idx), newSearchPrefix)...) 166 | // } 167 | // } 168 | // } else if searchPrefixLen == leafLen { 169 | // for i, sb := range searchPrefix { 170 | // if sb != b.LeafValue[i] { 171 | // return 172 | // } 173 | // } 174 | // members = append(members, b.members(branchPrefix)...) 175 | // } else { 176 | // if b.End { 177 | // for i, sb := range searchPrefix { 178 | // if sb != b.LeafValue[i] { 179 | // return 180 | // } 181 | // } 182 | // members = append(members, b.members(branchPrefix)...) 183 | // // members = append(members, &MemberInfo{string(append(branchPrefix, b.LeafValue...)), b.Count}) 184 | // } 185 | // } 186 | // return 187 | // } 188 | 189 | /* 190 | */ 191 | func (b *Branch) delete(entry []byte) (deleted bool) { 192 | leafLen := len(b.LeafValue) 193 | entryLen := len(entry) 194 | // does the leafValue match? 195 | if leafLen > 0 { 196 | if entryLen >= leafLen { 197 | for i, lb := range b.LeafValue { 198 | if entry[i] != lb { 199 | return false 200 | } 201 | } 202 | } else { 203 | return false 204 | } 205 | } 206 | 207 | // entry matches leaf. zero+ length 208 | // if there are branches there cant be End == true with a LeafValue. 
209 | // if there are NO branches there MUST be End == true with either a LeafValue or not 210 | 211 | // we are at the leafend 212 | // log.Println("entryLen-leafLen", entryLen, leafLen, entryLen-leafLen) 213 | if b.End && ((entryLen - leafLen) == 0) { 214 | b.setEnd(false) 215 | if len(b.Branches) == 0 && b.Count == 0 { 216 | b.LeafValue = nil 217 | } else if len(b.Branches) == 1 && b.Count == 0 { 218 | b = b.pullUp() 219 | } 220 | return true 221 | } 222 | 223 | // if End == true and there are no Branches we can delete the branch because either the idx or the LeafValue mark the end - if it is matched it can be deleted 224 | // this is being checked in the branch above 225 | // prefix is matched. check for branches 226 | if leafLen < entryLen && b.hasBranch(entry[leafLen]) { 227 | // next branch matches. check the leaf/branches again 228 | nextBranch := b.Branches[entry[leafLen]] 229 | if len(nextBranch.Branches) == 0 && nextBranch.Count == 0 { 230 | delete(b.Branches, entry[leafLen]) 231 | return true 232 | } else { 233 | deleted := nextBranch.delete(entry[leafLen+1:]) 234 | if deleted && len(nextBranch.Branches) == 0 && !nextBranch.End { 235 | delete(b.Branches, entry[leafLen]) 236 | // dangling leaf value? 
237 | if len(b.Branches) == 0 && b.Count == 0 { 238 | b.LeafValue = nil 239 | } 240 | } 241 | return deleted 242 | } 243 | } 244 | 245 | return false 246 | } 247 | 248 | /* 249 | */ 250 | func (b *Branch) has(entry []byte) bool { 251 | if b.getBranch(entry) != nil { 252 | return true 253 | } 254 | return false 255 | } 256 | 257 | func (b *Branch) hasCount(entry []byte) (bool, int64) { 258 | br := b.getBranch(entry) 259 | if br != nil { 260 | return true, br.Count 261 | } 262 | return false, 0 263 | } 264 | 265 | func (b *Branch) getBranch(entry []byte) (be *Branch) { 266 | leafLen := len(b.LeafValue) 267 | entryLen := len(entry) 268 | 269 | if entryLen >= leafLen { 270 | for i, pb := range b.LeafValue { 271 | if pb != entry[i] { 272 | return 273 | } 274 | } 275 | } else { 276 | return 277 | } 278 | 279 | if entryLen > leafLen { 280 | if br, present := b.Branches[entry[leafLen]]; present { 281 | return br.getBranch(entry[leafLen+1:]) 282 | } else { 283 | return 284 | } 285 | } else if entryLen == leafLen && b.End { 286 | be = b 287 | } 288 | return 289 | } 290 | 291 | /* 292 | */ 293 | func (b *Branch) hasPrefix(prefix []byte) bool { 294 | exists, _, _ := b.hasPrefixBranch(prefix) 295 | return exists 296 | } 297 | 298 | func (b *Branch) hasPrefixCount(prefix []byte) (exists bool, count int64) { 299 | exists, br, _ := b.hasPrefixBranch(prefix) 300 | if exists { 301 | count = br.sumCount() 302 | } 303 | return 304 | } 305 | 306 | func (b *Branch) hasPrefixBranch(prefix []byte) (exists bool, branch *Branch, matchedPrefix []byte) { 307 | leafLen := len(b.LeafValue) 308 | prefixLen := len(prefix) 309 | exists = false 310 | var pref []byte 311 | 312 | if leafLen > 0 { 313 | if prefixLen <= leafLen { 314 | for i, pb := range prefix { 315 | if pb != b.LeafValue[i] { 316 | return 317 | } 318 | } 319 | } else { 320 | for i, lb := range b.LeafValue { 321 | if prefix[i] != lb { 322 | return 323 | } 324 | } 325 | matchedPrefix = append(matchedPrefix, prefix[:leafLen]...) 
326 | } 327 | } 328 | 329 | if prefixLen > leafLen { 330 | if br, present := b.Branches[prefix[leafLen]]; present { 331 | matchedPrefix = append(matchedPrefix, prefix[leafLen]) 332 | exists, branch, pref = br.hasPrefixBranch(prefix[leafLen+1:]) 333 | matchedPrefix = append(matchedPrefix, pref...) 334 | return 335 | } else { 336 | return 337 | } 338 | } 339 | return true, b, matchedPrefix 340 | } 341 | 342 | func (b *Branch) sumCount() (count int64) { 343 | // leaf itself matches 344 | if b.End { 345 | count += b.Count 346 | } 347 | for _, br := range b.Branches { 348 | count += br.sumCount() 349 | } 350 | return 351 | } 352 | 353 | /* 354 | */ 355 | func (b *Branch) Dump(depth int) (out string) { 356 | if len(b.LeafValue) > 0 { 357 | if b.End { 358 | out += fmt.Sprintf("%s V:%v %v (%v)\n", strings.Repeat(PADDING_CHAR, depth), string(b.LeafValue), b.LeafValue, b.Count) 359 | } else { 360 | out += fmt.Sprintf("%s V:%v %v (%v)\n", strings.Repeat(PADDING_CHAR, depth), string(b.LeafValue), b.LeafValue, "-") 361 | } 362 | } 363 | 364 | if b.End { 365 | out += fmt.Sprintf("%s $\n", strings.Repeat(PADDING_CHAR, depth+len(b.LeafValue))) 366 | } 367 | 368 | for idx, branch := range b.Branches { 369 | if branch.End && len(branch.LeafValue) == 0 { 370 | out += fmt.Sprintf("%s I:%v %v (%v)\n", strings.Repeat(PADDING_CHAR, depth+len(b.LeafValue)), string(idx), idx, branch.Count) 371 | } else { 372 | out += fmt.Sprintf("%s I:%v %v (%v)\n", strings.Repeat(PADDING_CHAR, depth+len(b.LeafValue)), string(idx), idx, "-") 373 | } 374 | out += branch.Dump(depth + len(b.LeafValue) + 1) 375 | } 376 | 377 | return 378 | } 379 | 380 | /* 381 | */ 382 | // func (b *Branch) hasBranches() bool { 383 | // return len(b.Branches) == 0 384 | // } 385 | 386 | /* 387 | */ 388 | func (b *Branch) hasBranch(idx byte) bool { 389 | if _, present := b.Branches[idx]; present { 390 | return true 391 | } 392 | return false 393 | } 394 | 395 | /* 396 | */ 397 | // func (b *Branch) matchesLeaf(entry []byte) 
bool { 398 | // leafLen := len(b.LeafValue) 399 | // entryLen := len(entry) 400 | 401 | // if leafLen == 0 && entryLen == 0 { 402 | // return true 403 | // } 404 | 405 | // if leafLen == entryLen { 406 | // for i, lb := range b.LeafValue { 407 | // if entry[i] != lb { 408 | // return false 409 | // } 410 | // } 411 | // } 412 | // return true 413 | // } 414 | 415 | /* 416 | */ 417 | func (b *Branch) pullUp() *Branch { 418 | if len(b.Branches) == 1 { 419 | for k, nextBranch := range b.Branches { 420 | if len(nextBranch.Branches) == 0 { 421 | b.LeafValue = append(b.LeafValue, append([]byte{k}, nextBranch.LeafValue...)...) 422 | } else { 423 | b.LeafValue = append(b.LeafValue, k) 424 | } 425 | b.End = nextBranch.End 426 | b.Branches = nextBranch.Branches 427 | b.Count = nextBranch.Count 428 | } 429 | return b.pullUp() 430 | } 431 | return b 432 | } 433 | 434 | func (b *Branch) setEnd(flag bool) { 435 | if flag { 436 | b.Count += 1 437 | } else { 438 | if b.End && b.Count > 0 { 439 | b.Count -= 1 440 | if b.Count > 0 { 441 | return 442 | } 443 | } 444 | } 445 | b.End = flag 446 | return 447 | } 448 | 449 | func (b *Branch) String() string { 450 | return b.Dump(0) 451 | } 452 | 453 | func (b *Branch) PrintDump() { 454 | fmt.Printf("\n%s\n\n", b) 455 | } 456 | -------------------------------------------------------------------------------- /trie_test.go: -------------------------------------------------------------------------------- 1 | package trie 2 | 3 | import ( 4 | "fmt" 5 | "math/rand" 6 | "runtime" 7 | "sync" 8 | "testing" 9 | "time" 10 | 11 | "github.com/fvbock/uds-go/set" 12 | ) 13 | 14 | var ( 15 | tr1M *Trie 16 | randstrings []string 17 | ) 18 | 19 | func init() { 20 | runtime.GOMAXPROCS(1) 21 | 22 | tr1M = NewTrie() 23 | randstrings = make([]string, 1000000) 24 | i := 0 25 | for i < 1000000 { 26 | rstr := []byte{} 27 | n := 0 28 | for n < 50 { 29 | rstr = append(rstr, byte(rand.Intn(255))) 30 | n++ 31 | } 32 | randstrings[i] = string(rstr) 33 | i++ 34 | } 35 
| fmt.Println(len(randstrings)) 36 | startTime := time.Now() 37 | for x := 0; x < 1000000; x++ { 38 | tr1M.Add(randstrings[x%500000]) 39 | } 40 | fmt.Printf("Adding 1M entries took: %v\n", time.Since(startTime)) 41 | fmt.Printf("Having %v distinct entries.\n", len(tr1M.Members())) 42 | } 43 | 44 | func TestTrieAddSingle(t *testing.T) { 45 | tr := NewTrie() 46 | tr.Add("test") 47 | if !tr.Root.End { 48 | t.Error("Expected Root End to be true") 49 | } 50 | } 51 | 52 | func TestTrieAddBigSmall(t *testing.T) { 53 | tr := NewTrie() 54 | tr.Add("testing") 55 | tr.Add("tests") 56 | if !tr.Root.Branches['i'].End { 57 | t.Error("Expected 'i' End to be true") 58 | } 59 | if !tr.Root.Branches['s'].End { 60 | t.Error("Expected 's' End to be true") 61 | } 62 | } 63 | 64 | func TestTrieAddSmallBig(t *testing.T) { 65 | tr := NewTrie() 66 | tr.Add("tests") 67 | tr.Add("testing") 68 | if !tr.Root.Branches['i'].End { 69 | t.Error("Expected 'i' End to be true") 70 | } 71 | if !tr.Root.Branches['s'].End { 72 | t.Error("Expected 's' End to be true") 73 | } 74 | } 75 | 76 | func TestTrieGetBranch(t *testing.T) { 77 | tr := NewTrie() 78 | tr.Add("test") 79 | tr.Add("testing") 80 | t.Logf("\n%s", tr.Dump()) 81 | 82 | b1 := tr.GetBranch("test") 83 | if b1 == nil { 84 | t.Error("Expected to find a branch for 'test'.") 85 | } 86 | 87 | b2 := tr.GetBranch("tests") 88 | if b2 != nil { 89 | t.Error("Expected not to find a branch for 'tests'.") 90 | } 91 | 92 | b3 := tr.GetBranch("testing") 93 | if b3 == nil { 94 | t.Error("Expected to find a branch for 'testing'.") 95 | } 96 | 97 | b4 := tr.GetBranch("testi") 98 | if b4 != nil { 99 | t.Error("Expected not to find a branch for 'testi'.") 100 | } 101 | 102 | b5 := tr.GetBranch("tessi") 103 | if b5 != nil { 104 | t.Error("Expected not to find a branch for 'tessi'.") 105 | } 106 | } 107 | 108 | // func TestTrieAddEmptyBranch(t *testing.T) { 109 | // tr := NewTrie() 110 | // tr.Add("foobar") 111 | // tr.Add("fooc") 112 | // if tr.Root.End { 113 | // 
t.Error("Expected Root End to be false") 114 | // } 115 | // t.Logf("\n%s", tr.Dump()) 116 | // // tr.Add("foob") 117 | // // tr.Add("fooba") 118 | // // tr.Add("fooca") 119 | // // t.Logf("\n%s", tr.Dump()) 120 | // // if !tr.Root.End { 121 | // // t.Error("Expected Root End to be true") 122 | // // } 123 | // } 124 | 125 | func TestTrieAddBigSmallMulti(t *testing.T) { 126 | tr := NewTrie() 127 | tr.Add("testing") 128 | tr.Add("testing") 129 | tr.Add("tests") 130 | if !tr.Root.Branches['i'].End { 131 | t.Error("Expected 'i' End to be true") 132 | } 133 | if !tr.Root.Branches['s'].End { 134 | t.Error("Expected 's' End to be true") 135 | } 136 | _, c1 := tr.HasCount("testing") 137 | if c1 != 2 { 138 | t.Errorf("Expected count for testing to be 2. got %v instead", c1) 139 | } 140 | _, c2 := tr.HasCount("tests") 141 | if c2 != 1 { 142 | t.Errorf("Expected count for tests to be 1. got %v instead.", c2) 143 | } 144 | } 145 | 146 | func TestTrieAddSmallBigMulti(t *testing.T) { 147 | tr := NewTrie() 148 | tr.Add("tests") 149 | tr.Add("tests") 150 | tr.Add("testing") 151 | if !tr.Root.Branches['i'].End { 152 | t.Error("Expected 'i' End to be true") 153 | } 154 | if !tr.Root.Branches['s'].End { 155 | t.Error("Expected 's' End to be true") 156 | } 157 | _, c1 := tr.HasCount("testing") 158 | if c1 != 1 { 159 | t.Errorf("Expected count for testing to be 1. got %v instead", c1) 160 | } 161 | _, c2 := tr.HasCount("tests") 162 | if c2 != 2 { 163 | t.Errorf("Expected count for tests to be 2. 
got %v instead.", c2) 164 | } 165 | } 166 | 167 | func TestTrieAddTestFirst(t *testing.T) { 168 | tr := NewTrie() 169 | tr.Add("test") 170 | tr.Add("testing") 171 | tr.Add("tests") 172 | if !tr.Root.End { 173 | t.Error("Expected Root End to be true") 174 | } 175 | if !tr.Root.End { 176 | t.Error("Expected trunk End to be true") 177 | } 178 | if !tr.Root.Branches['i'].End { 179 | t.Error("Expected 'i' End to be true") 180 | } 181 | if !tr.Root.Branches['s'].End { 182 | t.Error("Expected 's' End to be true") 183 | } 184 | } 185 | 186 | func TestTrieAddTestLast(t *testing.T) { 187 | tr := NewTrie() 188 | tr.Add("testing") 189 | tr.Add("tests") 190 | tr.Add("test") 191 | if !tr.Root.End { 192 | t.Error("Expected Root End to be true") 193 | } 194 | if !tr.Root.Branches['i'].End { 195 | t.Error("Expected 'i' End to be true") 196 | } 197 | if !tr.Root.Branches['s'].End { 198 | t.Error("Expected 's' End to be true") 199 | } 200 | } 201 | 202 | func TestTrieDump(t *testing.T) { 203 | tr := NewTrie() 204 | tr.Add("teased") 205 | tr.Add("test") 206 | tr.Add("test") 207 | tr.Add("testing") 208 | tr.Add("tea") 209 | t.Logf("\n%s", tr.Dump()) 210 | } 211 | 212 | func TestTrieMembersCount(t *testing.T) { 213 | tr := NewTrie() 214 | tr.Add("teased") 215 | tr.Add("test") 216 | tr.Add("test") 217 | tr.Add("testing") 218 | 219 | if len(tr.Members()) != 3 { 220 | t.Error("Expected 3 members") 221 | } 222 | for _, mi := range tr.Members() { 223 | if mi.Value == "teased" && mi.Count != 1 { 224 | t.Error("Expected teased to have Count 1") 225 | continue 226 | } 227 | if mi.Value == "test" && mi.Count != 2 { 228 | t.Error("Expected test to have Count 2") 229 | continue 230 | } 231 | if mi.Value == "testing" && mi.Count != 1 { 232 | t.Error("Expected testing to have Count 1") 233 | continue 234 | } 235 | } 236 | t.Logf("\n%v", tr.Members()) 237 | } 238 | 239 | // // todo 240 | // func TestTriePrefixMembersCount(t *testing.T) { 241 | // tr := NewTrie() 242 | // tr.Add("foo") 243 | // 
tr.Add("foobar") 244 | // tr.Add("bar") 245 | 246 | // if tr.MembersCount("test") != 0 { 247 | // t.Error("Expected HasCount for test to be 0") 248 | // } 249 | // } 250 | 251 | func TestTriePrefixMembers(t *testing.T) { 252 | tr := NewTrie() 253 | tr.Add("teased") 254 | tr.Add("test") 255 | tr.Add("test") 256 | tr.Add("testing") 257 | 258 | if len(tr.PrefixMembers("test")) != 2 { 259 | t.Error("Expected PrefixMembers('test') to have length 2") 260 | } 261 | var expectedMembers1 = []string{"test", "testing"} 262 | checkMembers1: 263 | for _, s := range expectedMembers1 { 264 | for _, m := range tr.PrefixMembers("test") { 265 | if s == m.Value { 266 | continue checkMembers1 267 | } 268 | } 269 | t.Errorf("Expected PrefixMembers('test') to have `%s` as member", s) 270 | } 271 | 272 | if len(tr.PrefixMembers("te")) != 3 { 273 | t.Error("Expected PrefixMembers('te') to have length 3") 274 | } 275 | var expectedMembers2 = []string{"test", "testing", "teased"} 276 | checkMembers2: 277 | for _, s := range expectedMembers2 { 278 | for _, m := range tr.PrefixMembers("te") { 279 | if s == m.Value { 280 | continue checkMembers2 281 | } 282 | } 283 | t.Errorf("Expected PrefixMembers('te') to have `%s` as member", s) 284 | } 285 | 286 | if len(tr.PrefixMembers("a")) != 0 { 287 | t.Error("Expected PrefixMembers('a') to have length 0") 288 | } 289 | if len(tr.PrefixMembers("ta")) != 0 { 290 | t.Error("Expected PrefixMembers('ta') to have length 0") 291 | } 292 | 293 | if len(tr.PrefixMembers("")) != 3 { 294 | t.Error("Expected PrefixMembers('') to have length 3") 295 | } 296 | if len(tr.PrefixMembersList("a")) != 0 { 297 | t.Error("Expected PrefixMembersList('a') to have length 0") 298 | } 299 | if len(tr.PrefixMembersList("")) != 3 { 300 | t.Error("Expected PrefixMembersList('') to have length 3") 301 | } 302 | 303 | // cover different code paths 304 | tr.Add("te") 305 | tr.PrintDump() 306 | if len(tr.PrefixMembers("a")) != 0 { 307 | t.Error("Expected PrefixMembers('a') to have 
length 0") 308 | } 309 | tl := len(tr.PrefixMembers("t")) 310 | if tl != 4 { 311 | t.Errorf("Expected PrefixMembers('t') to have length 4, got %v instead.", tl) 312 | } 313 | } 314 | 315 | func TestTrieHasPrefixEmpty(t *testing.T) { 316 | tr := NewTrie() 317 | if tr.HasPrefix("test") { 318 | t.Error("Expected no prefix test") 319 | } 320 | _, c := tr.HasPrefixCount("test") 321 | if c != 0 { 322 | t.Error("Expected no prefix test") 323 | } 324 | } 325 | 326 | func TestTrieHasPrefixOne(t *testing.T) { 327 | tr := NewTrie() 328 | tr.Add("test") 329 | if !tr.HasPrefix("test") { 330 | t.Error("Expected prefix test") 331 | } 332 | _, c := tr.HasPrefixCount("test") 333 | if c != 1 { 334 | t.Error("Expected prefix test to have count 1") 335 | } 336 | } 337 | 338 | func TestTrieHasPrefixMany(t *testing.T) { 339 | tr := NewTrie() 340 | tr.Add("tease") 341 | tr.Add("teases") 342 | tr.Add("teased") 343 | tr.Add("teaser") 344 | tr.Add("tests") 345 | tr.Add("test") 346 | tr.Add("tested") 347 | tr.Add("testing") 348 | if tr.HasPrefix("ted") { 349 | t.Error("Expected no prefix ted") 350 | } 351 | if !tr.HasPrefix("tease") { 352 | t.Error("Expected prefix tease") 353 | } 354 | if !tr.HasPrefix("testing") { 355 | t.Error("Expected prefix testing") 356 | } 357 | 358 | // prefixCount 359 | _, ctest := tr.HasPrefixCount("test") 360 | if ctest != 4 { 361 | t.Errorf("Expected prefix test to have count 4, got %v instead.", ctest) 362 | } 363 | _, ctes := tr.HasPrefixCount("tes") 364 | if ctes != 4 { 365 | t.Errorf("Expected prefix tes to have count 4, got %v instead.", ctes) 366 | } 367 | 368 | _, ctea := tr.HasPrefixCount("tea") 369 | if ctea != 4 { 370 | t.Errorf("Expected prefix tea to have count 4, got %v instead.", ctea) 371 | } 372 | tr.Add("tea") 373 | _, ctea = tr.HasPrefixCount("tea") 374 | if ctea != 5 { 375 | t.Errorf("Expected prefix tea to have count 5, got %v instead.", ctea) 376 | } 377 | 378 | // test false cases with shorter and longer than leaf prefixes 379 | _, ca := 
tr.HasPrefixCount("a") 380 | if ca != 0 { 381 | t.Errorf("Expected prefix a to have count 0, got %v instead.", ca) 382 | } 383 | _, casdf := tr.HasPrefixCount("asdf") 384 | if casdf != 0 { 385 | t.Errorf("Expected prefix asdf to have count 0, got %v instead.", casdf) 386 | } 387 | } 388 | 389 | func TestTrieHasEmpty(t *testing.T) { 390 | tr := NewTrie() 391 | if tr.Has("test") { 392 | t.Error("Expected no test") 393 | } 394 | } 395 | 396 | func TestTrieHasOne(t *testing.T) { 397 | tr := NewTrie() 398 | tr.Add("test") 399 | if !tr.Has("test") { 400 | t.Error("Expected test") 401 | } 402 | } 403 | 404 | func TestTrieHasMany(t *testing.T) { 405 | tr := NewTrie() 406 | tr.Add("tease") 407 | tr.Add("teases") 408 | tr.Add("teased") 409 | tr.Add("teaser") 410 | tr.Add("tests") 411 | tr.Add("test") 412 | tr.Add("tested") 413 | tr.Add("testing") 414 | if tr.Has("testi") { 415 | t.Error("Expected no testi") 416 | } 417 | if tr.Has("te") { 418 | t.Error("Expected no te") 419 | } 420 | if !tr.Has("tease") { 421 | t.Error("Expected tease") 422 | } 423 | if !tr.Has("testing") { 424 | t.Error("Expected testing") 425 | } 426 | } 427 | 428 | func TestTrieHasPrefixManyMultibyte(t *testing.T) { 429 | tr := NewTrie() 430 | tr.Add("日本人") 431 | tr.Add("人") 432 | tr.Add("日本") 433 | tr.Add("日本語学校") 434 | tr.Add("学校") 435 | tr.Add("日本語") 436 | if tr.HasPrefix("ä") { 437 | t.Error("Expected no prefix ä") 438 | } 439 | if tr.HasPrefix("無い") { 440 | t.Error("Expected no prefix 無い") 441 | } 442 | if !tr.HasPrefix("日本語") { 443 | t.Error("Expected prefix 日本語") 444 | } 445 | if !tr.HasPrefix("日") { 446 | t.Error("Expected prefix 日") 447 | } 448 | } 449 | 450 | /* 451 | These are a bunch of tests that i basically added when trying to reproduce 452 | a bug that dropped a byte from prefix members in certain cases. (only happened 453 | with multibyte character prefixes/terms) 454 | strictly speaking these tests could go, but i leave them in for now. 
455 | */ 456 | func TestTrieHasPrefixMultibyteWithSharedSubCharBytes(t *testing.T) { 457 | tr := NewTrie() 458 | tr.Add("フードスポンサー") 459 | tr.Add("フードラボ") 460 | 461 | if !tr.HasPrefix("フ") { 462 | t.Error("Expected prefix フ") 463 | } 464 | if !tr.HasPrefix("フー") { 465 | t.Error("Expected prefix フー") 466 | } 467 | if !tr.HasPrefix("フード") { 468 | t.Error("Expected prefix フード") 469 | } 470 | 471 | // t.Log(tr.PrefixMembers("フ")) 472 | if len(tr.PrefixMembers("フ")) != 2 { 473 | t.Error("Expected PrefixMembers('フ') to have length 2") 474 | } 475 | var expectedMembers = []string{"フードスポンサー", "フードラボ"} 476 | checkMembers1: 477 | for _, s := range expectedMembers { 478 | for _, m := range tr.PrefixMembers("フ") { 479 | if s == m.Value { 480 | continue checkMembers1 481 | } 482 | } 483 | t.Errorf("Expected PrefixMembers('フ') to have `%s` as member", s) 484 | } 485 | 486 | // t.Log(tr.PrefixMembers("フー")) 487 | if len(tr.PrefixMembers("フー")) != 2 { 488 | t.Error("Expected PrefixMembers('フー') to have length 2") 489 | } 490 | checkMembers2: 491 | for _, s := range expectedMembers { 492 | for _, m := range tr.PrefixMembers("フー") { 493 | if s == m.Value { 494 | continue checkMembers2 495 | } 496 | } 497 | t.Errorf("Expected PrefixMembers('フー') to have `%s` as member", s) 498 | } 499 | 500 | // t.Log(tr.PrefixMembers("フード")) 501 | if len(tr.PrefixMembers("フード")) != 2 { 502 | t.Error("Expected PrefixMembers('フード') to have length 2") 503 | } 504 | checkMembers3: 505 | for _, s := range expectedMembers { 506 | for _, m := range tr.PrefixMembers("フード") { 507 | if s == m.Value { 508 | continue checkMembers3 509 | } 510 | } 511 | t.Errorf("Expected PrefixMembers('フード') to have `%s` as member", s) 512 | } 513 | 514 | // t.Log(tr.PrefixMembers("フードス")) 515 | if len(tr.PrefixMembers("フードス")) != 1 { 516 | t.Error("Expected PrefixMembers('フードス') to have length 1") 517 | } 518 | if tr.PrefixMembers("フードス")[0].Value != "フードスポンサー" { 519 | t.Error("Expected PrefixMembers('フードス') to have `フードスポンサー` 
as first and only member") 520 | } 521 | 522 | tr.Add("ファ") 523 | tr.Add("フぃ") 524 | tr.Add("フぇ") 525 | tr.Add("フォーム") 526 | tr.Add("フリガナ") 527 | 528 | if !tr.HasPrefix("フ") { 529 | t.Error("Expected prefix フ") 530 | } 531 | if !tr.HasPrefix("フー") { 532 | t.Error("Expected prefix フー") 533 | } 534 | if !tr.HasPrefix("フード") { 535 | t.Error("Expected prefix フード") 536 | } 537 | 538 | // t.Log(tr.PrefixMembers("フー")) 539 | if len(tr.PrefixMembers("フー")) != 2 { 540 | t.Error("Expected PrefixMembers('フー') to have length 2") 541 | } 542 | checkMembers4: 543 | for _, s := range expectedMembers { 544 | for _, m := range tr.PrefixMembers("フー") { 545 | if s == m.Value { 546 | continue checkMembers4 547 | } 548 | } 549 | t.Errorf("Expected PrefixMembers('フー') to have `%s` as member", s) 550 | } 551 | } 552 | 553 | func TestTrieHasManyMultibyte(t *testing.T) { 554 | tr := NewTrie() 555 | tr.Add("日本人") 556 | tr.Add("人") 557 | tr.Add("日本") 558 | tr.Add("日本語学校") 559 | tr.Add("学校") 560 | tr.Add("日本語") 561 | if tr.Has("ä") { 562 | t.Error("Expected no ä") 563 | } 564 | if tr.Has("無い") { 565 | t.Error("Expected no 無い") 566 | } 567 | if tr.Has("日") { 568 | t.Error("Expected no 日") 569 | } 570 | if !tr.Has("日本語") { 571 | t.Error("Expected 日本語") 572 | } 573 | if !tr.Has("学校") { 574 | t.Error("Expected 学校") 575 | } 576 | } 577 | 578 | func TestTrieDeleteEmpty(t *testing.T) { 579 | tr := NewTrie() 580 | if tr.Delete("test") { 581 | t.Error("Expected false for tr.Delete('test')") 582 | } 583 | } 584 | 585 | func TestTrieDeleteNothing(t *testing.T) { 586 | tr := NewTrie() 587 | tr.Add("test") 588 | if tr.Delete("") { 589 | t.Error("Expected false for tr.Delete('')") 590 | } 591 | 592 | _, c1 := tr.HasCount("test") 593 | if c1 != 1 { 594 | t.Errorf("Expected count for test to be 1. 
got %v instead", c1) 595 | } 596 | 597 | if tr.Delete("tes") { 598 | t.Error("Expected false for tr.Delete('tes')") 599 | } 600 | } 601 | 602 | func TestTrieDeleteOne(t *testing.T) { 603 | tr := NewTrie() 604 | tr.Add("test") 605 | if !tr.Delete("test") { 606 | t.Error("Expected true for tr.Delete('test')") 607 | } 608 | 609 | // // delete a branch that has no further branches 610 | // tr.Add("teste") 611 | // if !tr.Delete("teste") { 612 | // t.Error("Expected true for tr.Delete('test')") 613 | // } 614 | } 615 | 616 | func TestTrieDeleteDouble(t *testing.T) { 617 | tr := NewTrie() 618 | tr.Add("foo") 619 | tr.Add("test") 620 | tr.Add("test") 621 | if !tr.Delete("test") { 622 | t.Error("Expected true for tr.Delete('test')") 623 | } 624 | tr.PrintDump() 625 | t.Log(tr.Members()) 626 | if !tr.Delete("test") { 627 | t.Error("Expected true for tr.Delete('test')") 628 | } 629 | tr.PrintDump() 630 | t.Log(tr.Members()) 631 | } 632 | 633 | func TestTrieDeletePrefixCount(t *testing.T) { 634 | tr := NewTrie() 635 | tr.Add("foo") 636 | tr.Add("foo") 637 | tr.Add("foobar") 638 | tr.PrintDump() 639 | if tr.Delete("test") { 640 | t.Error("Expected false for tr.Delete('test')") 641 | } 642 | if !tr.Delete("foo") { 643 | t.Error("Expected true for tr.Delete('foo')") 644 | } 645 | tr.PrintDump() 646 | _, cfoo := tr.HasCount("foo") 647 | if cfoo != 1 { 648 | t.Errorf("Expected count for foo to be 1. got %v instead.", cfoo) 649 | } 650 | _, cfoobar := tr.HasCount("foobar") 651 | if cfoobar != 1 { 652 | t.Errorf("Expected count for foobar to be 1. got %v instead.", cfoobar) 653 | } 654 | if !tr.Delete("foo") { 655 | t.Error("Expected true for tr.Delete('foo')") 656 | } 657 | tr.PrintDump() 658 | _, cfoo = tr.HasCount("foo") 659 | if cfoo != 0 { 660 | t.Errorf("Expected count for foo to be 0. got %v instead.", cfoo) 661 | } 662 | _, cfoobar = tr.HasCount("foobar") 663 | if cfoobar != 1 { 664 | t.Errorf("Expected count for foobar to be 1. 
got %v instead.", cfoobar) 665 | } 666 | } 667 | 668 | func TestTrieDeleteMany(t *testing.T) { 669 | tr := NewTrie() 670 | tr.Add("tease") 671 | tr.Add("teases") 672 | tr.Add("teased") 673 | tr.Add("test") 674 | tr.Add("test") 675 | 676 | // if tr.Delete("te") { 677 | // t.Error("Expected false for tr.Delete('te')") 678 | // } 679 | if !tr.Delete("test") { 680 | t.Error("Expected true for tr.Delete('test')") 681 | } 682 | 683 | expectedMembers := make(map[string]bool) 684 | expectedMembers["tease"] = true 685 | expectedMembers["teases"] = true 686 | expectedMembers["teased"] = true 687 | expectedMembers["test"] = true 688 | // expectedMembers["test"] = true 689 | for _, m := range tr.Members() { 690 | if m.Count != 1 { 691 | t.Errorf("Expected Count for %s to be 1 - not %v.", m.Value, m.Count) 692 | } else { 693 | ec := len(expectedMembers) 694 | delete(expectedMembers, m.Value) 695 | if len(expectedMembers) == ec { 696 | t.Errorf("Not expected member %s.", m.Value) 697 | } 698 | } 699 | } 700 | 701 | if len(expectedMembers) != 0 { 702 | t.Log(tr.Members()) 703 | t.Error("Deletion seems to have deleted more than just 'test' (once).", expectedMembers) 704 | } 705 | 706 | if !tr.Delete("tease") { 707 | t.Error("Expected true for tr.Delete('tease')") 708 | } 709 | if !tr.Delete("teases") { 710 | t.Error("Expected true for tr.Delete('tease')") 711 | } 712 | if !tr.Delete("teased") { 713 | t.Error("Expected true for tr.Delete('tease')") 714 | } 715 | 716 | tr.PrintDump() 717 | t.Log(tr.Members()) 718 | if !tr.Delete("test") { 719 | t.Error("Expected true for tr.Delete('test')") 720 | } 721 | 722 | tr.PrintDump() 723 | t.Log(tr.Members()) 724 | 725 | if len(tr.Root.Branches) != 0 { 726 | t.Error("Expected 0 Branches on Root") 727 | } 728 | if len(tr.Root.LeafValue) != 0 { 729 | t.Error("Expected no LeafValue on Root") 730 | } 731 | if tr.Root.End { 732 | t.Error("Expected End to be false on Root") 733 | } 734 | } 735 | 736 | func TestTrieDeleteManyRandom_az(t *testing.T) 
{ 737 | tr := NewTrie() 738 | var prefix = "prefix" 739 | var words []string 740 | var str []byte 741 | var n = 0 742 | for n < 100 { 743 | i := 0 744 | str = []byte{} 745 | for i < 10 { 746 | rn := 0 747 | for rn < 97 { 748 | rn = rand.Intn(123) 749 | } 750 | str = append(str, byte(rn)) 751 | i++ 752 | } 753 | if rand.Intn(2) == 1 { 754 | words = append(words, prefix+string(str)) 755 | tr.Add(prefix + string(str)) 756 | } else { 757 | words = append(words, string(str)) 758 | tr.Add(string(str)) 759 | } 760 | n++ 761 | } 762 | // t.Log(words) 763 | // tr.PrintDump() 764 | for wi, w := range words { 765 | if !tr.Delete(w) { 766 | t.Errorf("Expected true for tr.Delete('%s')", w) 767 | } 768 | // expect to still find the rest 769 | if wi+1 < len(words) { 770 | for _, ow := range words[wi+1:] { 771 | // t.Logf("Checking for %s", ow) 772 | if !tr.Has(ow) { 773 | t.Errorf("Expected to still find %s", ow) 774 | } 775 | } 776 | } 777 | } 778 | tr.PrintDump() 779 | if len(tr.Root.Branches) != 0 { 780 | t.Error("Expected 0 Branches on Root") 781 | } 782 | if len(tr.Root.LeafValue) != 0 { 783 | t.Error("Expected no LeafValue on Root") 784 | } 785 | if tr.Root.End { 786 | t.Error("Expected End to be false on Root") 787 | } 788 | } 789 | 790 | func TestTrieMultiAdd(t *testing.T) { 791 | tr := NewTrie() 792 | words := []string{"foodie", "foods", "foodchain", "foodcrave", "food", "人", "日本", "日本語学校", "学校", "日本語"} 793 | // words := []string{"日本語", "日本語学校"} 794 | // words := []string{"日本語学校", "日本"} 795 | wg := sync.WaitGroup{} 796 | for _, w := range words { 797 | // wg.Add(1) 798 | // go func(word string) { 799 | // tr.Add(word) 800 | // wg.Done() 801 | // }(w) 802 | 803 | // tr.Add(w) 804 | // tr.Add(w) 805 | // if w == "日本" { 806 | // tr.PrintDump() 807 | // tr.Delete(w) 808 | // tr.PrintDump() 809 | // } 810 | 811 | // wg.Add(2) 812 | // go func(word string) { 813 | // tr.Add(word) 814 | // wg.Done() 815 | // }(w) 816 | // go func(word string) { 817 | // tr.Add(word) 818 | // 
wg.Done() 819 | // }(w) 820 | // go func(word string) { 821 | // wg.Add(1) 822 | // if word == "日本" { 823 | // tr.PrintDump() 824 | // tr.Delete(word) 825 | // tr.PrintDump() 826 | // } 827 | // wg.Done() 828 | // }(w) 829 | 830 | // wg.Add(3) 831 | // go func(word string) { 832 | // tr.Add(word) 833 | // wg.Done() 834 | // }(w) 835 | // go func(word string) { 836 | // tr.Delete(word) 837 | // wg.Done() 838 | // }(w) 839 | // go func(word string) { 840 | // tr.Add(word) 841 | // wg.Done() 842 | // }(w) 843 | 844 | wg.Add(5) 845 | go func(word string) { 846 | tr.Add(word) 847 | wg.Done() 848 | }(w) 849 | go func(word string) { 850 | tr.Delete(word) 851 | wg.Done() 852 | }(w) 853 | go func(word string) { 854 | tr.Add(word) 855 | wg.Done() 856 | }(w) 857 | go func(word string) { 858 | tr.Delete(word) 859 | wg.Done() 860 | }(w) 861 | go func(word string) { 862 | tr.Add(word) 863 | wg.Done() 864 | }(w) 865 | 866 | } 867 | wg.Wait() 868 | tr.PrintDump() 869 | t.Log(tr.Members()) 870 | } 871 | 872 | func TestTrieDumpToFileLoadFromFile(t *testing.T) { 873 | tr := NewTrie() 874 | var prefix = "prefix" 875 | var words []string 876 | var str []byte 877 | var insert string 878 | var n = 0 879 | for n < 100 { 880 | i := 0 881 | str = []byte{} 882 | for i < 10 { 883 | rn := 0 884 | for rn < 97 { 885 | rn = rand.Intn(123) 886 | } 887 | str = append(str, byte(rn)) 888 | i++ 889 | } 890 | if rand.Intn(2) == 1 { 891 | insert = prefix + string(str) 892 | } else { 893 | insert = string(str) 894 | } 895 | words = append(words, insert) 896 | tr.Add(insert) 897 | if rand.Intn(2) == 1 { 898 | tr.Add(insert) 899 | } 900 | n++ 901 | } 902 | err := tr.DumpToFile("testfiles/TestDumpToFileLoadFromFile") 903 | 904 | loadedTrie, err := LoadFromFile("testfiles/TestDumpToFileLoadFromFile") 905 | if err != nil { 906 | t.Errorf("Failed to load Trie from file: %v", err) 907 | } 908 | for _, w := range words { 909 | // t.Logf("Checking for %s", w) 910 | if !loadedTrie.Has(w) { 911 | t.Errorf("Expected 
to find %s", w) 912 | } 913 | } 914 | 915 | trMembers := set.NewStringSet(tr.MembersList()...) 916 | loadedTrieMembers := set.NewStringSet(loadedTrie.MembersList()...) 917 | 918 | t.Log("trMembers.IsEqual(loadedTrieMembers):", trMembers.IsEqual(loadedTrieMembers)) 919 | 920 | diff := trMembers.Difference(loadedTrieMembers) 921 | if diff.Len() > 0 { 922 | t.Error("Dump() of the original and the LoadFromFile() version of the Trie are different.") 923 | } 924 | 925 | // check counts 926 | for _, mi := range tr.Members() { 927 | _, count := loadedTrie.HasCount(mi.Value) 928 | if count != mi.Count { 929 | t.Errorf("Count for member %s differs: orig was %v, restored trie has %v", mi.Value, mi.Count, count) 930 | } 931 | } 932 | 933 | // test expected failures 934 | if tr.DumpToFile("dirdoesnotexist/TestDumpToFileLoadFromFile") == nil { 935 | t.Error("expected DumpToFile() to fail with non existent directory.") 936 | } 937 | } 938 | 939 | func TestTrieLoadFromFileEmpty(t *testing.T) { 940 | loadedTrie, err := LoadFromFile("testfiles/empty") 941 | if err != nil { 942 | t.Errorf("Failed to load Trie from file: %v", err) 943 | } 944 | 945 | loadedTrieMembers := set.NewStringSet(loadedTrie.MembersList()...) 
946 | t.Log(loadedTrieMembers) 947 | t.Log(loadedTrieMembers.Len()) 948 | if loadedTrieMembers.Len() > 0 { 949 | t.Error("Expected 0 Members from LoadFromFile() with an empty file.") 950 | } 951 | } 952 | 953 | func TestTrieLoadFromFileExpectedFailures(t *testing.T) { 954 | _, err := LoadFromFile("testfiles/notatriedump") 955 | if err == nil { 956 | t.Error("Expected LoadFromFile to fail - file testfiles/notatriedump is not a valid Trie dump.") 957 | } 958 | _, err = LoadFromFile("doesnotexist/doesnotexist") 959 | if err == nil { 960 | t.Error("Expected LoadFromFile to fail - file testfiles/doesnotexist does notexist.") 961 | } 962 | } 963 | 964 | func TestTrieDumpToFileMergeFromFile(t *testing.T) { 965 | tr := NewTrie() 966 | tr.Add("test") 967 | tr.Add("test") 968 | tr.Add("tested") 969 | tr.Add("tent") 970 | tr.DumpToFile("testfiles/TestDumpToFileMergeFromFile") 971 | 972 | tr2 := NewTrie() 973 | tr2.Add("tea") 974 | tr2.Add("tested") 975 | 976 | err := tr2.MergeFromFile("testfiles/TestDumpToFileMergeFromFile") 977 | if err != nil { 978 | t.Errorf("Failed to merge Trie from file: %v", err) 979 | } 980 | 981 | _, ctest := tr2.HasCount("test") 982 | if ctest != 2 { 983 | t.Errorf("Expected count for test to be 2. got %v instead.", ctest) 984 | } 985 | _, ctested := tr2.HasCount("tested") 986 | if ctested != 2 { 987 | t.Errorf("Expected count for tested to be 2. got %v instead.", ctested) 988 | } 989 | _, ctea := tr2.HasCount("tea") 990 | if ctea != 1 { 991 | t.Errorf("Expected count for tea to be 1. got %v instead.", ctea) 992 | } 993 | _, ctent := tr2.HasCount("tent") 994 | if ctent != 1 { 995 | t.Errorf("Expected count for tent to be 1. 
got %v instentd.", ctent) 996 | } 997 | 998 | // expected failure 999 | err = tr2.MergeFromFile("doesnotexist/doesnotexist") 1000 | if err == nil { 1001 | t.Error("Expected MergeFromFile to fail - file testfiles/doesnotexist does notexist.") 1002 | } 1003 | } 1004 | 1005 | // some simple benchmarks 1006 | 1007 | func BenchmarkTrieBenchAdd(b *testing.B) { 1008 | tr := NewTrie() 1009 | for x := 0; x < b.N; x++ { 1010 | tr.Add(randstrings[x%500000]) 1011 | } 1012 | } 1013 | 1014 | func BenchmarkTrieBenchHasPrefix(b *testing.B) { 1015 | tr := NewTrie() 1016 | b.StopTimer() 1017 | randstr := make([]string, 100) 1018 | i := 0 1019 | for i < 100000 { 1020 | rstr := []byte{} 1021 | n := 0 1022 | for n < 100 { 1023 | rstr = append(rstr, byte(rand.Intn(255))) 1024 | n++ 1025 | } 1026 | randstr = append(randstr, string(rstr)) 1027 | i++ 1028 | } 1029 | 1030 | for x := 0; x < 1000000; x++ { 1031 | tr.Add(randstr[x%10000]) 1032 | } 1033 | // fmt.Printf("Having %v distinct entries.\n", len(tr.Members())) 1034 | b.StartTimer() 1035 | for x := 0; x < b.N; x++ { 1036 | tr.HasPrefix(randstr[x%100000]) 1037 | } 1038 | } 1039 | 1040 | func BenchmarkTrieBenchHas(b *testing.B) { 1041 | tr := NewTrie() 1042 | b.StopTimer() 1043 | randstr := make([]string, 100) 1044 | i := 0 1045 | for i < 100000 { 1046 | rstr := []byte{} 1047 | n := 0 1048 | for n < 100 { 1049 | rstr = append(rstr, byte(rand.Intn(255))) 1050 | n++ 1051 | } 1052 | randstr = append(randstr, string(rstr)) 1053 | i++ 1054 | } 1055 | 1056 | for x := 0; x < 1000000; x++ { 1057 | tr.Add(randstr[x%10000]) 1058 | } 1059 | // fmt.Printf("Having %v distinct entries.\n", len(tr.Members())) 1060 | b.StartTimer() 1061 | for x := 0; x < b.N; x++ { 1062 | tr.Has(randstr[x%100000]) 1063 | } 1064 | } 1065 | 1066 | func BenchmarkTrie1MBenchHasPrefix(b *testing.B) { 1067 | for x := 0; x < b.N; x++ { 1068 | tr1M.HasPrefix(randstrings[x%1000000]) 1069 | } 1070 | } 1071 | 1072 | func BenchmarkTrie1MBenchHas(b *testing.B) { 1073 | for x := 0; x 
< b.N; x++ { 1074 | tr1M.Has(randstrings[x%1000000]) 1075 | } 1076 | } 1077 | --------------------------------------------------------------------------------