// Overview (carried over from the project README).
//
// Edit distance, also known as Levenshtein distance, is the minimum number of
// edit operations needed to turn one string into another. The permitted
// operations are substituting one character for another, inserting a
// character, and deleting a character.
//
// A BK-tree answers the following query: given a dictionary (many strings), a
// query string and a number k, find the dictionary strings whose edit distance
// to the query is less than or equal to k.
//
// Usage:
//
//	import "github.com/gansidui/bktree"
//
//	if bktree.Levenshtein("hello", "Aelo") != 2 {
//		log.Fatal()
//	}
//	if bktree.Levenshtein("我爱你", "你爱我") != 2 {
//		log.Fatal()
//	}
//
//	bk := bktree.New()
//	bk.SetLevenshteinLimit(50)
//
//	bk.Insert("ABCD")
//	bk.Insert("ACED")
//	bk.Insert("SBDE")
//
//	ret := bk.Find("AABB", 3, -1)
//	fmt.Println(ret)
//
// License: MIT.

// DEFAULT_MAX_LEVENSHTEIN is the distance limit a tree created by New starts
// with. The name is kept as-is because it is part of the exported API.
const DEFAULT_MAX_LEVENSHTEIN = 50

// bktreeNode is one stored string together with its children. child[d] is the
// subtree of strings whose edit distance to str is exactly d.
type bktreeNode struct {
	str   string
	child []*bktreeNode
}

// newBktreeNode returns a node holding s with room for children at distances
// 0..limit. A negative limit yields a node without child slots instead of
// panicking inside make.
func newBktreeNode(s string, limit int) *bktreeNode {
	slots := limit + 1
	if limit < 0 {
		slots = 0
	}
	return &bktreeNode{
		str:   s,
		child: make([]*bktreeNode, slots),
	}
}

// BKTree is a BK-tree over strings that uses Levenshtein as its metric.
type BKTree struct {
	root             *bktreeNode
	size             int
	levenshteinLimit int
}

// New returns an empty tree whose limit is DEFAULT_MAX_LEVENSHTEIN.
func New() *BKTree {
	return &BKTree{levenshteinLimit: DEFAULT_MAX_LEVENSHTEIN}
}

// SetLevenshteinLimit sets the largest parent-to-child distance accepted by
// Insert. Nodes created earlier keep the number of child slots they were
// allocated with, so the limit is best set before the first Insert.
func (t *BKTree) SetLevenshteinLimit(limit int) {
	t.levenshteinLimit = limit
}

// GetLevenshteinLimit returns the limit currently used by Insert.
func (t *BKTree) GetLevenshteinLimit() int {
	return t.levenshteinLimit
}

// Size returns the number of strings stored by successful Insert calls.
func (t *BKTree) Size() int {
	return t.size
}

// insert walks down from rt, following at every node the child slot that
// matches the distance between that node's string and s, and stores s in the
// first empty slot. It reports false when a distance on the path exceeds the
// limit or the slots available on the node.
func (t *BKTree) insert(rt *bktreeNode, s string) bool {
	for {
		d := Levenshtein(rt.str, s)
		if d > t.levenshteinLimit || d >= len(rt.child) {
			return false
		}
		if rt.child[d] == nil {
			rt.child[d] = newBktreeNode(s, t.levenshteinLimit)
			return true
		}
		rt = rt.child[d]
	}
}

// Insert adds s to the tree and reports whether it was stored. The first
// string becomes the root unconditionally. A string equal to one already on
// its path has distance 0 and is stored again as a separate entry.
func (t *BKTree) Insert(s string) bool {
	if t.root == nil {
		t.root = newBktreeNode(s, t.levenshteinLimit)
		t.size++
		return true
	}
	if !t.insert(t.root, s) {
		return false
	}
	t.size++
	return true
}

// find collects, in depth-first order, the strings under rt whose distance to
// s is at most k. A non-negative n caps the number of results; a negative n
// means no cap.
func (t *BKTree) find(rt *bktreeNode, s string, k int, n int) (ret []string) {
	if n == 0 {
		return []string{}
	}
	if k < 0 {
		// A distance is never negative, so nothing can match.
		return nil
	}

	d := Levenshtein(rt.str, s)
	if d <= k {
		ret = append(ret, rt.str)
		if n > 0 && len(ret) >= n {
			return ret
		}
	}

	// Only children at distances in [d-k, d+k] can contain matches (triangle
	// inequality). The upper bound is derived without evaluating d+k first,
	// because that sum overflows for very large k and would skip every child.
	last := len(rt.child) - 1
	lo := d - k
	if lo < 0 {
		lo = 0
	}
	hi := last
	if k < last-d {
		hi = d + k
	}

	for i := lo; i <= hi; i++ {
		child := rt.child[i]
		if child == nil {
			continue
		}
		// Ask the subtree only for what is still missing.
		quota := n
		if n > 0 {
			quota = n - len(ret)
		}
		ret = append(ret, t.find(child, s, k, quota)...)
		if n > 0 && len(ret) >= n {
			return ret[:n]
		}
	}
	return ret
}

// Find returns the stored strings whose edit distance to s is at most k.
// If n < 0 there is no limit on the number of strings returned; otherwise at
// most n are returned. The result is never nil.
func (t *BKTree) Find(s string, k int, n int) []string {
	if t.root == nil {
		return []string{}
	}
	if ret := t.find(t.root, s, k, n); ret != nil {
		return ret
	}
	return []string{}
}

// Levenshtein is a method form of the package-level Levenshtein function.
func (t *BKTree) Levenshtein(s1, s2 string) int {
	return Levenshtein(s1, s2)
}

// Levenshtein returns the edit distance between s1 and s2, counting
// insertions, deletions and substitutions of runes (not bytes).
func Levenshtein(s1, s2 string) int {
	a, b := []rune(s1), []rune(s2)
	if len(a) == 0 {
		return len(b)
	}
	if len(b) == 0 {
		return len(a)
	}

	// Two rolling rows of the DP table: prev is row i, cur is row i+1.
	prev := make([]int, len(b)+1)
	cur := make([]int, len(b)+1)
	for j := range prev {
		prev[j] = j
	}

	for i, ra := range a {
		cur[0] = i + 1
		for j, rb := range b {
			if ra == rb {
				cur[j+1] = prev[j]
			} else {
				cur[j+1] = min(min(prev[j+1], cur[j]), prev[j]) + 1
			}
		}
		prev, cur = cur, prev
	}
	return prev[len(b)]
}

// min returns the smaller of a and b.
func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// max returns the larger of a and b.
func max(a, b int) int {
	if a < b {
		return b
	}
	return a
}
t.Fatal() 36 | } 37 | } 38 | 39 | func TestInsert(t *testing.T) { 40 | bk := New() 41 | bk.SetLevenshteinLimit(2) 42 | 43 | ret := bk.Find("hello", 1, 1) 44 | if len(ret) != 0 { 45 | t.Fatal() 46 | } 47 | 48 | if !bk.Insert("") { 49 | t.Fatal() 50 | } 51 | 52 | if bk.Insert("ABC") { 53 | t.Fatal() 54 | } 55 | 56 | if !bk.Insert("AB") { 57 | t.Fatal() 58 | } 59 | } 60 | 61 | func TestInsertAndFind(t *testing.T) { 62 | bk := New() 63 | bk.SetLevenshteinLimit(50) 64 | 65 | if bk.GetLevenshteinLimit() != 50 { 66 | t.Fatal() 67 | } 68 | 69 | bk.Insert("656") 70 | bk.Insert("67") 71 | bk.Insert("9313") 72 | bk.Insert("1178") 73 | bk.Insert("38") 74 | 75 | if bk.Size() != 5 { 76 | t.Fatal() 77 | } 78 | 79 | ret := bk.Find("87", 2, 2) 80 | if ret[0] != "67" || ret[1] != "38" { 81 | t.Fatal() 82 | } 83 | 84 | ret = bk.Find("87", 2, 1) 85 | if len(ret) != 1 { 86 | t.Fatal() 87 | } 88 | 89 | ret = bk.Find("87", 4, -1) 90 | if len(ret) != bk.Size() { 91 | t.Fatal() 92 | } 93 | } 94 | 95 | func Test(t *testing.T) { 96 | bk := New() 97 | bk.SetLevenshteinLimit(20) 98 | 99 | start := time.Now() 100 | for i := 0; i < 100000; i++ { 101 | buf := make([]byte, 5) 102 | io.ReadFull(rand.Reader, buf) 103 | s := base64.StdEncoding.EncodeToString(buf) 104 | bk.Insert(s) 105 | } 106 | fmt.Println("Insert", time.Since(start)) 107 | 108 | start = time.Now() 109 | for i := 0; i < 10; i++ { 110 | buf := make([]byte, 5) 111 | io.ReadFull(rand.Reader, buf) 112 | s := base64.StdEncoding.EncodeToString(buf) 113 | ret := bk.Find(s, 5, 2) 114 | if len(ret) > 2 { 115 | t.Fatal() 116 | } 117 | } 118 | fmt.Println("Find", time.Since(start)) 119 | } 120 | --------------------------------------------------------------------------------