├── .github └── workflows │ └── build.yml ├── .gitignore ├── Makefile ├── README.rst ├── UNLICENSE ├── alphabet.go ├── alphabet_test.go ├── base64.go ├── bitstring.go ├── bitwriter.go ├── example ├── basic │ └── usage.go └── pali │ ├── decode.go │ └── encode.go ├── frozentrie.go ├── frozentrie_test.go ├── go.mod ├── rankdirectory.go ├── reference ├── Bits.go ├── Bits.js ├── Bits_test.go ├── Bitsjs-pali.patch ├── Bitsjs.patch ├── Succinct Data Structures_ Cramming 80,000 words into a Javascript file_.pdf ├── buildSuccinctTrie.js ├── lookup.js ├── test.html ├── test.js └── variables.js ├── search.go ├── search_test.go ├── trie.go └── trie_test.go /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Test Package 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | go: [ '1.17', '1.8' ] 14 | name: Test go-succinct-data-structure-trie Package 15 | steps: 16 | - uses: actions/checkout@v2.3.1 17 | - uses: actions/setup-go@v2 18 | with: 19 | go-version: ${{ matrix.go }} 20 | #- run: make test 21 | - run: go test -v 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | pkg/ 2 | src/ 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # cannot use relative path in GOROOT, otherwise 6g not found. 
For example, 2 | # export GOROOT=../go (=> 6g not found) 3 | # it is also not allowed to use relative path in GOPATH 4 | ifndef GOROOT 5 | export GOROOT=$(realpath $(CURDIR)/../go) 6 | export PATH := $(GOROOT)/bin:$(PATH) 7 | endif 8 | 9 | 10 | test: 11 | @# -v means verbose, can see logs of t.Log 12 | @go test -v -race 13 | 14 | run_basic: 15 | @go run example/basic/usage.go 16 | 17 | run_pali: 18 | @cd example/pali; go run encode.go 19 | @cd example/pali; go run decode.go 20 | 21 | bitsjs: 22 | chromium-browser reference/test.html 23 | 24 | fmt: 25 | @go fmt *.go 26 | @go fmt example/basic/*.go 27 | @go fmt example/pali/*.go 28 | 29 | help: 30 | @go help 31 | 32 | modinit: 33 | go mod init github.com/siongui/go-succinct-data-structure-trie 34 | 35 | modtidy: 36 | go mod tidy 37 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ================================ 2 | `Succinct Data Structure`_ Trie_ 3 | ================================ 4 | 5 | .. image:: https://img.shields.io/badge/Language-Go-blue.svg 6 | :target: https://golang.org/ 7 | 8 | .. image:: https://godoc.org/github.com/siongui/go-succinct-data-structure-trie?status.svg 9 | :target: https://godoc.org/github.com/siongui/go-succinct-data-structure-trie 10 | 11 | .. image:: https://github.com/siongui/go-succinct-data-structure-trie/workflows/Test%20Package/badge.svg 12 | :target: https://github.com/siongui/go-succinct-data-structure-trie/blob/master/.github/workflows/build.yml 13 | 14 | .. image:: https://goreportcard.com/badge/github.com/siongui/go-succinct-data-structure-trie 15 | :target: https://goreportcard.com/report/github.com/siongui/go-succinct-data-structure-trie 16 | 17 | .. image:: https://img.shields.io/badge/license-Unlicense-blue.svg 18 | :target: https://raw.githubusercontent.com/siongui/go-succinct-data-structure-trie/master/UNLICENSE 19 | 20 | .. 
image:: https://img.shields.io/twitter/url/https/github.com/siongui/go-succinct-data-structure-trie.svg?style=social 21 | :target: https://twitter.com/intent/tweet?text=Wow:&url=%5Bobject%20Object%5D 22 | 23 | 24 | Implementation of `Succinct Trie`_ [1]_ in Go_. 25 | 26 | The trie structure is great for fast lookup of dictionary words, but if the 27 | vocabulary of the dictionary is big, it may take a lot of space to store the 28 | constructed trie. For this reason, a succinct data structure is applied to the 29 | trie structure so that we can have both fast lookup and a small space requirement. 30 | 31 | 32 | Usage 33 | ===== 34 | 35 | - Basic example: `basic usage `__ 36 | - Advanced example: `pali dir `__ 37 | 38 | UNLICENSE 39 | ========= 40 | 41 | Released in public domain. See UNLICENSE_. 42 | 43 | 44 | References 45 | ========== 46 | 47 | .. [1] `Succinct Data Structures: Cramming 80,000 words into a Javascript file. `_ 48 | (`source code `__) 49 | 50 | .. [2] Google Search `succinct data structure `__ 51 | 52 | .. [3] Google Search `succinct trie `__ 53 | 54 | .. [4] Google Search `golang const array `__ 55 | 56 | .. [5] Google Search `golang function as argument `__ 57 | 58 | .. [6] Google Search `golang charcodeat `__ 59 | 60 | `string - Go lang's equivalent of charCode() method of JavaScript - Stack Overflow `_ 61 | 62 | .. [7] `[Golang] Succinct Trie Implementation `_ 63 | 64 | .. [8] `[JavaScript] Bug in Succinct Trie Implementation of Bits.js `_ 65 | 66 | .. _Go: https://golang.org/ 67 | .. _UNLICENSE: https://unlicense.org/ 68 | .. _Succinct Data Structure: https://www.google.com/search?q=Succinct+Data+Structure 69 | .. _Trie: https://www.google.com/search?q=Trie 70 | .. 
_Succinct Trie: https://www.google.com/search?q=Succinct+Trie 71 | -------------------------------------------------------------------------------- /UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 
23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /alphabet.go: -------------------------------------------------------------------------------- 1 | package bits 2 | 3 | /** 4 | * Set alphabet of words 5 | */ 6 | 7 | import "strings" 8 | 9 | //var allowedCharacters = "abcdeghijklmnoprstuvyāīūṁṃŋṇṅñṭḍḷ…'’° -" 10 | var allowedCharacters = "abcdefghijklmnopqrstuvwxyz " 11 | var mapCharToUint = getCharToUintMap(allowedCharacters) 12 | var mapUintToChar = getUintToCharMap(mapCharToUint) 13 | 14 | /** 15 | * Write the data for each node, call getDataBits() to calculate how many bits 16 | * for one node. 17 | * 1 bit stores the "final" indicator. The other bits store one of the 18 | * characters of the alphabet. 19 | */ 20 | var dataBits = getDataBits(allowedCharacters) 21 | 22 | func SetAllowedCharacters(alphabet string) { 23 | allowedCharacters = alphabet 24 | mapCharToUint = getCharToUintMap(alphabet) 25 | mapUintToChar = getUintToCharMap(mapCharToUint) 26 | dataBits = getDataBits(alphabet) 27 | } 28 | 29 | func getCharToUintMap(alphabet string) map[string]uint { 30 | result := map[string]uint{} 31 | 32 | var i uint = 0 33 | chars := strings.Split(alphabet, "") 34 | for _, char := range chars { 35 | result[char] = i 36 | i++ 37 | } 38 | 39 | return result 40 | } 41 | 42 | func getUintToCharMap(c2ui map[string]uint) map[uint]string { 43 | result := map[uint]string{} 44 | for k, v := range c2ui { 45 | result[v] = k 46 | } 47 | return result 48 | } 49 | 50 | func getDataBits(alphabet string) uint { 51 | numOfChars := len(strings.Split(alphabet, "")) 52 | var i uint = 0 53 | 54 | for (1 << i) < numOfChars { 55 | i++ 56 | } 57 | 58 | // one more bit for the "final" indicator 59 | return (i + 1) 60 | } 61 | -------------------------------------------------------------------------------- /alphabet_test.go: -------------------------------------------------------------------------------- 1 | 
package bits 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestAlphabet(t *testing.T) { 8 | if len(mapCharToUint) != 27 { 9 | t.Error("len(mapCharToUint) != 27") 10 | t.Log(mapCharToUint) 11 | } 12 | if len(mapUintToChar) != 27 { 13 | t.Error("len(mapUintToChar) != 27") 14 | t.Log(mapUintToChar) 15 | } 16 | if dataBits != 6 { 17 | t.Error("dataBits != 6") 18 | t.Log(dataBits) 19 | } 20 | 21 | SetAllowedCharacters("abcdeghijklmnoprstuvyāīūṁṃŋṇṅñṭḍḷ…'’° -") 22 | 23 | if len(mapCharToUint) != 39 { 24 | t.Error("len(mapCharToUint) != 39") 25 | t.Log(mapCharToUint) 26 | } 27 | if len(mapUintToChar) != 39 { 28 | t.Error("len(mapUintToChar) != 39") 29 | t.Log(mapUintToChar) 30 | } 31 | if dataBits != 7 { 32 | t.Error("dataBits != 7") 33 | t.Log(dataBits) 34 | } 35 | 36 | SetAllowedCharacters("abcdefghijklmnopqrstuvwxyz ") 37 | if dataBits != 6 { 38 | t.Error("dataBits != 6") 39 | t.Log(dataBits) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /base64.go: -------------------------------------------------------------------------------- 1 | package bits 2 | 3 | // Configure the bit writing and reading functions to work natively in BASE-64 4 | // encoding. That way, we don't have to convert back and forth to bytes. 5 | 6 | var BASE64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" 7 | 8 | /** 9 | The width of each unit of the encoding, in bits. Here we use 6, for base-64 10 | encoding. 11 | */ 12 | var W uint = 6 13 | 14 | /** 15 | Returns the character unit that represents the given value. If this were 16 | binary data, we would simply return id. 17 | */ 18 | func CHR(id uint) string { 19 | return BASE64[id : id+1] 20 | } 21 | 22 | /** 23 | Returns the decimal value of the given character unit. 
24 | */ 25 | var BASE64_CACHE = map[string]uint{ 26 | "A": 0, "B": 1, "C": 2, "D": 3, "E": 4, "F": 5, "G": 6, "H": 7, 27 | "I": 8, "J": 9, "K": 10, "L": 11, "M": 12, "N": 13, "O": 14, 28 | "P": 15, "Q": 16, "R": 17, "S": 18, "T": 19, "U": 20, "V": 21, 29 | "W": 22, "X": 23, "Y": 24, "Z": 25, "a": 26, "b": 27, "c": 28, 30 | "d": 29, "e": 30, "f": 31, "g": 32, "h": 33, "i": 34, "j": 35, 31 | "k": 36, "l": 37, "m": 38, "n": 39, "o": 40, "p": 41, "q": 42, 32 | "r": 43, "s": 44, "t": 45, "u": 46, "v": 47, "w": 48, "x": 49, 33 | "y": 50, "z": 51, "0": 52, "1": 53, "2": 54, "3": 55, "4": 56, 34 | "5": 57, "6": 58, "7": 59, "8": 60, "9": 61, "-": 62, "_": 63, 35 | } 36 | 37 | func ORD(ch string) uint { 38 | // Used to be: return BASE64.indexOf(ch); 39 | return BASE64_CACHE[ch] 40 | } 41 | -------------------------------------------------------------------------------- /bitstring.go: -------------------------------------------------------------------------------- 1 | package bits 2 | 3 | /** 4 | Given a string of data (eg, in BASE-64), the BitString class supports 5 | reading or counting a number of bits from an arbitrary position in the 6 | string. 
7 | */ 8 | type BitString struct { 9 | base64DataString string 10 | length uint 11 | } 12 | 13 | var MaskTop = [7]uint{ 14 | 0x3f, 0x1f, 0x0f, 0x07, 0x03, 0x01, 0x00, 15 | } 16 | 17 | var BitsInByte = [256]uint{ 18 | 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 19 | 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 20 | 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 21 | 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 22 | 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 23 | 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 24 | 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 25 | 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 26 | 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 27 | 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 28 | 6, 7, 6, 7, 7, 8, 29 | } 30 | 31 | func (bs *BitString) Init(data string) { 32 | bs.base64DataString = data 33 | bs.length = uint(len(bs.base64DataString)) * W 34 | } 35 | 36 | /** 37 | Returns the internal string of bytes 38 | */ 39 | func (bs *BitString) GetData() string { 40 | return bs.base64DataString 41 | } 42 | 43 | /** 44 | Returns a decimal number, consisting of a certain number, n, of bits 45 | starting at a certain position, p. 
46 | */ 47 | func (bs *BitString) Get(p, n uint) uint { 48 | 49 | // case 1: bits lie within the given byte 50 | if (p%W)+n <= W { 51 | idx := p/W | 0 52 | return (ORD(bs.base64DataString[idx:idx+1]) & MaskTop[p%W]) >> 53 | (W - p%W - n) 54 | 55 | // case 2: bits lie incompletely in the given byte 56 | } else { 57 | idx := p/W | 0 58 | result := (ORD(bs.base64DataString[idx:idx+1]) & MaskTop[p%W]) 59 | 60 | l := W - p%W 61 | p += l 62 | n -= l 63 | 64 | for n >= W { 65 | idx := p/W | 0 66 | result = (result << W) | ORD(bs.base64DataString[idx:idx+1]) 67 | p += W 68 | n -= W 69 | } 70 | 71 | if n > 0 { 72 | idx := p/W | 0 73 | result = (result << n) | (ORD(bs.base64DataString[idx:idx+1]) >> 74 | (W - n)) 75 | } 76 | 77 | return result 78 | } 79 | } 80 | 81 | /** 82 | Counts the number of bits set to 1 starting at position p and 83 | ending at position p + n 84 | */ 85 | func (bs *BitString) Count(p, n uint) uint { 86 | 87 | var count uint = 0 88 | for n >= 8 { 89 | count += BitsInByte[bs.Get(p, 8)] 90 | p += 8 91 | n -= 8 92 | } 93 | 94 | return count + BitsInByte[bs.Get(p, n)] 95 | } 96 | 97 | /** 98 | Returns the number of bits set to 1 up to and including position x. 99 | This is the slow implementation used for testing. 100 | */ 101 | func (bs *BitString) Rank(x uint) uint { 102 | var rank uint = 0 103 | var i uint = 0 104 | for i = 0; i <= x; i++ { 105 | // FIXME: the above line should be the following??? 106 | //for i = 0; i < x; i++ { 107 | if bs.Get(i, 1) != 0 { 108 | rank++ 109 | } 110 | } 111 | 112 | return rank 113 | } 114 | -------------------------------------------------------------------------------- /bitwriter.go: -------------------------------------------------------------------------------- 1 | package bits 2 | 3 | import "strings" 4 | 5 | /** 6 | The BitWriter will create a stream of bytes, letting you write a certain 7 | number of bits at a time. This is part of the encoder, so it is not 8 | optimized for memory or speed. 
9 | */ 10 | type BitWriter struct { 11 | bits []uint 12 | } 13 | 14 | /** 15 | Write some data to the bit string. The number of bits must be 32 or 16 | fewer. 17 | */ 18 | func (bw *BitWriter) Write(data, numBits uint) { 19 | //for i := (numBits-1); i >= 0; i-- { 20 | // @siongui: the above commented line will cause infinite loop, why??? 21 | // answer from @xphoenix: 22 | // Because i becomes uint, let's check iteration when i == 0, at the end 23 | // of loop, i-- takes place but as i is uint, it leads to 2^32-1 instead 24 | // of -1, loop condition is still true... 25 | for i := numBits; i > 0; i-- { 26 | j := i - 1 27 | if (data & (1 << j)) != 0 { 28 | bw.bits = append(bw.bits, 1) 29 | } else { 30 | bw.bits = append(bw.bits, 0) 31 | } 32 | } 33 | } 34 | 35 | /** 36 | Get the bitstring represented as a javascript string of bytes 37 | */ 38 | func (bw *BitWriter) GetData() string { 39 | var chars []string 40 | var b, i uint = 0, 0 41 | 42 | for j := 0; j < len(bw.bits); j++ { 43 | b = (b << 1) | bw.bits[j] 44 | i += 1 45 | if i == W { 46 | chars = append(chars, CHR(b)) 47 | i = 0 48 | b = 0 49 | } 50 | } 51 | 52 | if i != 0 { 53 | chars = append(chars, CHR(b<<(W-i))) 54 | } 55 | 56 | return strings.Join(chars, "") 57 | } 58 | 59 | /** 60 | Returns the bits as a human readable binary string for debugging 61 | */ 62 | func (bw *BitWriter) GetDebugString(group uint) string { 63 | var chars []string 64 | var i uint = 0 65 | 66 | for j := 0; j < len(bw.bits); j++ { 67 | if bw.bits[j] == 1 { 68 | chars = append(chars, "1") 69 | } else { 70 | chars = append(chars, "0") 71 | } 72 | i++ 73 | if i == group { 74 | chars = append(chars, " ") 75 | i = 0 76 | } 77 | } 78 | 79 | return strings.Join(chars, "") 80 | } 81 | -------------------------------------------------------------------------------- /example/basic/usage.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | bits 
"github.com/siongui/go-succinct-data-structure-trie" 5 | ) 6 | 7 | func insertNotInAlphabeticalOrder(te *bits.Trie) { 8 | te.Insert("apple") 9 | te.Insert("orange") 10 | te.Insert("alphapha") 11 | te.Insert("lamp") 12 | te.Insert("hello") 13 | te.Insert("jello") 14 | te.Insert("quiz") 15 | } 16 | 17 | func main() { 18 | // optional: set alphabet of words 19 | //bits.SetAllowedCharacters("abcdeghijklmnoprstuvyāīūṁṃŋṇṅñṭḍḷ…'’° -") 20 | // Note that you must include space " " in your alphabet if you do not use the 21 | // default alphabet. 22 | // default alphabet is [a-z ], i.e., 23 | // bits.SetAllowedCharacters("abcdefghijklmnopqrstuvwxyz ") 24 | 25 | // encode: build succinct trie 26 | te := bits.Trie{} 27 | te.Init() 28 | // encode: insert words 29 | insertNotInAlphabeticalOrder(&te) 30 | // encode: trie encoding 31 | teData := te.Encode() 32 | println(teData) 33 | println(te.GetNodeCount()) 34 | // encode: build cache for quick lookup 35 | rd := bits.CreateRankDirectory(teData, te.GetNodeCount()*2+1, bits.L1, bits.L2) 36 | println(rd.GetData()) 37 | 38 | // decode: build frozen succinct trie 39 | ft := bits.FrozenTrie{} 40 | ft.Init(teData, rd.GetData(), te.GetNodeCount()) 41 | 42 | // decode: look up words 43 | println(ft.Lookup("apple")) 44 | println(ft.Lookup("appl")) 45 | println(ft.Lookup("applee")) 46 | 47 | // decode: words suggestion (find words that start with "prefix") 48 | // find words starts with "a", max number of returned words is 10 49 | for _, word := range ft.GetSuggestedWords("a", 10) { 50 | println(word) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /example/pali/decode.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "io/ioutil" 6 | 7 | bits "github.com/siongui/go-succinct-data-structure-trie" 8 | ) 9 | 10 | type TrieData struct { 11 | EncodedData string 12 | NodeCount uint 13 | RankDirectoryData 
string 14 | } 15 | 16 | func loadTrie(filePath string) (td TrieData, err error) { 17 | b, err := ioutil.ReadFile(filePath) 18 | if err != nil { 19 | return 20 | } 21 | 22 | err = json.Unmarshal(b, &td) 23 | return 24 | } 25 | 26 | func main() { 27 | // Set alphabet of words 28 | bits.SetAllowedCharacters("abcdeghijklmnoprstuvyāīūṁṃŋṇṅñṭḍḷ…'’° -") 29 | // Note that you must include space " " in your alphabet if you do not 30 | // use the default alphabet. 31 | // default alphabet is [a-z ], i.e., 32 | // bits.SetAllowedCharacters("abcdefghijklmnopqrstuvwxyz ") 33 | 34 | td, err := loadTrie("trie.json") 35 | if err != nil { 36 | panic(err) 37 | } 38 | 39 | println(td.EncodedData) 40 | println(td.NodeCount) 41 | println(td.RankDirectoryData) 42 | 43 | // decode: build frozen succinct trie 44 | ft := bits.FrozenTrie{} 45 | ft.Init(td.EncodedData, td.RankDirectoryData, td.NodeCount) 46 | 47 | // decode: look up words 48 | println(ft.Lookup("sacca")) 49 | println(ft.Lookup("sacc")) 50 | println(ft.Lookup("dhamma")) 51 | 52 | // decode: words suggestion (find words that start with "prefix") 53 | // find words starts with "a", max number of returned words is 10 54 | for _, word := range ft.GetSuggestedWords("a", 10) { 55 | println(word) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /example/pali/encode.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "io/ioutil" 6 | 7 | bits "github.com/siongui/go-succinct-data-structure-trie" 8 | ) 9 | 10 | type TrieData struct { 11 | EncodedData string 12 | NodeCount uint 13 | RankDirectoryData string 14 | } 15 | 16 | func saveTrie(t bits.Trie) (err error) { 17 | rd := bits.CreateRankDirectory(t.Encode(), t.GetNodeCount()*2+1, bits.L1, bits.L2) 18 | td := TrieData{ 19 | EncodedData: t.Encode(), 20 | NodeCount: t.GetNodeCount(), 21 | RankDirectoryData: rd.GetData(), 22 | } 23 | 24 | b, err := 
json.Marshal(td) 25 | if err != nil { 26 | return 27 | } 28 | 29 | err = ioutil.WriteFile("trie.json", b, 0644) 30 | return 31 | } 32 | 33 | func insertNotInAlphabeticalOrder(te *bits.Trie) { 34 | te.Insert("sacca") 35 | te.Insert("ariya") 36 | te.Insert("saccavācā") 37 | te.Insert("dhammaṃ") 38 | te.Insert("buddho") 39 | te.Insert("viharati") 40 | } 41 | 42 | func main() { 43 | // Set alphabet of words 44 | bits.SetAllowedCharacters("abcdeghijklmnoprstuvyāīūṁṃŋṇṅñṭḍḷ…'’° -") 45 | // Note that you must include space " " in your alphabet if you do not 46 | // use the default alphabet. 47 | // default alphabet is [a-z ], i.e., 48 | // bits.SetAllowedCharacters("abcdefghijklmnopqrstuvwxyz ") 49 | 50 | // encode: build succinct trie 51 | te := bits.Trie{} 52 | te.Init() 53 | // encode: insert words 54 | insertNotInAlphabeticalOrder(&te) 55 | // encode: trie encoding 56 | teData := te.Encode() 57 | println(teData) 58 | println(te.GetNodeCount()) 59 | // encode: build cache for quick lookup 60 | rd := bits.CreateRankDirectory(teData, te.GetNodeCount()*2+1, bits.L1, bits.L2) 61 | println(rd.GetData()) 62 | 63 | err := saveTrie(te) 64 | if err != nil { 65 | panic(err) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /frozentrie.go: -------------------------------------------------------------------------------- 1 | package bits 2 | 3 | import "unicode/utf8" 4 | 5 | /** 6 | This class is used for traversing the succinctly encoded trie. 7 | */ 8 | type FrozenTrieNode struct { 9 | trie *FrozenTrie 10 | index uint 11 | letter string 12 | final bool 13 | firstChild uint 14 | childCount uint 15 | } 16 | 17 | /** 18 | Returns the number of children. 19 | */ 20 | func (f *FrozenTrieNode) GetChildCount() uint { 21 | return f.childCount 22 | } 23 | 24 | /** 25 | Returns the FrozenTrieNode for the given child. 26 | 27 | @param index The 0-based index of the child of this node. 
For example, if 28 | the node has 5 children, and you wanted the 0th one, pass in 0. 29 | */ 30 | func (f *FrozenTrieNode) GetChild(index uint) FrozenTrieNode { 31 | return f.trie.GetNodeByIndex(f.firstChild + index) 32 | } 33 | 34 | /** 35 | The FrozenTrie is used for looking up words in the encoded trie. 36 | 37 | @param data A string representing the encoded trie. 38 | 39 | @param directoryData A string representing the RankDirectory. The global L1 40 | and L2 constants are used to determine the L1Size and L2Size. 41 | 42 | @param nodeCount The number of nodes in the trie. 43 | */ 44 | type FrozenTrie struct { 45 | data BitString 46 | directory RankDirectory 47 | letterStart uint 48 | } 49 | 50 | func (f *FrozenTrie) Init(data, directoryData string, nodeCount uint) { 51 | f.data.Init(data) 52 | f.directory.Init(directoryData, data, nodeCount*2+1, L1, L2) 53 | 54 | // The position of the first bit of the data in 0th node. In non-root 55 | // nodes, this would contain (dataBits)-bit letters. 56 | f.letterStart = nodeCount*2 + 1 57 | } 58 | 59 | /** 60 | Retrieve the FrozenTrieNode of the trie, given its index in level-order. 61 | This is a private function that you don't have to use. 62 | */ 63 | func (f *FrozenTrie) GetNodeByIndex(index uint) FrozenTrieNode { 64 | // retrieve the (dataBits)-bit letter. 65 | final := (f.data.Get(f.letterStart+index*dataBits, 1) == 1) 66 | letter, ok := mapUintToChar[f.data.Get(f.letterStart+index*dataBits+1, (dataBits-1))] 67 | if !ok { 68 | panic("illegal: bits -> char failed") 69 | } 70 | firstChild := f.directory.Select(0, index+1) - index 71 | 72 | // Since the nodes are in level order, this node's children must go up 73 | // until the next node's children start. 
74 | childOfNextNode := f.directory.Select(0, index+2) - index - 1 75 | 76 | return FrozenTrieNode{ 77 | trie: f, 78 | index: index, 79 | letter: letter, 80 | final: final, 81 | firstChild: firstChild, 82 | childCount: (childOfNextNode - firstChild), 83 | } 84 | } 85 | 86 | /** 87 | Retrieve the root node. You can use this node to obtain all of the other 88 | nodes in the trie. 89 | */ 90 | func (f *FrozenTrie) GetRoot() FrozenTrieNode { 91 | return f.GetNodeByIndex(0) 92 | } 93 | 94 | /** 95 | Look-up a word in the trie. Returns true if and only if the word exists 96 | in the trie. 97 | */ 98 | func (f *FrozenTrie) Lookup(word string) bool { 99 | node := f.GetRoot() 100 | for i, w := 0, 0; i < len(word); i += w { 101 | runeValue, width := utf8.DecodeRuneInString(word[i:]) 102 | w = width 103 | var child FrozenTrieNode 104 | var j uint = 0 105 | for ; j < node.GetChildCount(); j++ { 106 | child = node.GetChild(j) 107 | if child.letter == string(runeValue) { 108 | break 109 | } 110 | } 111 | 112 | if j == node.GetChildCount() { 113 | return false 114 | } 115 | node = child 116 | } 117 | 118 | return node.final 119 | } 120 | -------------------------------------------------------------------------------- /frozentrie_test.go: -------------------------------------------------------------------------------- 1 | package bits 2 | 3 | import "testing" 4 | 5 | func TestLookup(t *testing.T) { 6 | te := Trie{} 7 | te.Init() 8 | insertNotInAlphabeticalOrder(&te) 9 | teData := te.Encode() 10 | rd := CreateRankDirectory(teData, te.GetNodeCount()*2+1, L1, L2) 11 | 12 | ft := FrozenTrie{} 13 | ft.Init(teData, rd.GetData(), te.GetNodeCount()) 14 | 15 | if ft.Lookup("apple") != true { 16 | t.Error("apple") 17 | } 18 | if ft.Lookup("appl") != false { 19 | t.Error("appl") 20 | } 21 | if ft.Lookup("applea") != false { 22 | t.Error("applea") 23 | } 24 | if ft.Lookup("orange") != true { 25 | t.Error("orange") 26 | } 27 | if ft.Lookup("lamp") != true { 28 | t.Error("lamp") 29 | } 30 | if 
ft.Lookup("hello") != true { 31 | t.Error("hello") 32 | } 33 | if ft.Lookup("jello") != true { 34 | t.Error("jello") 35 | } 36 | if ft.Lookup("quiz") != true { 37 | t.Error("quiz") 38 | } 39 | if ft.Lookup("quize") != false { 40 | t.Error("quize") 41 | } 42 | if ft.Lookup("alphaph") != false { 43 | t.Error("alphaph") 44 | } 45 | if ft.Lookup("alphapha") != true { 46 | t.Error("alphapha") 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/siongui/go-succinct-data-structure-trie 2 | 3 | go 1.17 4 | -------------------------------------------------------------------------------- /rankdirectory.go: -------------------------------------------------------------------------------- 1 | package bits 2 | 3 | import "math" 4 | 5 | /** 6 | Fixed values for the L1 and L2 table sizes in the Rank Directory 7 | */ 8 | var L1 uint = 32 * 32 9 | var L2 uint = 32 10 | 11 | /** 12 | The rank directory allows you to build an index to quickly compute the 13 | rank() and select() functions. The index can itself be encoded as a binary 14 | string. 15 | */ 16 | type RankDirectory struct { 17 | directory BitString 18 | data BitString // data of succinct trie 19 | l1Size uint 20 | l2Size uint 21 | l1Bits uint 22 | l2Bits uint 23 | sectionBits uint 24 | numBits uint 25 | } 26 | 27 | /** 28 | Used to build a rank directory from the given input string. 29 | 30 | @param data A javascript string containing the data, as readable using the 31 | BitString object. 32 | 33 | @param numBits The number of bits to index. 34 | 35 | @param l1Size The number of bits that each entry in the Level 1 table 36 | summarizes. This should be a multiple of l2Size. 37 | 38 | @param l2Size The number of bits that each entry in the Level 2 table 39 | summarizes. 
40 | */ 41 | func CreateRankDirectory(data string, numBits, l1Size, l2Size uint) RankDirectory { 42 | bits := BitString{} 43 | bits.Init(data) 44 | var p, i uint = 0, 0 45 | var count1, count2 uint = 0, 0 46 | l1bits := uint(math.Ceil(math.Log2(float64(numBits)))) 47 | l2bits := uint(math.Ceil(math.Log2(float64(l1Size)))) 48 | 49 | directory := BitWriter{} 50 | 51 | for p+l2Size <= numBits { 52 | count2 += bits.Count(p, l2Size) 53 | i += l2Size 54 | p += l2Size 55 | if i == l1Size { 56 | count1 += count2 57 | directory.Write(count1, l1bits) 58 | count2 = 0 59 | i = 0 60 | } else { 61 | directory.Write(count2, l2bits) 62 | } 63 | } 64 | 65 | rd := RankDirectory{} 66 | rd.Init(directory.GetData(), data, numBits, l1Size, l2Size) 67 | return rd 68 | } 69 | 70 | func (rd *RankDirectory) Init(directoryData, bitData string, numBits, l1Size, l2Size uint) { 71 | rd.directory.Init(directoryData) 72 | rd.data.Init(bitData) 73 | rd.l1Size = l1Size 74 | rd.l2Size = l2Size 75 | rd.l1Bits = uint(math.Ceil(math.Log2(float64(numBits)))) 76 | rd.l2Bits = uint(math.Ceil(math.Log2(float64(l1Size)))) 77 | rd.sectionBits = (l1Size/l2Size-1)*rd.l2Bits + rd.l1Bits 78 | rd.numBits = numBits 79 | } 80 | 81 | /** 82 | Returns the string representation of the directory. 83 | */ 84 | func (rd *RankDirectory) GetData() string { 85 | return rd.directory.GetData() 86 | } 87 | 88 | /** 89 | Returns the number of 1 or 0 bits (depending on the "which" parameter) up 90 | to and including position x. 
91 | */ 92 | func (rd *RankDirectory) Rank(which, x uint) uint { 93 | 94 | if which == 0 { 95 | return x - rd.Rank(1, x) + 1 96 | } 97 | 98 | var rank uint = 0 99 | o := x 100 | var sectionPos uint = 0 101 | 102 | if o >= rd.l1Size { 103 | sectionPos = (o/rd.l1Size | 0) * rd.sectionBits 104 | rank = rd.directory.Get(sectionPos-rd.l1Bits, rd.l1Bits) 105 | o = o % rd.l1Size 106 | } 107 | 108 | if o >= rd.l2Size { 109 | sectionPos += (o/rd.l2Size | 0) * rd.l2Bits 110 | rank += rd.directory.Get(sectionPos-rd.l2Bits, rd.l2Bits) 111 | } 112 | 113 | rank += rd.data.Count(x-x%rd.l2Size, x%rd.l2Size+1) 114 | 115 | return rank 116 | } 117 | 118 | /** 119 | Returns the position of the y'th 0 or 1 bit, depending on the "which" 120 | parameter. 121 | */ 122 | func (rd *RankDirectory) Select(which, y uint) uint { 123 | high := int(rd.numBits) 124 | low := -1 125 | val := -1 126 | 127 | for high-low > 1 { 128 | probe := (high+low)/2 | 0 129 | r := rd.Rank(which, uint(probe)) 130 | 131 | if r == y { 132 | // We have to continue searching after we have found it, 133 | // because we want the _first_ occurrence. 134 | val = probe 135 | high = probe 136 | } else if r < y { 137 | low = probe 138 | } else { 139 | high = probe 140 | } 141 | } 142 | 143 | return uint(val) 144 | } 145 | -------------------------------------------------------------------------------- /reference/Bits.go: -------------------------------------------------------------------------------- 1 | /* 2 | A Succinct Trie for Go 3 | 4 | By Siong-Ui Te 5 | Released to the public domain. 6 | translated From: 7 | 8 | A Succinct Trie for Javascript 9 | 10 | By Steve Hanov 11 | Released to the public domain. 12 | 13 | This file contains functions for creating a succinctly encoded trie structure 14 | from a list of words. The trie is encoded to a succinct bit string using the 15 | method of Jacobson (1989). The bitstring is then encoded using BASE-64. 16 | 17 | The resulting trie does not have to be decoded to be used. 
This file also 18 | contains functions for looking up a word in the BASE-64 encoded data, in 19 | O(mlogn) time, where m is the number of letters in the target word, and n is 20 | the number of nodes in the trie. 21 | 22 | Objects for encoding: 23 | 24 | TrieNode 25 | Trie 26 | BitWriter 27 | 28 | Objects for decoding: 29 | BitString 30 | FrozenTrieNode 31 | FrozenTrie 32 | 33 | QUICK USAGE: 34 | 35 | Suppose we let data be some output of the demo encoder: 36 | 37 | var data = { 38 | "nodeCount": 37, 39 | "directory": "BMIg", 40 | "trie": "v2qqqqqqqpIUn4A5JZyBZ4ggCKh55ZZgBA5ZZd5vIEl1wx8g8A" 41 | }; 42 | 43 | var frozenTrie = new FrozenTrie( Data.trie, Data.directory, Data.nodeCount); 44 | 45 | alert( frozenTrie.lookup( "hello" ) ); // outputs true 46 | alert( frozenTrie.lookup( "kwijibo" ) ); // outputs false 47 | 48 | */ 49 | package Bits 50 | 51 | import ( 52 | "math" 53 | "strings" 54 | ) 55 | 56 | // Configure the bit writing and reading functions to work natively in BASE-64 57 | // encoding. That way, we don't have to convert back and forth to bytes. 58 | 59 | var BASE64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" 60 | 61 | /** 62 | The width of each unit of the encoding, in bits. Here we use 6, for base-64 63 | encoding. 64 | */ 65 | var W uint = 6 66 | 67 | /** 68 | Returns the character unit that represents the given value. If this were 69 | binary data, we would simply return id. 70 | */ 71 | func CHR(id uint) string { 72 | return BASE64[id : id+1] 73 | } 74 | 75 | /** 76 | Returns the decimal value of the given character unit. 
77 | */ 78 | var BASE64_CACHE = map[string]uint{ 79 | "A": 0, "B": 1, "C": 2, "D": 3, "E": 4, "F": 5, "G": 6, "H": 7, 80 | "I": 8, "J": 9, "K": 10, "L": 11, "M": 12, "N": 13, "O": 14, 81 | "P": 15, "Q": 16, "R": 17, "S": 18, "T": 19, "U": 20, "V": 21, 82 | "W": 22, "X": 23, "Y": 24, "Z": 25, "a": 26, "b": 27, "c": 28, 83 | "d": 29, "e": 30, "f": 31, "g": 32, "h": 33, "i": 34, "j": 35, 84 | "k": 36, "l": 37, "m": 38, "n": 39, "o": 40, "p": 41, "q": 42, 85 | "r": 43, "s": 44, "t": 45, "u": 46, "v": 47, "w": 48, "x": 49, 86 | "y": 50, "z": 51, "0": 52, "1": 53, "2": 54, "3": 55, "4": 56, 87 | "5": 57, "6": 58, "7": 59, "8": 60, "9": 61, "-": 62, "_": 63, 88 | } 89 | 90 | func ORD(ch string) uint { 91 | // Used to be: return BASE64.indexOf(ch); 92 | return BASE64_CACHE[ch] 93 | } 94 | 95 | /** 96 | Fixed values for the L1 and L2 table sizes in the Rank Directory 97 | */ 98 | var L1 uint = 32 * 32 99 | var L2 uint = 32 100 | 101 | /** 102 | The BitWriter will create a stream of bytes, letting you write a certain 103 | number of bits at a time. This is part of the encoder, so it is not 104 | optimized for memory or speed. 105 | */ 106 | type BitWriter struct { 107 | bits []uint 108 | } 109 | 110 | /** 111 | Write some data to the bit string. The number of bits must be 32 or 112 | fewer. 113 | */ 114 | func (bw *BitWriter) Write(data, numBits uint) { 115 | //for i := (numBits-1); i >= 0; i-- { 116 | //FIXME: the above commented line will cause infinite loop, why??? 
117 | for i := numBits; i > 0; i-- { 118 | j := i - 1 119 | if (data & (1 << j)) != 0 { 120 | bw.bits = append(bw.bits, 1) 121 | } else { 122 | bw.bits = append(bw.bits, 0) 123 | } 124 | } 125 | } 126 | 127 | /** 128 | Get the bitstring represented as a javascript string of bytes 129 | */ 130 | func (bw *BitWriter) GetData() string { 131 | var chars []string 132 | var b, i uint = 0, 0 133 | 134 | for j := 0; j < len(bw.bits); j++ { 135 | b = (b << 1) | bw.bits[j] 136 | i += 1 137 | if i == W { 138 | chars = append(chars, CHR(b)) 139 | i = 0 140 | b = 0 141 | } 142 | } 143 | 144 | if i != 0 { 145 | chars = append(chars, CHR(b<<(W-i))) 146 | } 147 | 148 | return strings.Join(chars, "") 149 | } 150 | 151 | /** 152 | Returns the bits as a human readable binary string for debugging 153 | */ 154 | func (bw *BitWriter) GetDebugString(group uint) string { 155 | var chars []string 156 | var i uint = 0 157 | 158 | for j := 0; j < len(bw.bits); j++ { 159 | if bw.bits[j] == 1 { 160 | chars = append(chars, "1") 161 | } else { 162 | chars = append(chars, "0") 163 | } 164 | i++ 165 | if i == group { 166 | chars = append(chars, " ") 167 | i = 0 168 | } 169 | } 170 | 171 | return strings.Join(chars, "") 172 | } 173 | 174 | /** 175 | Given a string of data (eg, in BASE-64), the BitString class supports 176 | reading or counting a number of bits from an arbitrary position in the 177 | string. 
178 | */ 179 | type BitString struct { 180 | base64DataString string 181 | length uint 182 | } 183 | 184 | var MaskTop = [7]uint{ 185 | 0x3f, 0x1f, 0x0f, 0x07, 0x03, 0x01, 0x00, 186 | } 187 | 188 | var BitsInByte = [256]uint{ 189 | 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 190 | 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 191 | 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 192 | 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 193 | 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 194 | 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 195 | 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 196 | 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 197 | 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 198 | 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 199 | 6, 7, 6, 7, 7, 8, 200 | } 201 | 202 | func (bs *BitString) Init(data string) { 203 | bs.base64DataString = data 204 | bs.length = uint(len(bs.base64DataString)) * W 205 | } 206 | 207 | /** 208 | Returns the internal string of bytes 209 | */ 210 | func (bs *BitString) GetData() string { 211 | return bs.base64DataString 212 | } 213 | 214 | /** 215 | Returns a decimal number, consisting of a certain number, n, of bits 216 | starting at a certain position, p. 
217 | */ 218 | func (bs *BitString) Get(p, n uint) uint { 219 | 220 | // case 1: bits lie within the given byte 221 | if (p%W)+n <= W { 222 | idx := p/W | 0 223 | return (ORD(bs.base64DataString[idx:idx+1]) & MaskTop[p%W]) >> 224 | (W - p%W - n) 225 | 226 | // case 2: bits lie incompletely in the given byte 227 | } else { 228 | idx := p/W | 0 229 | result := (ORD(bs.base64DataString[idx:idx+1]) & MaskTop[p%W]) 230 | 231 | l := W - p%W 232 | p += l 233 | n -= l 234 | 235 | for n >= W { 236 | idx := p/W | 0 237 | result = (result << W) | ORD(bs.base64DataString[idx:idx+1]) 238 | p += W 239 | n -= W 240 | } 241 | 242 | if n > 0 { 243 | idx := p/W | 0 244 | result = (result << n) | (ORD(bs.base64DataString[idx:idx+1]) >> 245 | (W - n)) 246 | } 247 | 248 | return result 249 | } 250 | } 251 | 252 | /** 253 | Counts the number of bits set to 1 starting at position p and 254 | ending at position p + n 255 | */ 256 | func (bs *BitString) Count(p, n uint) uint { 257 | 258 | var count uint = 0 259 | for n >= 8 { 260 | count += BitsInByte[bs.Get(p, 8)] 261 | p += 8 262 | n -= 8 263 | } 264 | 265 | return count + BitsInByte[bs.Get(p, n)] 266 | } 267 | 268 | /** 269 | Returns the number of bits set to 1 up to and including position x. 270 | This is the slow implementation used for testing. 271 | */ 272 | func (bs *BitString) Rank(x uint) uint { 273 | var rank uint = 0 274 | var i uint = 0 275 | for i = 0; i <= x; i++ { 276 | // FIXME: the above line should be the following??? 277 | //for i = 0; i < x; i++ { 278 | if bs.Get(i, 1) != 0 { 279 | rank++ 280 | } 281 | } 282 | 283 | return rank 284 | } 285 | 286 | /** 287 | The rank directory allows you to build an index to quickly compute the 288 | rank() and select() functions. The index can itself be encoded as a binary 289 | string. 
290 | */ 291 | type RankDirectory struct { 292 | directory BitString 293 | data BitString // data of succinct trie 294 | l1Size uint 295 | l2Size uint 296 | l1Bits uint 297 | l2Bits uint 298 | sectionBits uint 299 | numBits uint 300 | } 301 | 302 | /** 303 | Used to build a rank directory from the given input string. 304 | 305 | @param data A javascript string containing the data, as readable using the 306 | BitString object. 307 | 308 | @param numBits The number of bits to index. 309 | 310 | @param l1Size The number of bits that each entry in the Level 1 table 311 | summarizes. This should be a multiple of l2Size. 312 | 313 | @param l2Size The number of bits that each entry in the Level 2 table 314 | summarizes. 315 | */ 316 | func CreateRankDirectory(data string, numBits, l1Size, l2Size uint) RankDirectory { 317 | bits := BitString{} 318 | bits.Init(data) 319 | var p, i uint = 0, 0 320 | var count1, count2 uint = 0, 0 321 | l1bits := uint(math.Ceil(math.Log2(float64(numBits)))) 322 | l2bits := uint(math.Ceil(math.Log2(float64(l1Size)))) 323 | 324 | directory := BitWriter{} 325 | 326 | for p+l2Size <= numBits { 327 | count2 += bits.Count(p, l2Size) 328 | i += l2Size 329 | p += l2Size 330 | if i == l1Size { 331 | count1 += count2 332 | directory.Write(count1, l1bits) 333 | count2 = 0 334 | i = 0 335 | } else { 336 | directory.Write(count2, l2bits) 337 | } 338 | } 339 | 340 | rd := RankDirectory{} 341 | rd.Init(directory.GetData(), data, numBits, l1Size, l2Size) 342 | return rd 343 | } 344 | 345 | func (rd *RankDirectory) Init(directoryData, bitData string, numBits, l1Size, l2Size uint) { 346 | rd.directory.Init(directoryData) 347 | rd.data.Init(bitData) 348 | rd.l1Size = l1Size 349 | rd.l2Size = l2Size 350 | rd.l1Bits = uint(math.Ceil(math.Log2(float64(numBits)))) 351 | rd.l2Bits = uint(math.Ceil(math.Log2(float64(l1Size)))) 352 | rd.sectionBits = (l1Size/l2Size-1)*rd.l2Bits + rd.l1Bits 353 | rd.numBits = numBits 354 | } 355 | 356 | /** 357 | Returns the string 
representation of the directory. 358 | */ 359 | func (rd *RankDirectory) GetData() string { 360 | return rd.directory.GetData() 361 | } 362 | 363 | /** 364 | Returns the number of 1 or 0 bits (depending on the "which" parameter) to 365 | to and including position x. 366 | */ 367 | func (rd *RankDirectory) Rank(which, x uint) uint { 368 | 369 | if which == 0 { 370 | return x - rd.Rank(1, x) + 1 371 | } 372 | 373 | var rank uint = 0 374 | o := x 375 | var sectionPos uint = 0 376 | 377 | if o >= rd.l1Size { 378 | sectionPos = (o/rd.l1Size | 0) * rd.sectionBits 379 | rank = rd.directory.Get(sectionPos-rd.l1Bits, rd.l1Bits) 380 | o = o % rd.l1Size 381 | } 382 | 383 | if o >= rd.l2Size { 384 | sectionPos += (o/rd.l2Size | 0) * rd.l2Bits 385 | rank += rd.directory.Get(sectionPos-rd.l2Bits, rd.l2Bits) 386 | } 387 | 388 | rank += rd.data.Count(x-x%rd.l2Size, x%rd.l2Size+1) 389 | 390 | return rank 391 | } 392 | 393 | /** 394 | Returns the position of the y'th 0 or 1 bit, depending on the "which" 395 | parameter. 396 | */ 397 | func (rd *RankDirectory) Select(which, y uint) uint { 398 | high := int(rd.numBits) 399 | low := -1 400 | val := -1 401 | 402 | for high-low > 1 { 403 | probe := (high+low)/2 | 0 404 | r := rd.Rank(which, uint(probe)) 405 | 406 | if r == y { 407 | // We have to continue searching after we have found it, 408 | // because we want the _first_ occurrence. 409 | val = probe 410 | high = probe 411 | } else if r < y { 412 | low = probe 413 | } else { 414 | high = probe 415 | } 416 | } 417 | 418 | return uint(val) 419 | } 420 | 421 | /** 422 | A Trie node, for use in building the encoding trie. This is not needed for 423 | the decoder. 
424 | */ 425 | type TrieNode struct { 426 | letter string 427 | final bool 428 | children []*TrieNode 429 | } 430 | 431 | type Trie struct { 432 | previousWord string 433 | root *TrieNode 434 | cache []*TrieNode 435 | nodeCount uint 436 | } 437 | 438 | func (t *Trie) Init() { 439 | t.previousWord = "" 440 | t.root = &TrieNode{ 441 | letter: " ", 442 | final: false, 443 | } 444 | t.cache = append(t.cache, t.root) 445 | t.nodeCount = 1 446 | } 447 | 448 | /** 449 | Returns the number of nodes in the trie 450 | */ 451 | func (t *Trie) GetNodeCount() uint { 452 | return t.nodeCount 453 | } 454 | 455 | /** 456 | Inserts a word into the trie. This function is fastest if the words are 457 | inserted in alphabetical order. 458 | */ 459 | func (t *Trie) Insert(word string) { 460 | 461 | commonPrefix := 0 462 | 463 | min := len(word) 464 | if min > len(t.previousWord) { 465 | min = len(t.previousWord) 466 | } 467 | 468 | for i := 0; i < min; i++ { 469 | if word[i] != t.previousWord[i] { 470 | break 471 | } 472 | commonPrefix += 1 473 | } 474 | 475 | t.cache = t.cache[:commonPrefix+1] 476 | node := t.cache[commonPrefix] 477 | 478 | for i := commonPrefix; i < len(word); i++ { 479 | // fix the bug if words not inserted in alphabetical order 480 | isLetterExist := false 481 | for _, cld := range node.children { 482 | if cld.letter == word[i:i+1] { 483 | t.cache = append(t.cache, cld) 484 | node = cld 485 | isLetterExist = true 486 | break 487 | } 488 | } 489 | if isLetterExist { 490 | continue 491 | } 492 | 493 | next := &TrieNode{ 494 | letter: word[i : i+1], 495 | final: false, 496 | } 497 | t.nodeCount++ 498 | node.children = append(node.children, next) 499 | t.cache = append(t.cache, next) 500 | node = next 501 | } 502 | 503 | node.final = true 504 | t.previousWord = word 505 | } 506 | 507 | /** 508 | Apply a function to each node, traversing the trie in level order. 
509 | */ 510 | func (t *Trie) Apply(fn func(*TrieNode)) { 511 | var level []*TrieNode 512 | level = append(level, t.root) 513 | for len(level) > 0 { 514 | node := level[0] 515 | level = level[1:] 516 | for i := 0; i < len(node.children); i++ { 517 | level = append(level, node.children[i]) 518 | } 519 | fn(node) 520 | } 521 | } 522 | 523 | /** 524 | Encode the trie and all of its nodes. Returns a string representing the 525 | encoded data. 526 | */ 527 | func (t *Trie) Encode() string { 528 | // Write the unary encoding of the tree in level order. 529 | bits := BitWriter{} 530 | bits.Write(0x02, 2) 531 | t.Apply(func(node *TrieNode) { 532 | for i := 0; i < len(node.children); i++ { 533 | bits.Write(1, 1) 534 | } 535 | bits.Write(0, 1) 536 | }) 537 | 538 | // Write the data for each node, using 6 bits for node. 1 bit stores 539 | // the "final" indicator. The other 5 bits store one of the 26 letters 540 | // of the alphabet. 541 | t.Apply(func(node *TrieNode) { 542 | value := node.letter[0] - "a"[0] 543 | if node.final { 544 | value |= 0x20 545 | } 546 | 547 | bits.Write(uint(value), 6) 548 | }) 549 | 550 | return bits.GetData() 551 | } 552 | 553 | /** 554 | This class is used for traversing the succinctly encoded trie. 555 | */ 556 | type FrozenTrieNode struct { 557 | trie *FrozenTrie 558 | index uint 559 | letter string 560 | final bool 561 | firstChild uint 562 | childCount uint 563 | } 564 | 565 | /** 566 | Returns the number of children. 567 | */ 568 | func (f *FrozenTrieNode) GetChildCount() uint { 569 | return f.childCount 570 | } 571 | 572 | /** 573 | Returns the FrozenTrieNode for the given child. 574 | 575 | @param index The 0-based index of the child of this node. For example, if 576 | the node has 5 children, and you wanted the 0th one, pass in 0. 
577 | */ 578 | func (f *FrozenTrieNode) GetChild(index uint) FrozenTrieNode { 579 | return f.trie.GetNodeByIndex(f.firstChild + index) 580 | } 581 | 582 | /** 583 | The FrozenTrie is used for looking up words in the encoded trie. 584 | 585 | @param data A string representing the encoded trie. 586 | 587 | @param directoryData A string representing the RankDirectory. The global L1 588 | and L2 constants are used to determine the L1Size and L2size. 589 | 590 | @param nodeCount The number of nodes in the trie. 591 | */ 592 | type FrozenTrie struct { 593 | data BitString 594 | directory RankDirectory 595 | letterStart uint 596 | } 597 | 598 | func (f *FrozenTrie) Init(data, directoryData string, nodeCount uint) { 599 | f.data.Init(data) 600 | f.directory.Init(directoryData, data, nodeCount*2+1, L1, L2) 601 | 602 | // The position of the first bit of the data in 0th node. In non-root 603 | // nodes, this would contain 6-bit letters. 604 | f.letterStart = nodeCount*2 + 1 605 | } 606 | 607 | /** 608 | Retrieve the FrozenTrieNode of the trie, given its index in level-order. 609 | This is a private function that you don't have to use. 610 | */ 611 | func (f *FrozenTrie) GetNodeByIndex(index uint) FrozenTrieNode { 612 | // retrieve the 6-bit letter. 613 | final := (f.data.Get(f.letterStart+index*6, 1) == 1) 614 | letter := string("a"[0] + byte(f.data.Get(f.letterStart+index*6+1, 5))) 615 | firstChild := f.directory.Select(0, index+1) - index 616 | 617 | // Since the nodes are in level order, this nodes children must go up 618 | // until the next node's children start. 619 | childOfNextNode := f.directory.Select(0, index+2) - index - 1 620 | 621 | return FrozenTrieNode{ 622 | trie: f, 623 | index: index, 624 | letter: letter, 625 | final: final, 626 | firstChild: firstChild, 627 | childCount: (childOfNextNode - firstChild), 628 | } 629 | } 630 | 631 | /** 632 | Retrieve the root node. You can use this node to obtain all of the other 633 | nodes in the trie. 
634 | */ 635 | func (f *FrozenTrie) GetRoot() FrozenTrieNode { 636 | return f.GetNodeByIndex(0) 637 | } 638 | 639 | /** 640 | Look-up a word in the trie. Returns true if and only if the word exists 641 | in the trie. 642 | */ 643 | func (f *FrozenTrie) Lookup(word string) bool { 644 | node := f.GetRoot() 645 | for i := 0; i < len(word); i++ { 646 | var child FrozenTrieNode 647 | var j uint = 0 648 | for ; j < node.GetChildCount(); j++ { 649 | child = node.GetChild(j) 650 | if child.letter == word[i:i+1] { 651 | break 652 | } 653 | } 654 | 655 | if j == node.GetChildCount() { 656 | return false 657 | } 658 | node = child 659 | } 660 | 661 | return node.final 662 | } 663 | -------------------------------------------------------------------------------- /reference/Bits.js: -------------------------------------------------------------------------------- 1 | /* 2 | A Succinct Trie for Javascript 3 | 4 | By Steve Hanov 5 | Released to the public domain. 6 | 7 | This file contains functions for creating a succinctly encoded trie structure 8 | from a list of words. The trie is encoded to a succinct bit string using the 9 | method of Jacobson (1989). The bitstring is then encoded using BASE-64. 10 | 11 | The resulting trie does not have to be decoded to be used. This file also 12 | contains functions for looking up a word in the BASE-64 encoded data, in 13 | O(mlogn) time, where m is the number of letters in the target word, and n is 14 | the number of nodes in the trie. 
/*
 A Succinct Trie for Javascript

 By Steve Hanov
 Released to the public domain.

 This file contains functions for creating a succinctly encoded trie structure
 from a list of words. The trie is encoded to a succinct bit string using the
 method of Jacobson (1989). The bitstring is then encoded using BASE-64.

 The resulting trie does not have to be decoded to be used. This file also
 contains functions for looking up a word in the BASE-64 encoded data, in
 O(mlogn) time, where m is the number of letters in the target word, and n is
 the number of nodes in the trie.

 Objects for encoding:

 TrieNode
 Trie
 BitWriter

 Objects for decoding:
 BitString
 FrozenTrieNode
 FrozenTrie

 QUICK USAGE:

 Suppose we let data be some output of the demo encoder:

 var data = {
    "nodeCount": 37,
    "directory": "BMIg",
    "trie": "v2qqqqqqqpIUn4A5JZyBZ4ggCKh55ZZgBA5ZZd5vIEl1wx8g8A"
 };

 var frozenTrie = new FrozenTrie( Data.trie, Data.directory, Data.nodeCount);

 alert( frozenTrie.lookup( "hello" ) ); // outputs true
 alert( frozenTrie.lookup( "kwijibo" ) ); // outputs false

*/

// Configure the bit writing and reading functions to work natively in BASE-64
// encoding. That way, we don't have to convert back and forth to bytes.

var BASE64 =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";


/**
    The width of each unit of the encoding, in bits. Here we use 6, for base-64
    encoding.
 */
var W = 6;

/**
    Returns the character unit that represents the given value. If this were
    binary data, we would simply return id.
 */
function CHR(id)
{
    return BASE64[id];
}

/**
    Maps each character unit of BASE64 to its value. Derived from BASE64 so
    the two can never disagree.
 */
var BASE64_CACHE = (function() {
    var cache = {};
    for ( var i = 0; i < BASE64.length; i++ ) {
        cache[ BASE64[i] ] = i;
    }
    return cache;
})();

/**
    Returns the decimal value of the given character unit.
 */
function ORD(ch)
{
    // Used to be: return BASE64.indexOf(ch);
    return BASE64_CACHE[ch];
}

/**
    Fixed values for the L1 and L2 table sizes in the Rank Directory
*/
var L1 = 32*32;
var L2 = 32;

/**
    The BitWriter will create a stream of bytes, letting you write a certain
    number of bits at a time. This is part of the encoder, so it is not
    optimized for memory or speed.
*/
function BitWriter()
{
    this.init();
}

BitWriter.prototype =
{
    init: function() {
        this.bits = [];
    },

    /**
        Write some data to the bit string, most significant bit first. The
        number of bits must be 32 or fewer.
    */
    write: function( data, numBits ) {
        for( var i = numBits - 1; i >= 0; i-- ) {
            if ( data & ( 1 << i ) ) {
                this.bits.push(1);
            } else {
                this.bits.push(0);
            }
        }
    },

    /**
        Get the bitstring represented as a javascript string of bytes. A final
        partial unit is padded with zero bits on the right.
    */
    getData: function() {
        var chars = [];
        var b = 0;
        var i = 0;

        for ( var j = 0; j < this.bits.length; j++ ) {
            b = ( b << 1 ) | this.bits[j];
            i += 1;
            if ( i === W ) {
                chars.push( CHR(b) );
                i = b = 0;
            }
        }

        if ( i ) {
            chars.push( CHR(b << ( W - i )) );
        }

        return chars.join("");
    },

    /**
        Returns the bits as a human readable binary string for debugging
    */
    getDebugString: function(group) {
        var chars = [];
        var i = 0;

        for( var j = 0; j < this.bits.length; j++ ) {
            chars.push( "" + this.bits[j] );
            i++;
            if ( i === group ) {
                chars.push( ' ' );
                i = 0;
            }
        }

        return chars.join("");
    }
};

/**
    Given a string of data (eg, in BASE-64), the BitString class supports
    reading or counting a number of bits from an arbitrary position in the
    string.
*/
function BitString( str )
{
    this.init( str );
}

/**
    MaskTop[k] clears the k most significant bits of a 6-bit unit.
*/
BitString.MaskTop = [
    0x3f, 0x1f, 0x0f, 0x07, 0x03, 0x01, 0x00
];

/**
    BitsInByte[v] is the number of 1 bits in the byte value v.
*/
BitString.BitsInByte = (function() {
    var table = [ 0 ];
    for ( var v = 1; v < 256; v++ ) {
        // popcount(v) = popcount(v without its lowest bit) + lowest bit
        table[v] = table[v >> 1] + ( v & 1 );
    }
    return table;
})();


BitString.prototype = {
    init: function( str ) {
        this.bytes = str;
        this.length = this.bytes.length * W;
    },

    /**
        Returns the internal string of bytes
    */
    getData: function() {
        return this.bytes;
    },

    /**
        Returns a decimal number, consisting of a certain number, n, of bits
        starting at a certain position, p.
     */
    get: function( p, n ) {

        // case 1: bits lie within the given byte
        if ( ( p % W ) + n <= W ) {
            return ( ORD( this.bytes[ p / W | 0 ] ) & BitString.MaskTop[ p % W ] ) >>
                ( W - p % W - n );

        // case 2: bits lie incompletely in the given byte
        } else {
            var result = ( ORD( this.bytes[ p / W | 0 ] ) &
                BitString.MaskTop[ p % W ] );

            var l = W - p % W;
            p += l;
            n -= l;

            while ( n >= W ) {
                result = (result << W) | ORD( this.bytes[ p / W | 0 ] );
                p += W;
                n -= W;
            }

            if ( n > 0 ) {
                result = (result << n) | ( ORD( this.bytes[ p / W | 0 ] ) >>
                    ( W - n ) );
            }

            return result;
        }
    },

    /**
        Counts the number of bits set to 1 in the n bits starting at
        position p.
    */
    count: function( p, n ) {

        var count = 0;
        while( n >= 8 ) {
            count += BitString.BitsInByte[ this.get( p, 8 ) ];
            p += 8;
            n -= 8;
        }

        return count + BitString.BitsInByte[ this.get( p, n ) ];
    },

    /**
        Returns the number of bits set to 1 up to and including position x.
        This is the slow implementation used for testing.
    */
    rank: function( x ) {
        var rank = 0;
        for( var i = 0; i <= x; i++ ) {
            if ( this.get(i, 1) ) {
                rank++;
            }
        }

        return rank;
    }
};

/**
    The rank directory allows you to build an index to quickly compute the
    rank() and select() functions. The index can itself be encoded as a binary
    string.
 */
function RankDirectory( directoryData, bitData, numBits, l1Size, l2Size )
{
    this.init(directoryData, bitData, numBits, l1Size, l2Size);
}

/**
    Used to build a rank directory from the given input string.

    @param data A javascript string containing the data, as readable using the
    BitString object.

    @param numBits The number of bits to index.

    @param l1Size The number of bits that each entry in the Level 1 table
    summarizes. This should be a multiple of l2Size.

    @param l2Size The number of bits that each entry in the Level 2 table
    summarizes.
 */
RankDirectory.Create = function( data, numBits, l1Size, l2Size ) {
    var bits = new BitString( data );
    var p = 0;
    var i = 0;
    var count1 = 0, count2 = 0;
    var l1bits = Math.ceil( Math.log( numBits ) / Math.log(2) );
    var l2bits = Math.ceil( Math.log( l1Size ) / Math.log(2) );

    var directory = new BitWriter();

    while( p + l2Size <= numBits ) {
        count2 += bits.count( p, l2Size );
        i += l2Size;
        p += l2Size;
        if ( i === l1Size ) {
            count1 += count2;
            directory.write( count1, l1bits );
            count2 = 0;
            i = 0;
        } else {
            directory.write( count2, l2bits );
        }
    }

    return new RankDirectory( directory.getData(), data, numBits, l1Size, l2Size );
};


RankDirectory.prototype = {

    init: function( directoryData, bitData, numBits, l1Size, l2Size ) {
        this.directory = new BitString( directoryData );
        this.data = new BitString( bitData );
        this.l1Size = l1Size;
        this.l2Size = l2Size;
        this.l1Bits = Math.ceil( Math.log( numBits ) / Math.log( 2 ) );
        this.l2Bits = Math.ceil( Math.log( l1Size ) / Math.log( 2 ) );
        this.sectionBits = (l1Size / l2Size - 1) * this.l2Bits + this.l1Bits;
        this.numBits = numBits;
    },

    /**
        Returns the string representation of the directory.
     */
    getData: function() {
        return this.directory.getData();
    },

    /**
      Returns the number of 1 or 0 bits (depending on the "which" parameter)
      up to and including position x.
      */
    rank: function( which, x ) {

        if ( which === 0 ) {
            return x - this.rank( 1, x ) + 1;
        }

        var rank = 0;
        var o = x;
        var sectionPos = 0;

        if ( o >= this.l1Size ) {
            sectionPos = ( o / this.l1Size | 0 ) * this.sectionBits;
            rank = this.directory.get( sectionPos - this.l1Bits, this.l1Bits );
            o = o % this.l1Size;
        }

        if ( o >= this.l2Size ) {
            sectionPos += ( o / this.l2Size | 0 ) * this.l2Bits;
            rank += this.directory.get( sectionPos - this.l2Bits, this.l2Bits );
        }

        rank += this.data.count( x - x % this.l2Size, x % this.l2Size + 1 );

        return rank;
    },

    /**
      Returns the position of the y'th 0 or 1 bit, depending on the "which"
      parameter. Returns -1 when no position has that rank.
      */
    select: function( which, y ) {
        var high = this.numBits;
        var low = -1;
        var val = -1;

        while ( high - low > 1 ) {
            var probe = (high + low) / 2 | 0;
            var r = this.rank( which, probe );

            if ( r === y ) {
                // We have to continue searching after we have found it,
                // because we want the _first_ occurrence.
                val = probe;
                high = probe;
            } else if ( r < y ) {
                low = probe;
            } else {
                high = probe;
            }
        }

        return val;
    }
};

/**
  A Trie node, for use in building the encoding trie. This is not needed for
  the decoder.
  */
function TrieNode( letter )
{
    this.letter = letter;
    this.final = false;
    this.children = [];
}

function Trie()
{
    this.init();
}

Trie.prototype = {
    init: function() {
        this.previousWord = "";
        this.root = new TrieNode(' ');
        this.cache = [ this.root ];
        this.nodeCount = 1;
    },

    /**
      Returns the number of nodes in the trie
     */
    getNodeCount: function() {
        return this.nodeCount;
    },

    /**
      Inserts a word into the trie. This function is fastest if the words are
      inserted in alphabetical order.
     */
    insert: function( word ) {

        var commonPrefix = 0;
        for( var i = 0; i < Math.min( word.length, this.previousWord.length );
                i++ )
        {
            if ( word[i] !== this.previousWord[i] ) { break; }
            commonPrefix += 1;
        }

        // cache[k] is the node reached after the first k letters of the
        // previous word, so the shared prefix is not walked again.
        this.cache.length = commonPrefix + 1;
        var node = this.cache[ this.cache.length - 1 ];

        for( i = commonPrefix; i < word.length; i++ ) {
            // Reuse an existing child so that words which are not inserted
            // in alphabetical order do not create duplicate siblings (a
            // duplicate would hide every word stored under the later copy).
            var next = null;
            for( var c = 0; c < node.children.length; c++ ) {
                if ( node.children[c].letter === word[i] ) {
                    next = node.children[c];
                    break;
                }
            }

            if ( next === null ) {
                next = new TrieNode( word[i] );
                this.nodeCount++;
                node.children.push( next );
            }

            this.cache.push( next );
            node = next;
        }

        node.final = true;
        this.previousWord = word;
    },

    /**
      Apply a function to each node, traversing the trie in level order.
      */
    apply: function( fn )
    {
        var level = [ this.root ];
        while( level.length > 0 ) {
            var node = level.shift();
            for( var i = 0; i < node.children.length; i++ ) {
                level.push( node.children[i] );
            }
            fn( node );
        }

    },

    /**
      Encode the trie and all of its nodes. Returns a string representing the
      encoded data.
      */
    encode: function()
    {
        // Write the unary encoding of the tree in level order.
        var bits = new BitWriter();
        bits.write( 0x02, 2 );
        this.apply( function( node ) {
            for( var i = 0; i < node.children.length; i++ ) {
                bits.write( 1, 1 );
            }
            bits.write( 0, 1 );
        });

        // Write the data for each node, using 6 bits for node. 1 bit stores
        // the "final" indicator. The other 5 bits store one of the 26 letters
        // of the alphabet.
        var a = ("a").charCodeAt(0);
        this.apply( function( node ) {
            var value = node.letter.charCodeAt(0) - a;
            if ( node.final ) {
                value |= 0x20;
            }

            bits.write( value, 6 );
        });

        return bits.getData();
    }
};

/**
  This class is used for traversing the succinctly encoded trie.
  */
function FrozenTrieNode( trie, index, letter, final, firstChild, childCount )
{
    this.trie = trie;
    this.index = index;
    this.letter = letter;
    this.final = final;
    this.firstChild = firstChild;
    this.childCount = childCount;
}

FrozenTrieNode.prototype = {
    /**
      Returns the number of children.
      */
    getChildCount: function()
    {
        return this.childCount;
    },

    /**
      Returns the FrozenTrieNode for the given child.

      @param index The 0-based index of the child of this node. For example, if
      the node has 5 children, and you wanted the 0th one, pass in 0.
    */
    getChild: function(index)
    {
        return this.trie.getNodeByIndex( this.firstChild + index );
    }
};
/**
    The FrozenTrie is used for looking up words in the encoded trie.

    @param data A string representing the encoded trie.

    @param directoryData A string representing the RankDirectory. The global L1
    and L2 constants are used to determine the L1Size and L2size.

    @param nodeCount The number of nodes in the trie.
*/
function FrozenTrie( data, directoryData, nodeCount )
{
    this.init( data, directoryData, nodeCount );
}

FrozenTrie.prototype = {
    /**
        Wrap the encoded data in a BitString and build the rank directory
        over its first nodeCount * 2 + 1 bits.
    */
    init: function( data, directoryData, nodeCount )
    {
        // Number of bits preceding the per-node letter records.
        var structureBits = nodeCount * 2 + 1;

        this.data = new BitString( data );
        this.directory = new RankDirectory( directoryData, data,
                structureBits, L1, L2 );

        // The position of the first bit of the data in 0th node. In non-root
        // nodes, this would contain 6-bit letters.
        this.letterStart = structureBits;
    },

    /**
        Retrieve the FrozenTrieNode of the trie, given its index in level-order.
        This is a private function that you don't have to use.
    */
    getNodeByIndex: function( index )
    {
        // Each node record is 6 bits: 1 "final" bit followed by 5 letter bits.
        var recordStart = this.letterStart + index * 6;
        var isFinal = this.data.get( recordStart, 1 ) === 1;
        var letterCode = this.data.get( recordStart + 1, 5 );
        var letter = String.fromCharCode( letterCode + 'a'.charCodeAt(0) );

        var firstChild = this.directory.select( 0, index + 1 ) - index;

        // Nodes are stored in level order, so this node's children end where
        // the next node's children begin.
        var nextNodeFirstChild =
            this.directory.select( 0, index + 2 ) - index - 1;

        return new FrozenTrieNode( this, index, letter, isFinal, firstChild,
                nextNodeFirstChild - firstChild );
    },

    /**
        Retrieve the root node. You can use this node to obtain all of the other
        nodes in the trie.
    */
    getRoot: function()
    {
        return this.getNodeByIndex( 0 );
    },

    /**
        Look-up a word in the trie. Returns true if and only if the word exists
        in the trie.
    */
    lookup: function( word )
    {
        var current = this.getRoot();

        for ( var pos = 0; pos < word.length; pos++ ) {
            var match = null;
            var total = current.getChildCount();

            for ( var k = 0; k < total && match === null; k++ ) {
                var candidate = current.getChild( k );
                if ( candidate.letter === word[pos] ) {
                    match = candidate;
                }
            }

            // No child carries this character: the word is not stored.
            if ( match === null ) {
                return false;
            }
            current = match;
        }

        return current.final;
    }
};
/**
    Decode the data in the output textarea, and use it to check if a word exists
    in the dictionary. The result is shown in the "status" element.
*/
function lookup()
{
    var status = "";
    try
    {
        // The output textarea holds JSON text. Parse it as data rather than
        // passing it to eval(), which would execute whatever was pasted there.
        var json = JSON.parse( document.getElementById("output").value );
        var ftrie = new FrozenTrie( json.trie, json.directory, json.nodeCount
                );
        var word = document.getElementById("lookup").value;
        if ( ftrie.lookup( word ) ) {
            status = '"' + word + '" is in the dictionary.';
        } else {
            status = '"' + word + '" IS NOT in the dictionary.';
        }
    } catch ( e ) {
        status = "Error. Have you encoded the dictionary yet?";
    }

    // The status contains the user's input, so write it as text, not markup.
    document.getElementById("status").textContent = status;

}
62 | func TestBitString(t *testing.T) { 63 | bs := BitString{} 64 | bs.Init("88kj5w_6phb") 65 | t.Log(bs) 66 | if bs.Rank(5) != 4 { 67 | t.Error("Expected 4, got ", bs.Rank(5)) 68 | } 69 | if bs.Rank(24) != 14 { 70 | t.Error("Expected 14, got ", bs.Rank(24)) 71 | } 72 | if bs.Rank(37) != 21 { 73 | t.Error("Expected 21, got ", bs.Rank(37)) 74 | } 75 | if bs.Rank(55) != 33 { 76 | t.Error("Expected 33, got ", bs.Rank(55)) 77 | } 78 | if bs.Rank(65) != 38 { 79 | t.Error("Expected 38, got ", bs.Rank(65)) 80 | } 81 | // FIXME??: bs.Rank(66) fails the test 82 | if bs.Get(5, 7) != 60 { 83 | t.Error("Expected 60, got ", bs.Get(5, 7)) 84 | } 85 | if bs.Get(7, 13) != 7314 { 86 | t.Error("Expected 7314, got ", bs.Get(7, 13)) 87 | } 88 | if bs.Get(0, 5) != 30 { 89 | t.Error("Expected 30, got ", bs.Get(0, 5)) 90 | } 91 | if bs.Get(3, 3) != 4 { 92 | t.Error("Expected 4, got ", bs.Get(3, 3)) 93 | } 94 | if bs.Get(33, 17) != 16362 { 95 | t.Error("Expected 16362, got ", bs.Get(33, 17)) 96 | } 97 | if bs.Count(0, 17) != 10 { 98 | t.Error("Expected 10, got ", bs.Count(0, 17)) 99 | } 100 | if bs.Count(7, 2) != 2 { 101 | t.Error("Expected 2, got ", bs.Count(7, 2)) 102 | } 103 | if bs.Count(56, 9) != 4 { 104 | t.Error("Expected 4, got ", bs.Count(56, 9)) 105 | } 106 | if bs.Count(12, 1) != 1 { 107 | t.Error("Expected 1, got ", bs.Count(12, 1)) 108 | } 109 | if bs.Count(5, 7) != 4 { 110 | t.Error("Expected 4, got ", bs.Count(5, 7)) 111 | } 112 | } 113 | 114 | func TestRankDirectory(t *testing.T) { 115 | rd := CreateRankDirectory("1wnc2bxhbx7mkbgnpwq7vtlub7p6pkls42lvie9j1ekcpt0zytrdl67enescolwex7aumq4imywstrpktbvxy0rp61nnonj9grdf", 400, L1, L2) 116 | t.Log(rd) 117 | if rd.directory.GetData() != "BIJA0EcXBsH4kykLgzjc" { 118 | t.Error("Expected BIJA0EcXBsH4kykLgzjc, got ", rd.directory.GetData()) 119 | } 120 | if rd.directory.length != 120 { 121 | t.Error("Expected 120, got ", rd.directory.length) 122 | } 123 | if rd.Rank(1, 200) != 113 { 124 | t.Error("Expected 113, got ", rd.Rank(1, 200)) 
125 | } 126 | if rd.Rank(0, 100) != 47 { 127 | t.Error("Expected 47, got ", rd.Rank(0, 100)) 128 | } 129 | if rd.Select(1, 134) != 233 { 130 | t.Error("Expected 233, got ", rd.Rank(1, 134)) 131 | } 132 | if rd.Select(0, 77) != 178 { 133 | t.Error("Expected 178, got ", rd.Rank(0, 77)) 134 | } 135 | } 136 | 137 | func insertInAlphabeticalOrder(te *Trie) { 138 | te.Insert("alphapha") 139 | te.Insert("apple") 140 | te.Insert("hello") 141 | te.Insert("jello") 142 | te.Insert("lamp") 143 | te.Insert("orange") 144 | te.Insert("quiz") 145 | } 146 | 147 | func insertNotInAlphabeticalOrder(te *Trie) { 148 | te.Insert("apple") 149 | te.Insert("orange") 150 | te.Insert("alphapha") 151 | te.Insert("lamp") 152 | te.Insert("hello") 153 | te.Insert("jello") 154 | te.Insert("quiz") 155 | } 156 | 157 | func TestTrie(t *testing.T) { 158 | te := Trie{} 159 | te.Init() 160 | insertInAlphabeticalOrder(&te) 161 | teData := te.Encode() 162 | t.Log(teData) 163 | t.Log(te.GetNodeCount()) 164 | if teData != "v2qqqqqqqpIUn4A5JZyBZ4ggCKh55ZZgBA5ZZd5vIEl1wx8g8A" { 165 | t.Error("Expected v2qqqqqqqpIUn4A5JZyBZ4ggCKh55ZZgBA5ZZd5vIEl1wx8g8A, got ", teData) 166 | } 167 | if te.GetNodeCount() != 37 { 168 | t.Error("Expected 37, got ", te.GetNodeCount()) 169 | } 170 | rd := CreateRankDirectory(teData, te.GetNodeCount()*2+1, L1, L2) 171 | if rd.GetData() != "BMIg" { 172 | t.Error("Expected BMIg, got ", rd.GetData()) 173 | } 174 | t.Log(rd.GetData()) 175 | } 176 | 177 | func TestLookup(t *testing.T) { 178 | te := Trie{} 179 | te.Init() 180 | insertNotInAlphabeticalOrder(&te) 181 | teData := te.Encode() 182 | rd := CreateRankDirectory(teData, te.GetNodeCount()*2+1, L1, L2) 183 | 184 | ft := FrozenTrie{} 185 | ft.Init(teData, rd.GetData(), te.GetNodeCount()) 186 | 187 | if ft.Lookup("apple") != true { 188 | t.Error("apple") 189 | } 190 | if ft.Lookup("appl") != false { 191 | t.Error("appl") 192 | } 193 | if ft.Lookup("applea") != false { 194 | t.Error("applea") 195 | } 196 | if ft.Lookup("orange") != 
true { 197 | t.Error("orange") 198 | } 199 | if ft.Lookup("lamp") != true { 200 | t.Error("lamp") 201 | } 202 | if ft.Lookup("hello") != true { 203 | t.Error("hello") 204 | } 205 | if ft.Lookup("jello") != true { 206 | t.Error("jello") 207 | } 208 | if ft.Lookup("quiz") != true { 209 | t.Error("quiz") 210 | } 211 | if ft.Lookup("quize") != false { 212 | t.Error("quize") 213 | } 214 | if ft.Lookup("alphaph") != false { 215 | t.Error("alphaph") 216 | } 217 | if ft.Lookup("alphapha") != true { 218 | t.Error("alphapha") 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /reference/Bitsjs-pali.patch: -------------------------------------------------------------------------------- 1 | diff --git a/reference/Bits.js b/reference/Bits.js 2 | index d5a3934..cc0cda1 100644 3 | --- a/reference/Bits.js 4 | +++ b/reference/Bits.js 5 | @@ -439,7 +439,15 @@ Trie.prototype = { 6 | Inserts a word into the trie. This function is fastest if the words are 7 | inserted in alphabetical order. 8 | */ 9 | - insert: function( word ) { 10 | + insert: function( word ) { 11 | + 12 | + /** 13 | + * modified by Siong-Ui Te to support non-[a-z] characters 14 | + * 15 | + * check if the word consists of allowed characters. 16 | + */ 17 | + if ( !word.match( VALID_WORD_REGEX ) ) 18 | + throw ('invalid word: ' + word); 19 | 20 | var commonPrefix = 0; 21 | for( var i = 0; i < Math.min( word.length, this.previousWord.length ); 22 | @@ -496,17 +504,20 @@ Trie.prototype = { 23 | bits.write( 0, 1 ); 24 | }); 25 | 26 | - // Write the data for each node, using 6 bits for node. 1 bit stores 27 | - // the "final" indicator. The other 5 bits store one of the 26 letters 28 | - // of the alphabet. 29 | - var a = ("a").charCodeAt(0); 30 | + /** 31 | + * modified by Siong-Ui Te to support non-[a-z] characters 32 | + * 33 | + * Write the data for each node, using DATA_BITS bits for node. 1 bit 34 | + * stores the "final" indicator. 
The other (DATA_BITS - 1) bits store 35 | + * one of the characters of the alphabet. 36 | + */ 37 | this.apply( function( node ) { 38 | - var value = node.letter.charCodeAt(0) - a; 39 | + var value = CharacterToValue( node.letter ); 40 | if ( node.final ) { 41 | - value |= 0x20; 42 | + value |= ( 1 << ( DATA_BITS - 1 ) ); 43 | } 44 | 45 | - bits.write( value, 6 ); 46 | + bits.write( value, DATA_BITS ); 47 | }); 48 | 49 | return bits.getData(); 50 | @@ -580,11 +591,14 @@ FrozenTrie.prototype = { 51 | */ 52 | getNodeByIndex: function( index ) 53 | { 54 | - // retrieve the 6-bit letter. 55 | - var final = this.data.get( this.letterStart + index * 6, 1 ) === 1; 56 | - var letter = String.fromCharCode( 57 | - this.data.get( this.letterStart + index * 6 + 1, 5 ) + 58 | - 'a'.charCodeAt(0)); 59 | + /** 60 | + * modified by Siong-Ui Te to support non-[a-z] characters 61 | + */ 62 | + // retrieve the DATA_BITS-bit character. 63 | + var final = this.data.get( this.letterStart + index * DATA_BITS, 1 ) === 1; 64 | + var letter = ValueToCharacter( 65 | + this.data.get( this.letterStart + index * DATA_BITS + 1, (DATA_BITS - 1) ) 66 | + ); 67 | var firstChild = this.directory.select( 0, index+1 ) - index; 68 | 69 | // Since the nodes are in level order, this nodes children must go up 70 | @@ -746,3 +760,47 @@ function lookup() 71 | 72 | } 73 | 74 | + 75 | +/** 76 | + * The following code added by Siong-Ui Te to support non-[a-z] characters 77 | + */ 78 | +var ALLOWED_CHARACTERS = "abcdeghijklmnoprstuvyāīūṁṃŋṇṅñṭḍḷ…'’° -"; 79 | +var VALID_WORD_REGEX = new RegExp('^[' + ALLOWED_CHARACTERS + ']+$'); 80 | +var CHARACTERS_CACHE = {}; 81 | +var CHARACTERS = (function() { 82 | + var obj = {}; 83 | + var chars = ALLOWED_CHARACTERS.split(""); 84 | + 85 | + for (var i=0; i '); 14 | rl.prompt(); 15 | 16 | rl.on('line', function(line) { 17 | console.log('looking up ' + line.trim() + ' ...' 
); 18 | console.log(ftrie.lookup(line.trim())); 19 | rl.prompt(); 20 | }).on('close', function() { 21 | console.log('\nEnd of lookup'); 22 | process.exit(0); 23 | }); 24 | -------------------------------------------------------------------------------- /reference/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Use Bits.js for testing 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /reference/test.js: -------------------------------------------------------------------------------- 1 | function TestBitWriter() { 2 | var bw = new BitWriter(); 3 | bw.write(3, 2); 4 | console.log(bw.getDebugString(3)); 5 | console.log(bw.getData()); 6 | bw.write(0, 3); 7 | console.log(bw.getData()); 8 | bw.write(2, 2); 9 | console.log(bw.getData()); 10 | console.log(bw.getDebugString(3)); 11 | } 12 | 13 | function TestBitString() { 14 | var bs = new BitString("88kj5w_6phb"); 15 | console.log(bs); 16 | console.log(bs.rank(5)); 17 | console.log(bs.rank(24)); 18 | console.log(bs.rank(37)); 19 | console.log(bs.rank(55)); 20 | console.log(bs.rank(65)); 21 | console.log(bs.get(5,7)); 22 | console.log(bs.get(7,13)); 23 | console.log(bs.get(0,5)); 24 | console.log(bs.get(3,3)); 25 | console.log(bs.get(33,17)); 26 | console.log(bs.count(0,17)); 27 | console.log(bs.count(7,2)); 28 | console.log(bs.count(56,9)); 29 | console.log(bs.count(12,1)); 30 | console.log(bs.count(5,7)); 31 | } 32 | 33 | function TestRankDirectory() { 34 | var rd = RankDirectory.Create("1wnc2bxhbx7mkbgnpwq7vtlub7p6pkls42lvie9j1ekcpt0zytrdl67enescolwex7aumq4imywstrpktbvxy0rp61nnonj9grdf", 400, L1, L2); 35 | console.log(rd); 36 | console.log(rd.rank(1, 200)) 37 | console.log(rd.rank(0, 100)) 38 | console.log(rd.select(1, 134)) 39 | console.log(rd.select(0, 77)) 40 | } 41 | 42 | function TestTrie() { 43 | var te = new Trie(); 44 | te.insert("apple"); 45 | te.insert("orange"); 46 | 
te.insert("alphapha"); 47 | te.insert("lamp"); 48 | te.insert("hello"); 49 | te.insert("jello"); 50 | te.insert("quiz"); 51 | var teData = te.encode(); 52 | console.log(teData); 53 | console.log(te.getNodeCount()); 54 | var rd = RankDirectory.Create(teData, te.getNodeCount() * 2 + 1, L1, L2); 55 | console.log(rd.getData()); 56 | 57 | var ftrie = new FrozenTrie( teData, rd.getData(), te.getNodeCount()); 58 | console.log(ftrie.lookup("alphapha")) 59 | } 60 | 61 | TestTrie(); 62 | -------------------------------------------------------------------------------- /reference/variables.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Path of files and directories 3 | */ 4 | 5 | function getDictWordsJsonDir() { 6 | return require('path').resolve(__dirname, 7 | '../../pylib/paliwords'); 8 | } 9 | 10 | function getBitsjsPath() { 11 | return require('path').resolve(__dirname, 'Bits.js'); 12 | } 13 | 14 | function getSuccinctTrieJsonPath() { 15 | return require('path').resolve(__dirname, 16 | '../../pylib/json/succinct_trie.json'); 17 | } 18 | 19 | if ( typeof exports !== "undefined" ) { 20 | exports.dictWordsJsonDir = getDictWordsJsonDir(); 21 | exports.BitsjsPath = getBitsjsPath(); 22 | exports.succinctTrieJsonPath = getSuccinctTrieJsonPath(); 23 | } 24 | -------------------------------------------------------------------------------- /search.go: -------------------------------------------------------------------------------- 1 | package bits 2 | 3 | /** 4 | * Given a word, returns array of words, prefix of which is word 5 | */ 6 | func (f *FrozenTrie) GetSuggestedWords(word string, limit int) []string { 7 | var result []string 8 | 9 | node := f.GetRoot() 10 | 11 | // find the node corresponding to the last char of input 12 | for _, runeValue := range word { 13 | var child FrozenTrieNode 14 | var j uint = 0 15 | for ; j < node.GetChildCount(); j++ { 16 | child = node.GetChild(j) 17 | if child.letter == string(runeValue) { 18 | 
break 19 | } 20 | } 21 | 22 | // not found, return. 23 | if j == node.GetChildCount() { 24 | return result 25 | } 26 | 27 | node = child 28 | } 29 | 30 | // The node corresponding to the last letter of word is found. 31 | // Use this node as root. traversing the trie in level order. 32 | return f.traverseSubTrie(node, word, limit) 33 | } 34 | 35 | func (f *FrozenTrie) traverseSubTrie(node FrozenTrieNode, prefix string, limit int) []string { 36 | var result []string 37 | 38 | var level []FrozenTrieNode 39 | level = append(level, node) 40 | var prefixLevel []string 41 | prefixLevel = append(prefixLevel, prefix) 42 | 43 | for len(level) > 0 { 44 | nodeNow := level[0] 45 | level = level[1:] 46 | prefixNow := prefixLevel[0] 47 | prefixLevel = prefixLevel[1:] 48 | 49 | // if the prefix is a legal word. 50 | if nodeNow.final { 51 | result = append(result, prefixNow) 52 | if len(result) > limit { 53 | return result 54 | } 55 | } 56 | 57 | var i uint = 0 58 | for ; i < nodeNow.GetChildCount(); i++ { 59 | child := nodeNow.GetChild(i) 60 | level = append(level, child) 61 | prefixLevel = append(prefixLevel, prefixNow+child.letter) 62 | } 63 | } 64 | 65 | return result 66 | } 67 | -------------------------------------------------------------------------------- /search_test.go: -------------------------------------------------------------------------------- 1 | package bits 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestSearch(t *testing.T) { 9 | te := Trie{} 10 | te.Init() 11 | insertNotInAlphabeticalOrder(&te) 12 | teData := te.Encode() 13 | rd := CreateRankDirectory(teData, te.GetNodeCount()*2+1, L1, L2) 14 | 15 | ft := FrozenTrie{} 16 | ft.Init(teData, rd.GetData(), te.GetNodeCount()) 17 | 18 | if !reflect.DeepEqual(ft.GetSuggestedWords("a", 10), []string{"apple", "alphapha"}) { 19 | t.Error(`ft.GetSuggestedWords("a", 10) != []string{"apple", "alphapha"}`) 20 | } 21 | if len(ft.GetSuggestedWords("b", 10)) != 0 { 22 | 
// TrieNode is a node of the trie used while building the encoding. It is not
// needed by the decoder.
type TrieNode struct {
	letter   string
	final    bool
	children []*TrieNode
}

// Trie is a mutable trie that collects words before they are encoded.
type Trie struct {
	// previousWord is the word passed to the most recent Insert call.
	previousWord string
	root         *TrieNode
	// cache[k] is the node reached after the first k runes of previousWord;
	// cache[0] is always the root.
	cache     []*TrieNode
	nodeCount uint
}

// Init resets the trie to a single root node. It may be called again to
// discard everything inserted so far.
func (t *Trie) Init() {
	t.previousWord = ""
	t.root = &TrieNode{letter: " "}
	// Replace the cache rather than appending to it: after a second Init the
	// first entry must be the new root, not a node of the discarded trie.
	t.cache = []*TrieNode{t.root}
	t.nodeCount = 1
}

// GetNodeCount returns the number of nodes in the trie.
func (t *Trie) GetNodeCount() uint {
	return t.nodeCount
}

// Insert adds word to the trie. It is fastest if the words are inserted in
// alphabetical order, but any order is accepted. Init must have been called
// before the first Insert.
func (t *Trie) Insert(word string) {
	// Measure the prefix shared with the previous word, both in bytes (offset
	// into word) and in runes (index into the cache).
	commonPrefixWidth := 0
	commonRuneCount := 0
	for commonPrefixWidth < len(word) && commonPrefixWidth < len(t.previousWord) {
		r1, w1 := utf8.DecodeRuneInString(word[commonPrefixWidth:])
		r2, w2 := utf8.DecodeRuneInString(t.previousWord[commonPrefixWidth:])
		// Comparing widths as well keeps both byte offsets aligned when an
		// invalid byte decodes to the same replacement rune as a valid one.
		if r1 != r2 || w1 != w2 {
			break
		}
		commonPrefixWidth += w1
		commonRuneCount++
	}

	// Drop cached nodes that are not on the shared prefix.
	t.cache = t.cache[:commonRuneCount+1]
	node := t.cache[commonRuneCount]

	for i := commonPrefixWidth; i < len(word); {
		runeValue, width := utf8.DecodeRuneInString(word[i:])
		i += width
		letter := string(runeValue)

		// Words may arrive out of order, so the branch can already exist.
		var next *TrieNode
		for _, child := range node.children {
			if child.letter == letter {
				next = child
				break
			}
		}
		if next == nil {
			next = &TrieNode{letter: letter}
			t.nodeCount++
			node.children = append(node.children, next)
		}

		t.cache = append(t.cache, next)
		node = next
	}

	node.final = true
	t.previousWord = word
}

// Apply calls fn once for every node, traversing the trie in level order.
func (t *Trie) Apply(fn func(*TrieNode)) {
	queue := make([]*TrieNode, 0, t.nodeCount)
	queue = append(queue, t.root)
	for head := 0; head < len(queue); head++ {
		node := queue[head]
		queue = append(queue, node.children...)
		fn(node)
	}
}
120 | */ 121 | func (t *Trie) Encode() string { 122 | // Write the unary encoding of the tree in level order. 123 | bits := BitWriter{} 124 | bits.Write(0x02, 2) 125 | t.Apply(func(node *TrieNode) { 126 | for i := 0; i < len(node.children); i++ { 127 | bits.Write(1, 1) 128 | } 129 | bits.Write(0, 1) 130 | }) 131 | 132 | // Write the data for each node, using (dataBits) bits for one node. 133 | // 1 bit stores the "final" indicator. The other (dataBits-1) bits store 134 | // one of the characters of the alphabet. 135 | t.Apply(func(node *TrieNode) { 136 | value, ok := mapCharToUint[node.letter] 137 | if !ok { 138 | panic("illegal character:" + node.letter) 139 | } 140 | if node.final { 141 | value |= (1 << (dataBits - 1)) 142 | } 143 | 144 | bits.Write(uint(value), dataBits) 145 | }) 146 | 147 | return bits.GetData() 148 | } 149 | -------------------------------------------------------------------------------- /trie_test.go: -------------------------------------------------------------------------------- 1 | package bits 2 | 3 | import "testing" 4 | 5 | func insertInAlphabeticalOrder(te *Trie) { 6 | te.Insert("alphapha") 7 | te.Insert("apple") 8 | te.Insert("hello") 9 | te.Insert("jello") 10 | te.Insert("lamp") 11 | te.Insert("orange") 12 | te.Insert("quiz") 13 | } 14 | 15 | func insertNotInAlphabeticalOrder(te *Trie) { 16 | te.Insert("apple") 17 | te.Insert("orange") 18 | te.Insert("alphapha") 19 | te.Insert("lamp") 20 | te.Insert("hello") 21 | te.Insert("jello") 22 | te.Insert("quiz") 23 | } 24 | 25 | func TestTrie(t *testing.T) { 26 | te := Trie{} 27 | te.Init() 28 | insertInAlphabeticalOrder(&te) 29 | teData := te.Encode() 30 | t.Log(teData) 31 | t.Log(te.GetNodeCount()) 32 | if teData != "v2qqqqqqqpIUjQA5JZyBZ4ggCKh55ZZgBA5ZZd5vIEl1wx8g8A" { 33 | t.Error("Expected v2qqqqqqqpIUjQA5JZyBZ4ggCKh55ZZgBA5ZZd5vIEl1wx8g8A, got ", teData) 34 | } 35 | if te.GetNodeCount() != 37 { 36 | t.Error("Expected 37, got ", te.GetNodeCount()) 37 | } 38 | rd := 
CreateRankDirectory(teData, te.GetNodeCount()*2+1, L1, L2) 39 | if rd.GetData() != "BMIg" { 40 | t.Error("Expected BMIg, got ", rd.GetData()) 41 | } 42 | t.Log(rd.GetData()) 43 | } 44 | --------------------------------------------------------------------------------