├── .gitignore ├── LICENSE ├── README.md ├── bench.go ├── bitreader.go ├── bitreader_test.go ├── boltiddb.go ├── bucket_execution.png ├── customlineardocitr.go ├── customlineardocitr_test.go ├── custommapdocitr.go ├── custommapdocitr_test.go ├── dataset_tools ├── census_p_rec_gen.sh └── sample.csv ├── db.go ├── db_test.go ├── diffdocitr.go ├── docitr.go ├── elastic.go ├── fielddocitr.go ├── fielddocitr_test.go ├── fsscoredb.go ├── fsscoredb_test.go ├── http.go ├── memorydb.go ├── memorydb_test.go ├── memorydocitr.go ├── migratabledb.go ├── mindocitr.go ├── powdocitr.go ├── productdocitr.go ├── productdocitr_test.go ├── scale_performance.png ├── scaledocitr.go ├── scoredb └── main.go ├── shardeddb.go ├── shardeddb_test.go ├── stub.go └── sumdocitr.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | data 10 | 11 | # Architecture specific extensions/prefixes 12 | *.[568vq] 13 | [568vq].out 14 | 15 | *.cgo1.go 16 | *.cgo2.c 17 | _cgo_defun.c 18 | _cgo_gotypes.go 19 | _cgo_export.* 20 | 21 | _testmain.go 22 | 23 | *.exe 24 | *.test 25 | *.prof 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Phillip Schanely 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this 
permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scoredb 2 | 3 | A simple database index optimized for returning results by custom scoring functions. 4 | 5 | To my knowledge, it is the only open source system with an algorithm designed for this purpose; in some cases, it is faster than elasticsearch's implementation by an order of magnitude. (see below) 6 | 7 | # Why? 8 | 9 | Scoredb is optimized for systems that want to find the top scoring results, where the scoring function is specified by the client, 10 | and may depend on more than one field. 11 | It may be a good choice for any system that needs to incorporate multiple factors when returning results. 12 | For instance, it might power a used car website to produce results based on factors like mileage, year, and distance. 13 | 14 | 15 | # Run It 16 | 17 | Though Scoredb has a straightforward programatic interface, you can run a simple standalone HTTP server like so: 18 | 19 | ``` 20 | $ go get github.com/pschanely/scoredb 21 | $ go install github.com/pschanely/scoredb/... 22 | $ ${GOPATH}/bin/scoredb serve -datadir my_data_directory -port 11625 23 | ``` 24 | ... 
Though Scoredb has a straightforward programmatic interface, you can run a simple standalone HTTP server like so:
48 | 49 | The following graph shows bucket elimination over the course of an example query combining two fields, "age" and "wages": 50 | 51 | 52 | 53 | 54 | # Performance 55 | 56 | Few database systems support custom scoring functions, and fewer (possibly none?) use algorithms designed for that purpose. 57 | In practice, I've found elasticsearch's 58 | [custom scoring functions](https://www.elastic.co/guide/en/elasticsearch/reference/0.90/query-dsl-function-score-query.html#query-dsl-function-score-query) 59 | to be quite fast, so I've benchmarked against it here. Please let me know about other systems I might benchmark against! 60 | 61 | This is a graph of how 5 different queries perform with varying database sizes (yellow is elasticsearch and blue is scoredb): 62 | 63 | 64 | 65 | The elasticsearch query times (yellow) look like they're rising exponentially, but it's actually linear because the X-axis has a logarithmic scale. 66 | 67 | The dataset is anonymized US census data, each object representing an individual. These are the 5 scoring functions used for benchmarking, in order from fastest to slowest (for scoredb): 68 | 69 | ``` 70 | 10 * number_of_children + age 71 | 10000 * age + yearly_wages 72 | 100 * age + yearly_wages 73 | 40 * gender + weekly_work_hours 74 | 100.0 * gender + 9 * num_children + age + weekly_work_hours 75 | 5 * num_children + age + weekly_work_hours 76 | ``` 77 | 78 | This is an unscientific test! Just my personal laptop, [this datafile](http://millstonecw.com/censusdata.csv.bz2) repeated a few times over for the biggest datasets, and `scoredb benchmark -maxrecords 10000000 -csv censusdata.csv`. There's no substitute for testing with your own data, queries, and hardware. 79 | 80 | It's clear from the graph that scoredb's performance can vary significantly based on the scoring function. 
81 | Some guidance on scoring: 82 | 83 | * Prefer to combine fields with addition, multiplication, and, in particular, minimum, because they allow the computation of useful lower bounds. Combining fields with a max() function does not, because a bad value in one field can be completely overcome by a good value in another. 84 | * Combining many fields instead of a few will make the query take longer, because it takes longer to determine useful lower bounds on each field. 85 | * Prefer to engineer weights so that the contributions from each of your fields is similar in scale. Scoredb may never be able to find useful bounds on fields that tweak the final score very slightly. 86 | 87 | 88 | # Limitations 89 | 90 | Scoredb is minimalistic and highly specialized; it is intended to just act as one piece of a larger system: 91 | * Scoredb **has no delete or update operation**. To remove or change an object, you must build a new index. See below for how to swap a new index in under a running instance without downtime. 92 | * It stores objects as a flat set of key-value pairs with string keys and numeric values only. (internally, all values are 32 bit floating point values) 93 | * Scoredb can only respond to queries with lists of identifiers; scoredb's indexes do not provide efficient access to the original field data. 94 | * Scoredb has no built-in clustering, redundancy, or backup functions. 95 | * Adding objects to scoredb is slow if you add them one at a time. Bulk insertion should be used whenever possible. 96 | * Scoredb requires many open files; sometimes thousands of them. You will need to increase default filehandle limits on your system (see "ulimit" on linux). 97 | * Scoredb expects you to provide every field for every object; objects that are missing a field cannot be returned from queries that use the missing fields. 98 | * Scoredb data files are endian specific; most modern CPUs are little endian, so you won't normally have to worry about this. 
If you need deletes or updates, you'll have to periodically rebuild your database and swap in updated versions.
If you specify the -automigrate option to the server, it will look for new database directories that begin with the given data directory
and keep the (lexicographically largest) one live.
141 | * Example: `["scale", 2.0, ["field", "age"]]` (age, doubled) 142 | 143 | #### `["sum", , , ...]` 144 | Sums the results of each ``. 145 | * Example: `["sum", ["field", "age"], ["field", "height"]]` (add age and height together) 146 | 147 | #### `["product", , , ...]` 148 | Multiplies the result of each `` together. For bounding reasons, negative inputs are not allowed. 149 | * Example: `["product", ["field", "age"], ["field", "height"]]` (multiply age by height) 150 | 151 | #### `["min", , , ...]` 152 | Takes the least score resulting from all ``s. 153 | * Example: `["min", ["field", "age"], ["field", "height"]]` (Take age or height, whichever is smaller) 154 | 155 | ####`["diff", , ]` 156 | Returns the absolute difference between the values produced by both subexpressions. 157 | * Example: `["diff", ["field", "age"], ["field", "height"]]` (the difference between each age and height) 158 | 159 | #### `["pow", , ]` 160 | Raises the result from the given subexpression to the `` power. 161 | `` may be fractional (for Nth roots) or negative. 162 | However, for bounding reasons, the subexpression may not produce negative values. 163 | * Example: `["pow", ["field", "age"], 2.0]` (age, squared) 164 | 165 | #### `["custom_linear", [[, ], [, ], ..], ]` 166 | Establishes a user-defined function using a set of linearly interpolated [x, y] points. 167 | Inputs smaller than the smallest X value or larger than the largest X value get the closest specified Y value. 168 | * Example: `["custom_linear", [[0, 0.0], [30, 1.0], [80, 0.0]], ["field", "age"]]` Maping ages to scores: 30 year-olds get a score of one, gradually declining to a score of zero for infants and the elderly. 169 | 170 | #### `["geo_distance", , , , ]` 171 | Returns the distance to a fixed point in kilometers as a score. 172 | This is experimental: may be inaccurate for large distances, and fails across the prime meridian. 
173 | Since you typically want smaller distances to have higher scores, you'll probably want to wrap the "scale" or "custom_linear" functions around this one to invert it. 174 | * Example: `["geo_distance", 40.7, -74.0, "home_lat", "home_lng"]` Scores each result by how far its home_lat and home_lng fields put it from New York City. 175 | 176 | 177 | # Status 178 | 179 | Though it has reasonable test coverage and a small, straightforward codebase, scoredb is certainly alpha-quality software. 180 | 181 | Your bug reports are greatly appreciated. 182 | 183 | 184 | # Thanks 185 | 186 | Thanks are due to the [Samsung Accelerator](http://samsungaccelerator.com) which let us start this project as a hackathon proof of concept. Scoredb was built with this awesome team (in github lexicographic order!): 187 | 188 | * https://github.com/davidgljay 189 | * https://github.com/ploxiln 190 | * https://github.com/pschanely 191 | * https://github.com/rmarianski 192 | * https://github.com/sleepylemur 193 | 194 | 195 | # Plugs 196 | 197 | Check out of some of our other side projects too: 198 | 199 | * [wildflower-touch](https://github.com/pschanely/wildflower-touch) is proof-of-concept programming IDE and language for touch devices. 200 | * [music-tonight](http://musictonightapp.com) makes playlists of bands playing near you, tonight. 
201 | -------------------------------------------------------------------------------- /bench.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "bufio" 5 | "encoding/csv" 6 | "fmt" 7 | "io" 8 | "os" 9 | "strconv" 10 | "time" 11 | ) 12 | 13 | type LinearCombinationBackend interface { 14 | BulkIndex(records []Record) error 15 | LinearQuery(numResults int, coefs map[string]float32) []string 16 | } 17 | 18 | func (db BaseDb) LinearQuery(numResults int, weights map[string]float32) []string { 19 | scorer := make([]interface{}, len(weights)+1) 20 | scorer[0] = "sum" 21 | idx := 1 22 | for key, weight := range weights { 23 | scorer[idx] = []interface{}{"scale", weight, []interface{}{"field", key}} 24 | idx += 1 25 | } 26 | result, _ := db.Query(Query{ 27 | Limit: numResults, 28 | Scorer: scorer, 29 | }) 30 | return result.Ids 31 | } 32 | 33 | func RunBenchmark(db LinearCombinationBackend, csvFilename string, maxRecords int64) ([]int64, []int64, [][]int64, error) { 34 | fp, err := os.Open(csvFilename) 35 | if err != nil { 36 | return nil, nil, nil, err 37 | } 38 | defer fp.Close() 39 | 40 | bufReader := bufio.NewReader(fp) 41 | csvReader := csv.NewReader(bufReader) 42 | 43 | header, err := csvReader.Read() 44 | if err == io.EOF { 45 | return nil, nil, nil, fmt.Errorf("Missing csv header") 46 | } else if err != nil { 47 | return nil, nil, nil, fmt.Errorf("Error reading csv header") 48 | } 49 | 50 | // TODO ensure we have at least one value? 
51 | 52 | colMap := make(map[int]string, len(header)) 53 | for colIdx, colName := range header { 54 | colMap[colIdx] = colName 55 | } 56 | 57 | totalRecs := []int64{} 58 | indexTimes := []int64{} 59 | queryTimes := [][]int64{} 60 | nResults := 10 61 | weights := []map[string]float32{ 62 | map[string]float32{ 63 | "age": 100.0, 64 | "wages": 1.0, 65 | }, 66 | map[string]float32{ 67 | "age": 10000.0, 68 | "wages": 1.0, 69 | }, 70 | map[string]float32{ 71 | "sex": 40.0, 72 | "weekly_work_hours": 1.0, 73 | }, 74 | map[string]float32{ 75 | "fertility": 10.0, 76 | "age": 1.0, 77 | }, 78 | map[string]float32{ 79 | "fertility": 5.0, 80 | "age": 1.0, 81 | "weekly_work_hours": 1.0, 82 | }, 83 | map[string]float32{ 84 | "sex": 100.0, 85 | "fertility": 9.0, 86 | "age": 1.0, 87 | "weekly_work_hours": 1.0, 88 | }, 89 | } 90 | 91 | bucketSize := 1000 92 | recordGroup := make([]Record, bucketSize) 93 | totalCount := int64(0) 94 | curGroupSize := 0 95 | 96 | for { 97 | row, err := csvReader.Read() 98 | if err == io.EOF { 99 | break 100 | } else if err != nil { 101 | return nil, nil, nil, fmt.Errorf("Error reading csv contents") 102 | } 103 | record := make(map[string]float32, len(row)) 104 | for fieldIdx, fieldValue := range row { 105 | recordKey, ok := colMap[fieldIdx] 106 | if !ok { 107 | // if we don't have header mappings, skip 108 | break 109 | } 110 | val64, err := strconv.ParseFloat(fieldValue, 32) 111 | if err != nil { 112 | continue 113 | } 114 | val32 := float32(val64) 115 | record[recordKey] = val32 116 | } 117 | if len(record) > 0 { 118 | // indexing one at a time 119 | // id := db.Index(record) 120 | // recordIndexIds = append(recordIndexIds, id) 121 | 122 | totalCount++ 123 | recordGroup[curGroupSize] = Record{Id: fmt.Sprintf("%d", totalCount), Values: record} 124 | curGroupSize++ 125 | if curGroupSize == bucketSize { 126 | t0 := time.Now().UnixNano() 127 | db.BulkIndex(recordGroup) 128 | totalRecs = append(totalRecs, totalCount) 129 | indexTimes = append(indexTimes, 
time.Now().UnixNano()-t0) 130 | queryRoundTimes := make([]int64, len(weights)) 131 | 132 | for idx, query := range weights { 133 | //fmt.Printf("%08d Q start\n", time.Now().UnixNano() % 100000000) 134 | t0 := time.Now().UnixNano() 135 | results := db.LinearQuery(nResults, query) 136 | queryTime := time.Now().UnixNano() - t0 137 | fmt.Printf("%08d Q results: %v\n", time.Now().UnixNano()%100000000, results) 138 | queryRoundTimes[idx] = queryTime 139 | } 140 | curGroupSize = 0 141 | queryTimes = append(queryTimes, queryRoundTimes) 142 | bucketSize += bucketSize * 2 143 | if totalCount >= maxRecords { 144 | break 145 | } 146 | if bucketSize > 100000 { 147 | bucketSize = 100000 148 | } 149 | recordGroup = make([]Record, bucketSize) 150 | } 151 | } 152 | } 153 | if curGroupSize > 0 { 154 | finalRecords := make([]Record, curGroupSize) 155 | copy(finalRecords, recordGroup) 156 | db.BulkIndex(finalRecords) 157 | } 158 | 159 | return totalRecs, indexTimes, queryTimes, nil 160 | } 161 | -------------------------------------------------------------------------------- /bitreader.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "github.com/edsrzf/mmap-go" 7 | "io" 8 | "os" 9 | "unsafe" 10 | ) 11 | 12 | type BitWriter struct { 13 | BufferedWriter *bufio.Writer 14 | File *os.File 15 | Cur uint64 16 | CurBitsUsed uint 17 | } 18 | 19 | func FileIsAtEnd(file *os.File) bool { 20 | stat, _ := file.Stat() 21 | pos, _ := file.Seek(0, 1) 22 | return pos == stat.Size() 23 | } 24 | 25 | func WriteNativeLong(val uint64, writer io.Writer) error { 26 | byteSlice := (*((*[8]byte)(unsafe.Pointer(&val))))[:] 27 | _, err := writer.Write(byteSlice) 28 | return err 29 | } 30 | 31 | func ReadNativeLong(buf []byte) uint64 { 32 | return *((*uint64)(unsafe.Pointer(&buf[0]))) 33 | } 34 | 35 | func NewBitWriter(file *os.File) (*BitWriter, error) { 36 | writer := BitWriter{File: file} 37 | if 
!FileIsAtEnd(file) { 38 | buf := make([]byte, 16) 39 | 40 | file.Seek(-16, 2) // Goto EOF (whence=2 means "relative to end") 41 | nRead, err := file.Read(buf) 42 | if nRead != 16 { 43 | return nil, err 44 | } 45 | writer.CurBitsUsed = uint(ReadNativeLong(buf[8:])) 46 | writer.Cur = ReadNativeLong(buf) >> (64 - writer.CurBitsUsed) 47 | 48 | file.Seek(-16, 2) // Goto EOF (whence=2 means "relative to end") 49 | } 50 | writer.BufferedWriter = bufio.NewWriter(file) 51 | return &writer, nil 52 | } 53 | 54 | func (writer *BitWriter) Close() error { 55 | bitsUsed := writer.CurBitsUsed 56 | WriteNativeLong(writer.Cur<<(64-bitsUsed), writer.BufferedWriter) 57 | WriteNativeLong(uint64(bitsUsed), writer.BufferedWriter) 58 | err := writer.BufferedWriter.Flush() 59 | if err != nil { 60 | return err 61 | } 62 | return writer.File.Close() 63 | } 64 | 65 | func (writer *BitWriter) WriteBits(val uint64, numBits uint) error { // assumes val is all zeros above numBits 66 | cur, bitsUsed := writer.Cur, writer.CurBitsUsed 67 | overflow := int(bitsUsed+numBits) - 64 68 | if overflow >= 0 { // split the write 69 | initialBits := numBits - uint(overflow) 70 | cur = (cur << initialBits) | (val >> uint(overflow)) 71 | err := WriteNativeLong(cur, writer.BufferedWriter) 72 | if err != nil { 73 | return err 74 | } 75 | writer.Cur = val 76 | writer.CurBitsUsed = uint(overflow) 77 | } else { 78 | writer.Cur = (cur << numBits) | val 79 | writer.CurBitsUsed += numBits 80 | } 81 | return nil 82 | } 83 | 84 | func (writer *BitWriter) WriteVarUInt32(val uint32) error { 85 | var sizeFactor uint64 86 | if val&0xfffffff0 == 0 { 87 | sizeFactor = 0 88 | } else if val&0xffffff00 == 0 { 89 | sizeFactor = 1 90 | } else if val&0xffff0000 == 0 { 91 | sizeFactor = 2 92 | } else { 93 | sizeFactor = 3 94 | } 95 | writer.WriteBits(sizeFactor, 2) 96 | numBits := uint(4 << sizeFactor) 97 | writer.WriteBits(uint64(val), numBits) 98 | return nil 99 | } 100 | 101 | type BitReader struct { 102 | OrigMmap *mmap.MMap 103 
| Mmap []uint64 104 | MmapPtr uint 105 | MmapPtrBitsLeft uint 106 | File *os.File 107 | Cur uint64 108 | CurBitsLeft uint 109 | } 110 | 111 | func NewBitReader(file *os.File) (*BitReader, error) { 112 | mapSlice, err := mmap.Map(file, mmap.RDONLY, 0) 113 | if err != nil { 114 | panic(err) 115 | } 116 | curPos, err := file.Seek(0, 1) 117 | if curPos%8 != 0 { 118 | panic(fmt.Sprintf("BitReader started at byte %v; must be 8 byte aligned", curPos)) 119 | } 120 | return &BitReader{ 121 | File: file, 122 | OrigMmap: &mapSlice, 123 | Mmap: (*((*[10000000]uint64)(unsafe.Pointer(&mapSlice[0]))))[:], 124 | MmapPtr: uint(curPos / 8), 125 | MmapPtrBitsLeft: 64, 126 | }, nil 127 | } 128 | 129 | func (reader *BitReader) Close() error { 130 | reader.Mmap = []uint64{} 131 | err := reader.OrigMmap.Unmap() 132 | if err != nil { 133 | return err 134 | } 135 | return reader.File.Close() 136 | } 137 | 138 | func (reader *BitReader) Refill(cur uint64, bitsLeft uint, numNeeded uint) (uint64, uint, error) { 139 | wanted := 64 - bitsLeft 140 | if wanted >= reader.MmapPtrBitsLeft { 141 | bits := reader.Mmap[reader.MmapPtr] << (64 - reader.MmapPtrBitsLeft) 142 | cur = cur | (bits >> bitsLeft) 143 | bitsLeft += reader.MmapPtrBitsLeft 144 | wanted -= reader.MmapPtrBitsLeft 145 | reader.MmapPtrBitsLeft = 64 146 | reader.MmapPtr += 1 147 | if wanted == 0 { 148 | return cur, bitsLeft, nil 149 | } 150 | } 151 | bits := reader.Mmap[reader.MmapPtr] << (64 - reader.MmapPtrBitsLeft) 152 | cur = cur | (bits >> bitsLeft) 153 | reader.MmapPtrBitsLeft -= wanted 154 | bitsLeft = 64 155 | return cur, bitsLeft, nil 156 | } 157 | 158 | func (reader *BitReader) ReadBits(numBits uint) (uint64, error) { 159 | cur, bitsLeft := reader.Cur, reader.CurBitsLeft 160 | var err error 161 | if bitsLeft < numBits { 162 | cur, bitsLeft, err = reader.Refill(cur, bitsLeft, numBits) 163 | if err != nil { 164 | return 0, err 165 | } 166 | } 167 | val := cur >> (64 - numBits) 168 | cur = cur << numBits 169 | bitsLeft -= numBits 
170 | reader.Cur, reader.CurBitsLeft = cur, bitsLeft 171 | return val, nil 172 | } 173 | 174 | func (reader *BitReader) ReadVarUInt32() (uint32, error) { 175 | sizeFactor, err := reader.ReadBits(2) 176 | if err != nil { 177 | return 0, err 178 | } 179 | numNeeded := uint(4 << sizeFactor) 180 | val, err := reader.ReadBits(numNeeded) 181 | if err != nil { 182 | return 0, err 183 | } 184 | return uint32(val), nil 185 | } 186 | -------------------------------------------------------------------------------- /bitreader_test.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | ) 7 | 8 | func TestBitReader(t *testing.T) { 9 | filename := RmAllTestData()("bitreader") 10 | defer RmAllTestData() 11 | 12 | file, err := os.Create(filename) 13 | if err != nil { 14 | t.Fatalf("%v", err) 15 | } 16 | 17 | wtr, err := NewBitWriter(file) 18 | if err != nil { 19 | t.Fatalf("%v", err) 20 | } 21 | wtr.WriteVarUInt32(7) 22 | wtr.WriteBits(42, 21) 23 | wtr.WriteVarUInt32(0) 24 | wtr.WriteVarUInt32(1) 25 | wtr.WriteVarUInt32(2) 26 | wtr.WriteVarUInt32(123) 27 | wtr.WriteVarUInt32(12345) 28 | wtr.WriteVarUInt32(1234567) 29 | wtr.WriteVarUInt32(123456789) 30 | err = wtr.Close() 31 | if err != nil { 32 | t.Fatalf("%v", err) 33 | } 34 | 35 | // try adding mroe stuff at the end 36 | file, err = os.OpenFile(filename, os.O_RDWR, 0666) 37 | if err != nil { 38 | t.Fatalf("%v", err) 39 | } 40 | wtr, err = NewBitWriter(file) 41 | if err != nil { 42 | t.Fatalf("%v", err) 43 | } 44 | wtr.WriteVarUInt32(7654321) 45 | err = wtr.Close() 46 | if err != nil { 47 | t.Fatalf("%v", err) 48 | } 49 | 50 | fd, err := os.OpenFile(filename, os.O_RDWR, 0666) 51 | if err != nil { 52 | t.Fatalf("%v", err) 53 | } 54 | rdr, err := NewBitReader(fd) 55 | if err != nil { 56 | t.Fatalf("%v", err) 57 | } 58 | val, err := rdr.ReadVarUInt32() 59 | if err != nil || val != 7 { 60 | t.Fatalf("val:%v, err:%v", val, err) 61 | } 62 | 
fixedval, err := rdr.ReadBits(21) 63 | if err != nil || fixedval != 42 { 64 | t.Fatalf("val:%v, err:%v", fixedval, err) 65 | } 66 | val, err = rdr.ReadVarUInt32() 67 | if err != nil || val != 0 { 68 | t.Fatalf("val:%v, err:%v", val, err) 69 | } 70 | val, err = rdr.ReadVarUInt32() 71 | if err != nil || val != 1 { 72 | t.Fatalf("val:%v, err:%v", val, err) 73 | } 74 | val, err = rdr.ReadVarUInt32() 75 | if err != nil || val != 2 { 76 | t.Fatalf("val:%v, err:%v", val, err) 77 | } 78 | val, err = rdr.ReadVarUInt32() 79 | if err != nil || val != 123 { 80 | t.Fatalf("val:%v, err:%v", val, err) 81 | } 82 | val, err = rdr.ReadVarUInt32() 83 | if err != nil || val != 12345 { 84 | t.Fatalf("val:%v, err:%v", val, err) 85 | } 86 | val, err = rdr.ReadVarUInt32() 87 | if err != nil || val != 1234567 { 88 | t.Fatalf("val:%v, err:%v", val, err) 89 | } 90 | val, err = rdr.ReadVarUInt32() 91 | if err != nil || val != 123456789 { 92 | t.Fatalf("val:%v, err:%v", val, err) 93 | } 94 | val, err = rdr.ReadVarUInt32() 95 | if err != nil || val != 7654321 { 96 | t.Fatalf("val:%v, err:%v", val, err) 97 | } 98 | err = rdr.Close() 99 | if err != nil { 100 | t.Fatalf("%v", err) 101 | } 102 | 103 | } 104 | 105 | func TestBitReaderVolume(t *testing.T) { 106 | filename := RmAllTestData()("bitreader.volume") 107 | defer RmAllTestData() 108 | 109 | file, err := os.Create(filename) 110 | if err != nil { 111 | t.Fatalf("%v", err) 112 | } 113 | 114 | wtr, err := NewBitWriter(file) 115 | if err != nil { 116 | t.Fatalf("%v", err) 117 | } 118 | 119 | for i := 0; i < 200; i++ { 120 | wtr.WriteVarUInt32(uint32(i * i)) 121 | wtr.WriteBits(uint64(i), uint(i%23)+10) 122 | } 123 | err = wtr.Close() 124 | if err != nil { 125 | t.Fatalf("%v", err) 126 | } 127 | 128 | fd, err := os.OpenFile(filename, os.O_RDWR, 0666) 129 | if err != nil { 130 | t.Fatalf("%v", err) 131 | } 132 | rdr, err := NewBitReader(fd) 133 | if err != nil { 134 | t.Fatalf("%v", err) 135 | } 136 | for i := 0; i < 200; i++ { 137 | val, err := 
rdr.ReadVarUInt32() 138 | if err != nil || int(val) != i*i { 139 | t.Fatalf("val:%v, err:%v", val, err) 140 | } 141 | fixedval, err := rdr.ReadBits(uint(i%23) + 10) 142 | if err != nil || int(fixedval) != i { 143 | t.Fatalf("val:%v, err:%v", fixedval, err) 144 | } 145 | } 146 | err = rdr.Close() 147 | if err != nil { 148 | t.Fatalf("%v", err) 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /boltiddb.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "github.com/boltdb/bolt" 7 | ) 8 | 9 | func NewBoltIdDb(file string) (*BoltIdDb, error) { 10 | db, err := bolt.Open(file, 0600, nil) 11 | if err != nil { 12 | return nil, err 13 | } 14 | return &BoltIdDb{Db: db}, nil 15 | } 16 | 17 | type BoltIdDb struct { 18 | Db *bolt.DB 19 | } 20 | 21 | func encodeScoreId(id int64) []byte { 22 | var buf [9]byte 23 | slice := buf[:] 24 | sz := binary.PutVarint(slice, id) 25 | return slice[:sz] 26 | } 27 | 28 | var boltBucketName []byte = []byte("ScoreDbIds") 29 | 30 | func (db *BoltIdDb) Put(scoreIds []int64, clientIds []string) error { 31 | return db.Db.Update(func(tx *bolt.Tx) error { 32 | b, err := tx.CreateBucketIfNotExists([]byte(boltBucketName)) 33 | if err != nil { 34 | return err 35 | } 36 | for idx, scoreId := range scoreIds { 37 | err = b.Put(encodeScoreId(scoreId), []byte(clientIds[idx])) 38 | if err != nil { 39 | return err 40 | } 41 | } 42 | return nil 43 | }) 44 | } 45 | 46 | func (db *BoltIdDb) Get(scoreIds []int64) ([]string, error) { 47 | result := make([]string, len(scoreIds)) 48 | 49 | err := db.Db.View(func(tx *bolt.Tx) error { 50 | b := tx.Bucket([]byte(boltBucketName)) 51 | for idx, scoreId := range scoreIds { 52 | clientIdBytes := b.Get(encodeScoreId(scoreId)) 53 | if clientIdBytes == nil { 54 | return fmt.Errorf("Unable to find client id for internal id %d", scoreId) 55 | } 56 | result[idx] = 
// CustomPoint is one (X, Y) control point of a user-defined
// piecewise-linear function.
type CustomPoint struct {
	X, Y float32
}

// ComputeCustomFunc evaluates the piecewise-linear function described by
// points (sorted by ascending X) at x. Inputs left of the first point or
// right of the last clamp to that endpoint's Y value; anything in between
// is linearly interpolated along its segment.
func ComputeCustomFunc(x float32, points []CustomPoint) float32 {
	n := len(points)
	// Index of the first control point at or beyond x.
	idx := sort.Search(n, func(i int) bool { return points[i].X >= x })
	switch idx {
	case 0:
		return points[0].Y // clamp below the range
	case n:
		return points[n-1].Y // clamp above the range
	}
	left, right := points[idx-1], points[idx]
	frac := (x - left.X) / (right.X - left.X)
	return right.Y*frac + left.Y*(1.0-frac)
}
required 45 | outsideMin, outsideMax = outsideMax, outsideMin 46 | } 47 | // functions need not be monotonic, check for peaks inside the X range 48 | for _, point := range op.points { 49 | if point.X <= insideMin { 50 | continue 51 | } else if point.X >= insideMax { 52 | break 53 | } else { 54 | y := point.Y 55 | outsideMax = Max(outsideMax, y) 56 | outsideMin = Min(outsideMin, y) 57 | } 58 | } 59 | return outsideMin, outsideMax 60 | } 61 | func (op *CustomLinearDocItr) Close() { 62 | op.docItr.Close() 63 | } 64 | func (op *CustomLinearDocItr) Next(minId int64) bool { 65 | return op.docItr.Next(minId) 66 | } 67 | 68 | func CheckIntersection(yValue float32, p1, p2 CustomPoint, insideMin, insideMax *float32) { 69 | var xIntersect float32 70 | // intersect descending: y 3 at {3 3}-{6 1}: 0 71 | if p1.Y <= yValue && yValue <= p2.Y { // intersect while function is ascending 72 | earliness := (p2.Y - yValue) / (p2.Y - p1.Y) 73 | xIntersect = p1.X*earliness + p2.X*(1.0-earliness) 74 | } else if p1.Y >= yValue && yValue >= p2.Y { // intersect while function is descending 75 | lateness := (p1.Y - yValue) / (p1.Y - p2.Y) 76 | xIntersect = p2.X*lateness + p1.X*(1.0-lateness) 77 | } else { 78 | return 79 | } 80 | *insideMin = Min(xIntersect, *insideMin) 81 | *insideMax = Max(xIntersect, *insideMax) 82 | } 83 | 84 | func (op *CustomLinearDocItr) SetBounds(outsideMin, outsideMax float32) bool { 85 | insideMin, insideMax := PositiveInfinity, NegativeInfinity // start with impossible (inverted) range 86 | for idx := len(op.points) - 1; idx > 0; idx-- { 87 | p1 := op.points[idx-1] 88 | p2 := op.points[idx] 89 | CheckIntersection(outsideMin, p1, p2, &insideMin, &insideMax) 90 | CheckIntersection(outsideMax, p1, p2, &insideMin, &insideMax) 91 | if outsideMin <= p2.Y && p2.Y <= outsideMax { 92 | insideMin = Min(insideMin, p2.X) 93 | insideMax = Max(insideMax, p2.X) 94 | } 95 | } 96 | firstPoint := op.points[0] 97 | if outsideMin <= firstPoint.Y && firstPoint.Y <= outsideMax { 98 | 
insideMin = NegativeInfinity 99 | } 100 | lastPoint := op.points[len(op.points)-1] 101 | if outsideMin <= lastPoint.Y && lastPoint.Y <= outsideMax { 102 | insideMax = PositiveInfinity 103 | } 104 | return op.docItr.SetBounds(insideMin, insideMax) 105 | } 106 | -------------------------------------------------------------------------------- /customlineardocitr_test.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | ) 7 | 8 | func BoundsEqualish(actualMin, actualMax, expectedMin, expectedMax float32) bool { 9 | tolerance := 0.0000001 10 | if math.Abs(float64(actualMin-expectedMin)) > tolerance { 11 | return false 12 | } 13 | if math.Abs(float64(actualMax-expectedMax)) > tolerance { 14 | return false 15 | } 16 | return true 17 | } 18 | 19 | func TestComputeCustomFunc(t *testing.T) { 20 | v := ComputeCustomFunc(1.0, []CustomPoint{CustomPoint{0.0, 0.0}, CustomPoint{3.0, 3.0}}) 21 | if v != 1.0 { 22 | t.Fatalf("%v", v) 23 | } 24 | v = ComputeCustomFunc(-1, []CustomPoint{CustomPoint{0, 0}, CustomPoint{3, 3}}) 25 | if v != 0.0 { 26 | t.Fatalf("%v", v) 27 | } 28 | v = ComputeCustomFunc(3, []CustomPoint{CustomPoint{0, 0}, CustomPoint{3, 3}}) 29 | if v != 3.0 { 30 | t.Fatalf("%v", v) 31 | } 32 | } 33 | 34 | func TestCustomLinearDocItr(t *testing.T) { 35 | inside := NewMemoryScoreDocItr([]float32{-1, 0, 2, 8, 5, 9, 12}) 36 | outside := CustomLinearDocItr{ 37 | docItr: inside, 38 | points: []CustomPoint{ 39 | CustomPoint{0, 0}, // kind of a zig-zag function... 
40 | CustomPoint{3, 3}, 41 | CustomPoint{6, 1}, 42 | CustomPoint{9, 2}, 43 | }, 44 | } 45 | 46 | min, max := inside.GetBounds() 47 | if !BoundsEqualish(min, max, -1, 12) { 48 | t.Fatalf("%v:%v", min, max) 49 | } 50 | min, max = outside.GetBounds() 51 | if !BoundsEqualish(min, max, 0.0, 3.0) { 52 | t.Fatalf("%v:%v", min, max) 53 | } 54 | 55 | // should leave unchanged 56 | outside.SetBounds(0, 4) 57 | min, max = inside.GetBounds() 58 | if !BoundsEqualish(min, max, -1, 12) { 59 | t.Fatalf("%v:%v", min, max) 60 | } 61 | min, max = outside.GetBounds() 62 | if !BoundsEqualish(min, max, 0.0, 3.0) { 63 | t.Fatalf("%v:%v", min, max) 64 | } 65 | 66 | // nudge the start up some 67 | outside.SetBounds(0.5, 3) 68 | min, max = inside.GetBounds() 69 | if !BoundsEqualish(min, max, 0.5, 12) { 70 | t.Fatalf("%v:%v", min, max) 71 | } 72 | min, max = outside.GetBounds() 73 | if !BoundsEqualish(min, max, 0.5, 3.0) { 74 | t.Fatalf("%v:%v", min, max) 75 | } 76 | 77 | // chop off the end (leaves a hole in the middle of the function) 78 | outside.SetBounds(0.5, 1.5) 79 | min, max = inside.GetBounds() 80 | if !BoundsEqualish(min, max, 0.5, 7.5) { 81 | t.Fatalf("%v:%v", min, max) 82 | } 83 | min, max = outside.GetBounds() 84 | if !BoundsEqualish(min, max, 0.5, 3.0) { 85 | t.Fatalf("%v:%v", min, max) 86 | } 87 | 88 | // chop off most of the end 89 | outside.SetBounds(0.5, 0.9) 90 | min, max = inside.GetBounds() 91 | if !BoundsEqualish(min, max, 0.5, 0.9) { 92 | t.Fatalf("%v:%v", min, max) 93 | } 94 | min, max = outside.GetBounds() 95 | if !BoundsEqualish(min, max, 0.5, 0.9) { 96 | t.Fatalf("%v:%v", min, max) 97 | } 98 | 99 | } 100 | -------------------------------------------------------------------------------- /custommapdocitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import () 4 | 5 | // Remaps a value according to a user-specified mapping of values to scores 6 | type CustomMapDocItr struct { 7 | points 
map[float32]float32 8 | deflt float32 9 | docItr DocItr 10 | } 11 | 12 | func (op *CustomMapDocItr) ComputeCustomFunc(val float32) float32 { 13 | score, ok := op.points[val] 14 | if ok { 15 | return score 16 | } else { 17 | return op.deflt 18 | } 19 | } 20 | 21 | func (op *CustomMapDocItr) Name() string { return "CustomMapDocItr" } 22 | func (op *CustomMapDocItr) Cur() (int64, float32) { 23 | docId, score := op.docItr.Cur() 24 | return docId, op.ComputeCustomFunc(score) 25 | } 26 | func (op *CustomMapDocItr) GetBounds() (min, max float32) { 27 | insideMin, insideMax := op.docItr.GetBounds() 28 | outsideMin := op.deflt 29 | outsideMax := op.deflt 30 | for input, output := range op.points { 31 | if insideMin <= input && input <= insideMax { 32 | outsideMin = Min(outsideMin, output) 33 | outsideMax = Max(outsideMax, output) 34 | } 35 | } 36 | return outsideMin, outsideMax 37 | } 38 | func (op *CustomMapDocItr) Close() { 39 | op.docItr.Close() 40 | } 41 | func (op *CustomMapDocItr) Next(minId int64) bool { 42 | return op.docItr.Next(minId) 43 | } 44 | 45 | func (op *CustomMapDocItr) SetBounds(outsideMin, outsideMax float32) bool { 46 | if outsideMin <= op.deflt && op.deflt <= outsideMax { 47 | return true 48 | } 49 | 50 | insideMin, insideMax := PositiveInfinity, NegativeInfinity // start with impossible (inverted) range 51 | for input, output := range op.points { 52 | if outsideMin <= output && output <= outsideMax { 53 | insideMin = Min(insideMin, input) 54 | insideMax = Max(insideMax, input) 55 | } 56 | } 57 | return op.docItr.SetBounds(insideMin, insideMax) 58 | } 59 | -------------------------------------------------------------------------------- /custommapdocitr_test.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestCustomMapDocItr(t *testing.T) { 8 | inside := NewMemoryScoreDocItr([]float32{-1, 0, 2, 8, 5, 9, 12}) 9 | outside := CustomMapDocItr{ 10 | 
docItr: inside, 11 | deflt: 0.0, 12 | points: map[float32]float32{ // kind of a zig-zag function... 13 | -2: -2, 14 | 2: 2, 15 | 5: 3, 16 | 6: 1, 17 | }, 18 | } 19 | 20 | min, max := inside.GetBounds() 21 | if !BoundsEqualish(min, max, -1, 12) { 22 | t.Fatalf("%v:%v", min, max) 23 | } 24 | min, max = outside.GetBounds() 25 | if !BoundsEqualish(min, max, 0.0, 3.0) { 26 | t.Fatalf("%v:%v", min, max) 27 | } 28 | 29 | // should leave unchanged 30 | outside.SetBounds(-2, 4) 31 | min, max = inside.GetBounds() 32 | if !BoundsEqualish(min, max, -1, 12) { 33 | t.Fatalf("%v:%v", min, max) 34 | } 35 | min, max = outside.GetBounds() 36 | if !BoundsEqualish(min, max, 0.0, 3.0) { 37 | t.Fatalf("%v:%v", min, max) 38 | } 39 | 40 | // nudge the start up some 41 | outside.SetBounds(0.25, 3) 42 | min, max = inside.GetBounds() 43 | if !BoundsEqualish(min, max, 2, 6) { 44 | t.Fatalf("%v:%v", min, max) 45 | } 46 | min, max = outside.GetBounds() 47 | if !BoundsEqualish(min, max, 0.0, 3.0) { 48 | t.Fatalf("%v:%v", min, max) 49 | } 50 | 51 | outside.SetBounds(0.5, 1.5) 52 | min, max = inside.GetBounds() 53 | if !BoundsEqualish(min, max, 6, 6) { 54 | t.Fatalf("%v:%v", min, max) 55 | } 56 | min, max = outside.GetBounds() 57 | if !BoundsEqualish(min, max, 0, 1.0) { 58 | t.Fatalf("%v:%v", min, max) 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /dataset_tools/census_p_rec_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #for ZIPFILE in /mnt/census1990/census_1990/1990_PUMS_A/*.zip ; do 4 | # unzip -c $ZIPFILE 5 | #done | grep '^P' >census1990_people.dat 6 | 7 | INPUT=census1990_people.dat 8 | OUTPUT=census1990_people.csv 9 | 10 | # for this dataset, gawk output is different than mawk or nawk, for 7 records (out of millions) 11 | AWK=${AWK:-awk} 12 | 13 | COLUMNS=" 14 | age 15 | children 16 | depart_for_work 17 | traveltime_to_work 18 | weekly_work_hours 19 | 
last_week_work_hours 20 | carpool_riders 21 | income 22 | wages 23 | poverty_percentage 24 | sex 25 | military_service_years 26 | " 27 | 28 | (printf 'id' 29 | for COL in $COLUMNS ; do 30 | printf ',%s' "$COL" 31 | done 32 | printf '\n' ) >$OUTPUT 33 | 34 | $AWK '{ 35 | children = substr($0,89,2); 36 | children = (children == "00") ? 0 : int(children) - 1; 37 | 38 | printf("r%d,", NR); 39 | printf("%s,", substr($0, 15,2)); # age 40 | printf("%s,", children ); # children 41 | printf("%s,", substr($0,105,4)); # depart_for_work 42 | printf("%s,", substr($0,109,2)); # traveltime_to_work 43 | printf("%s,", substr($0,125,2)); # weekly_work_hours 44 | printf("%s,", substr($0, 93,2)); # last_week_work_hours 45 | printf("%s,", substr($0,104,1)); # carpool_riders 46 | printf("%s,", substr($0,133,6)); # income 47 | printf("%s,", substr($0,139,6)); # wages 48 | printf("%s,", substr($0, 41,3)); # poverty_percentage 49 | printf("%s,", substr($0, 11,1)); # sex 50 | printf("%s\n", substr($0, 83,2)); # military_service_years 51 | } ' <$INPUT >>$OUTPUT 52 | -------------------------------------------------------------------------------- /dataset_tools/sample.csv: -------------------------------------------------------------------------------- 1 | id,first,second,third 2 | r1,1,2,3 3 | r2,0.1,11.234,01.23 4 | r3,000,03,001 5 | -------------------------------------------------------------------------------- /db.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "container/heap" 5 | "errors" 6 | "fmt" 7 | "math" 8 | ) 9 | 10 | type Query struct { 11 | Offset int 12 | Limit int 13 | MinScore float32 14 | 15 | // mixed, nested arrays of strings and numbers describing a function; for example: ["sum", ["field", "age"], ["field", "height"]] 16 | Scorer []interface{} 17 | } 18 | 19 | type DocScore struct { 20 | DocId int64 21 | Score float32 22 | } 23 | 24 | type Record struct { 25 | Id string 26 | Values 
map[string]float32 27 | } 28 | 29 | type QueryResult struct { 30 | Ids []string 31 | Scores []float32 32 | } 33 | 34 | // Three layers of database interfaces, each one wrapping the next: 35 | 36 | type Db interface { // Outermost interface; clients use this 37 | BulkIndex(records []Record) error 38 | Index(id string, values map[string]float32) error 39 | Query(query Query) (QueryResult, error) 40 | } 41 | 42 | type StreamingDb interface { // Uses a DocItr based query, useful for middleware that alters or combines result streams 43 | BulkIndex(records []map[string]float32) ([]int64, error) 44 | QueryItr(Scorer []interface{}) (DocItr, error) 45 | } 46 | 47 | type DbBackend interface { // the minimal interface to implement storage (filesystem, memory, etc) 48 | BulkIndex(records []map[string]float32) ([]int64, error) 49 | FieldDocItr(field string) DocItr 50 | } 51 | 52 | type IdBackend interface { // stores a mapping from scoredb's identifiers to the clients' 53 | Put(scoreIds []int64, clientIds []string) error 54 | Get(scoreIds []int64) ([]string, error) 55 | } 56 | 57 | type BaseDb struct { 58 | StreamingDb StreamingDb 59 | IdDb IdBackend 60 | } 61 | 62 | func (db BaseDb) BulkIndex(records []Record) error { 63 | clientIds := make([]string, len(records)) 64 | values := make([]map[string]float32, len(records)) 65 | for idx, rec := range records { 66 | values[idx] = rec.Values 67 | clientIds[idx] = rec.Id 68 | } 69 | scoreIds, err := db.StreamingDb.BulkIndex(values) 70 | if err != nil { 71 | return err 72 | } 73 | return db.IdDb.Put(scoreIds, clientIds) 74 | } 75 | 76 | func (db BaseDb) Index(id string, values map[string]float32) error { 77 | return db.BulkIndex([]Record{Record{Id: id, Values: values}}) 78 | } 79 | 80 | func CandidateIsLess(r1, r2 DocScore) bool { 81 | s1, s2 := r1.Score, r2.Score 82 | if s1 < s2 { 83 | return true 84 | } else if s1 > s2 { 85 | return false 86 | } else { 87 | return r1.DocId < r2.DocId 88 | } 89 | } 90 | 91 | type BaseDbResultSet 
[]DocScore 92 | 93 | func (h BaseDbResultSet) Len() int { return len(h) } 94 | func (h BaseDbResultSet) Less(i, j int) bool { return CandidateIsLess(h[i], h[j]) } 95 | func (h BaseDbResultSet) Swap(i, j int) { h[i], h[j] = h[j], h[i] } 96 | func (h *BaseDbResultSet) Push(x interface{}) { 97 | *h = append(*h, x.(DocScore)) 98 | } 99 | func (h *BaseDbResultSet) Pop() interface{} { 100 | old := *h 101 | n := len(old) 102 | x := old[n-1] 103 | *h = old[0 : n-1] 104 | return x 105 | } 106 | 107 | func (db BaseDb) Query(query Query) (QueryResult, error) { 108 | itr, err := db.StreamingDb.QueryItr(query.Scorer) 109 | if err != nil { 110 | return QueryResult{}, err 111 | } 112 | minScore, offset, limit := query.MinScore, query.Offset, query.Limit 113 | if limit == 0 { // we short circuit this case because the code below assumes at least one result 114 | return QueryResult{Ids: []string{}}, nil 115 | } 116 | //fmt.Printf("> %+v\n", query); 117 | numResults := offset + limit 118 | resultData := make(BaseDbResultSet, 0, numResults+1) 119 | results := &resultData 120 | heap.Init(results) 121 | minCandidate := DocScore{Score: float32(math.Inf(-1))} 122 | maxScore := float32(math.Inf(1)) 123 | docId := int64(-1) 124 | var score float32 125 | for itr.Next(docId + 1) { 126 | docId, score = itr.Cur() 127 | if score < minScore { 128 | continue 129 | } 130 | candidate := DocScore{DocId: docId, Score: score} 131 | if CandidateIsLess(minCandidate, candidate) { 132 | heap.Push(results, candidate) 133 | if results.Len() > numResults { 134 | heap.Pop(results) 135 | minCandidate = resultData[0] 136 | itr.SetBounds(minCandidate.Score, maxScore) 137 | } 138 | } 139 | } 140 | itr.Close() 141 | 142 | for offset > 0 && len(resultData) > 0 { 143 | heap.Pop(results) 144 | offset -= 1 145 | } 146 | 147 | numResults = results.Len() 148 | var resultIds = make([]int64, numResults) 149 | var resultScores = make([]float32, numResults) 150 | for idx, _ := range resultIds { 151 | rec := 
heap.Pop(results).(DocScore) 152 | i := numResults - (idx + 1) 153 | resultIds[i] = rec.DocId 154 | resultScores[i] = rec.Score 155 | } 156 | //fmt.Printf("< %+v\n", resultIds); 157 | //fmt.Printf("< %+v\n", resultScores); 158 | 159 | clientIds, err := db.IdDb.Get(resultIds) 160 | if err != nil { 161 | return QueryResult{}, err 162 | } 163 | return QueryResult{Ids: clientIds, Scores: resultScores}, nil 164 | } 165 | 166 | func ToFloat32(val interface{}) (float32, error) { 167 | switch typed := val.(type) { 168 | case float32: 169 | return typed, nil 170 | case float64: 171 | return float32(typed), nil 172 | default: 173 | return 0.0, errors.New(fmt.Sprintf("Invalid value ('%s') given, must be floating point number", val)) 174 | } 175 | } 176 | 177 | func ToXyPoints(input interface{}) ([]CustomPoint, error) { 178 | switch inputPoints := input.(type) { 179 | case []interface{}: 180 | points := make([]CustomPoint, len(inputPoints)) 181 | for idx, inputPoint := range inputPoints { 182 | pair := inputPoint.([]interface{}) 183 | if len(pair) != 2 { 184 | return nil, fmt.Errorf("Invalid (x,y) point; found: '%v' instead", pair) 185 | } 186 | xPoint, err := ToFloat32(pair[0]) 187 | if err != nil { 188 | return nil, err 189 | } 190 | yPoint, err := ToFloat32(pair[1]) 191 | if err != nil { 192 | return nil, err 193 | } 194 | points[idx] = CustomPoint{xPoint, yPoint} 195 | } 196 | return points, nil 197 | default: 198 | return nil, fmt.Errorf("Expected array of (x,y) points; found: '%v' instead", input) 199 | } 200 | } 201 | 202 | // BaseStreamingDb : The usual way to bridge a StreamingDb to a DbBackend 203 | 204 | type BaseStreamingDb struct { 205 | Backend DbBackend 206 | } 207 | 208 | func (db BaseStreamingDb) BulkIndex(records []map[string]float32) ([]int64, error) { 209 | return db.Backend.BulkIndex(records) 210 | } 211 | 212 | func (db BaseStreamingDb) QueryItr(scorer []interface{}) (DocItr, error) { 213 | args := scorer[1:] 214 | switch scorer[0].(string) { 215 | case 
"sum": 216 | fieldItrs := make([]DocItr, len(args)) 217 | for idx, v := range args { 218 | itr, err := db.QueryItr(v.([]interface{})) 219 | if err != nil { 220 | return nil, err 221 | } 222 | fieldItrs[idx] = itr 223 | } 224 | return NewSumDocItr(fieldItrs), nil 225 | case "product": 226 | fieldItrs := make([]DocItr, len(args)) 227 | for idx, v := range args { 228 | itr, err := db.QueryItr(v.([]interface{})) 229 | if err != nil { 230 | return nil, err 231 | } 232 | fieldItrs[idx] = itr 233 | } 234 | return NewProductDocItr(fieldItrs), nil 235 | case "min": 236 | fieldItrs := make([]DocItr, len(args)) 237 | for idx, v := range args { 238 | itr, err := db.QueryItr(v.([]interface{})) 239 | if err != nil { 240 | return nil, err 241 | } 242 | fieldItrs[idx] = itr 243 | } 244 | return NewMinDocItr(fieldItrs), nil 245 | case "scale": 246 | if len(args) != 2 { 247 | return nil, errors.New("Wrong number of arguments to scale function") 248 | } 249 | itr, err := db.QueryItr(args[1].([]interface{})) 250 | if err != nil { 251 | return nil, err 252 | } 253 | weight := args[0] 254 | switch typed := weight.(type) { 255 | case float32: 256 | return &ScaleDocItr{typed, itr}, nil 257 | case float64: 258 | return &ScaleDocItr{float32(typed), itr}, nil 259 | default: 260 | return nil, errors.New(fmt.Sprintf("Invalid weight ('%s') given to scale function, must be floating point number", weight)) 261 | } 262 | case "diff": 263 | if len(args) != 2 { 264 | return nil, errors.New("Wrong number of arguments to diff function") 265 | } 266 | target, err := ToFloat32(args[0]) 267 | if err != nil { 268 | return nil, err 269 | } 270 | itr, err := db.QueryItr(args[1].([]interface{})) 271 | if err != nil { 272 | return nil, err 273 | } 274 | return &DiffDocItr{ 275 | target: target, 276 | itr: itr, 277 | }, nil 278 | case "pow": 279 | if len(args) != 2 { 280 | return nil, errors.New("Wrong number of arguments to pow function") 281 | } 282 | exp, err := ToFloat32(args[1]) 283 | if err != nil { 284 
| return nil, err 285 | } 286 | itr, err := db.QueryItr(args[0].([]interface{})) 287 | if err != nil { 288 | return nil, err 289 | } 290 | return &PowDocItr{ 291 | itr: itr, 292 | exp: exp, 293 | }, nil 294 | 295 | case "custom_map": 296 | if len(args) != 3 { 297 | return nil, errors.New("Wrong number of arguments to custom_map function") 298 | } 299 | 300 | points, err := ToXyPoints(args[0]) 301 | if err != nil { 302 | return nil, err 303 | } 304 | 305 | deflt, err := ToFloat32(args[1]) 306 | if err != nil { 307 | return nil, err 308 | } 309 | 310 | itr, err := db.QueryItr(args[2].([]interface{})) 311 | if err != nil { 312 | return nil, err 313 | } 314 | 315 | scoremap := make(map[float32]float32) 316 | for _, pt := range points { 317 | scoremap[pt.X] = pt.Y 318 | } 319 | return &CustomMapDocItr{ 320 | points: scoremap, 321 | deflt: deflt, 322 | docItr: itr, 323 | }, nil 324 | 325 | case "custom_linear": 326 | if len(args) != 2 { 327 | return nil, errors.New("Wrong number of arguments to custom_linear function") 328 | } 329 | 330 | inputPoints := args[0].([]interface{}) 331 | points := make([]CustomPoint, len(inputPoints)) 332 | for idx, inputPoint := range inputPoints { 333 | pair := inputPoint.([]interface{}) 334 | if len(pair) != 2 { 335 | return nil, fmt.Errorf("Invalid (x,y) point in custom_linear; found: '%v' instead", pair) 336 | } 337 | xPoint, err := ToFloat32(pair[0]) 338 | if err != nil { 339 | return nil, err 340 | } 341 | yPoint, err := ToFloat32(pair[1]) 342 | if err != nil { 343 | return nil, err 344 | } 345 | points[idx] = CustomPoint{xPoint, yPoint} 346 | } 347 | 348 | itr, err := db.QueryItr(args[1].([]interface{})) 349 | if err != nil { 350 | return nil, err 351 | } 352 | 353 | return &CustomLinearDocItr{ 354 | points: points, 355 | docItr: itr, 356 | }, nil 357 | 358 | case "geo_distance": 359 | if len(args) != 4 { 360 | return nil, errors.New("Wrong number of arguments to geo_distance function") 361 | } 362 | lat, err := ToFloat32(args[0]) 363 
| if err != nil { 364 | return nil, err 365 | } 366 | lng, err := ToFloat32(args[1]) 367 | if err != nil { 368 | return nil, err 369 | } 370 | latFieldName := args[2].(string) 371 | lngFieldName := args[3].(string) 372 | latItr := &DiffDocItr{target: lat, itr: db.Backend.FieldDocItr(latFieldName)} 373 | lngItr := &DiffDocItr{target: lng, itr: db.Backend.FieldDocItr(lngFieldName)} 374 | // bias longitude distances by approximate latitude (matters less at poles) 375 | multiplier := float32(math.Cos(float64(lat) * math.Pi / 180.0)) 376 | biasedLngItr := &ScaleDocItr{multiplier, lngItr} 377 | // square each component 378 | latSquaredItr := NewPowDocItr(latItr, 2.0) 379 | lngSquaredItr := NewPowDocItr(biasedLngItr, 2.0) 380 | // sum and square root 381 | distanceItr := NewPowDocItr(NewSumDocItr([]DocItr{latSquaredItr, lngSquaredItr}), 0.5) 382 | // convert degrees distance to radians and multiply by radius of the earth (in km) 383 | earthRadius := float32(6371.0 * math.Pi / 180.0) 384 | return &ScaleDocItr{earthRadius, distanceItr}, nil 385 | case "field": 386 | if len(args) != 1 { 387 | return nil, errors.New("Wrong number of arguments to field function") 388 | } 389 | key := args[0].(string) 390 | return db.Backend.FieldDocItr(key), nil 391 | default: 392 | return nil, errors.New(fmt.Sprintf("Scoring function '%s' is not recognized", scorer[0])) 393 | } 394 | } 395 | -------------------------------------------------------------------------------- /db_test.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "os" 7 | "path" 8 | "strings" 9 | "testing" 10 | ) 11 | 12 | func CallAndCheck(db Db, t *testing.T, r1 []string, limit int, scorer []interface{}) { 13 | r2, err := db.Query(Query{Limit: limit, Scorer: scorer, MinScore: float32(math.Inf(-1))}) 14 | if err != nil { 15 | t.Fatal(err) 16 | } 17 | if len(r1) != len(r2.Ids) { 18 | t.Fatalf("expected: %v found: %v", r1, r2) 19 | 
} 20 | for idx, v1 := range r1 { 21 | if v1 != r2.Ids[idx] { 22 | t.Fatalf("expected: %v found: %v", r1, r2) 23 | } 24 | } 25 | } 26 | 27 | func DbBasicsTest(db Db, t *testing.T) { 28 | err := db.Index("r1", map[string]float32{"age": 32, "height": 2.0, "lat": 45.0, "lon": -70.0}) 29 | if err != nil { 30 | t.Error(fmt.Sprintf("%v", err)) 31 | } 32 | err = db.Index("r2", map[string]float32{"age": 25, "height": 1.5, "lat": 43.0, "lon": -69.0}) 33 | if err != nil { 34 | t.Error(fmt.Sprintf("%v", err)) 35 | } 36 | err = db.Index("r3", map[string]float32{"age": 16, "height": 2.5, "lat": 45.0, "lon": -95.0}) 37 | if err != nil { 38 | t.Error(fmt.Sprintf("%v", err)) 39 | } 40 | CallAndCheck(db, t, []string{"r3", "r1"}, 2, []interface{}{"field", "height"}) 41 | CallAndCheck(db, t, []string{"r1", "r2"}, 2, []interface{}{"sum", 42 | []interface{}{"field", "age"}, 43 | []interface{}{"field", "height"}}) 44 | CallAndCheck(db, t, []string{"r1"}, 1, []interface{}{"sum", 45 | []interface{}{"field", "age"}, 46 | []interface{}{"field", "height"}}) 47 | CallAndCheck(db, t, []string{"r3", "r1"}, 2, []interface{}{"sum", 48 | []interface{}{"scale", 0.1, []interface{}{"field", "age"}}, 49 | []interface{}{"scale", 10.0, []interface{}{"field", "height"}}}) 50 | CallAndCheck(db, t, []string{"r3", "r2"}, 2, []interface{}{"sum", 51 | []interface{}{"scale", -1.0, []interface{}{"field", "age"}}, 52 | []interface{}{"scale", -1.0, []interface{}{"field", "height"}}}) 53 | CallAndCheck(db, t, []string{"r2", "r1", "r3"}, 3, []interface{}{"sum", 54 | []interface{}{"scale", 1.0, []interface{}{"field", "age"}}, 55 | []interface{}{"scale", -100.0, []interface{}{"field", "height"}}}) 56 | CallAndCheck(db, t, []string{}, 0, []interface{}{"sum", 57 | []interface{}{"field", "age"}, 58 | []interface{}{"field", "height"}}) 59 | CallAndCheck(db, t, []string{"r1", "r2", "r3"}, 3, []interface{}{"sum", 60 | []interface{}{"field", "age"}, 61 | []interface{}{"pow", []interface{}{"field", "height"}, 2.0}}) 62 | 
CallAndCheck(db, t, []string{"r3", "r1", "r2"}, 3, []interface{}{"sum", 63 | []interface{}{"field", "age"}, 64 | []interface{}{"pow", []interface{}{"field", "height"}, 10.0}}) 65 | CallAndCheck(db, t, []string{"r1", "r3", "r2"}, 3, []interface{}{"product", 66 | []interface{}{"field", "age"}, 67 | []interface{}{"field", "height"}}) 68 | CallAndCheck(db, t, []string{"r3", "r1", "r2"}, 3, []interface{}{"min", 69 | []interface{}{"field", "age"}, 70 | []interface{}{"field", "height"}}) 71 | CallAndCheck(db, t, []string{"r1", "r2", "r3"}, 3, []interface{}{"custom_linear", 72 | []interface{}{ // scores by closeness to age 30: 73 | []interface{}{float32(0), float32(0.0)}, 74 | []interface{}{float32(30), float32(1.0)}, 75 | []interface{}{float32(100), float32(0.0)}}, 76 | []interface{}{"field", "age"}}) 77 | CallAndCheck(db, t, []string{"r3", "r2", "r1"}, 3, []interface{}{"geo_distance", 45.0, -69.9, "lat", "lon"}) 78 | CallAndCheck(db, t, []string{"r3", "r1", "r2"}, 3, []interface{}{"geo_distance", 20.0, 70.0, "lat", "lon"}) 79 | } 80 | 81 | func RmAllTestData() func(name string) string { 82 | tmpDir := os.TempDir() 83 | dirfd, err := os.Open(tmpDir) 84 | if err == nil { 85 | names, err := dirfd.Readdirnames(0) 86 | if err == nil { 87 | for _, name := range names { 88 | if strings.HasPrefix(name, "scoredbtest.") { 89 | os.RemoveAll(path.Join(tmpDir, name)) 90 | } 91 | } 92 | } 93 | } 94 | return func(name string) string { 95 | fullname := path.Join(tmpDir, "scoredbtest."+name) 96 | return fullname 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /diffdocitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import () 4 | 5 | // (Absolute) difference between a value and a constant 6 | type DiffDocItr struct { 7 | target float32 8 | itr DocItr 9 | } 10 | 11 | func Abs(val float32) float32 { 12 | if val < 0 { 13 | return -val 14 | } else { 15 | return val 16 | } 17 
| } 18 | 19 | func Max(v1, v2 float32) float32 { 20 | if v1 < v2 { 21 | return v2 22 | } else { 23 | return v1 24 | } 25 | } 26 | 27 | func Min(v1, v2 float32) float32 { 28 | if v1 > v2 { 29 | return v2 30 | } else { 31 | return v1 32 | } 33 | } 34 | 35 | func (op *DiffDocItr) Name() string { return "DiffDocItr" } 36 | func (op *DiffDocItr) Cur() (int64, float32) { 37 | docId, score := op.itr.Cur() 38 | return docId, Abs(score - op.target) 39 | } 40 | func (op *DiffDocItr) GetBounds() (min, max float32) { 41 | target := op.target 42 | min, max = op.itr.GetBounds() 43 | d1 := Abs(min - target) 44 | d2 := Abs(max - target) 45 | maxDist := Max(d1, d2) 46 | if min <= target && target <= max { 47 | return 0.0, maxDist 48 | } else { 49 | return Min(d1, d2), maxDist 50 | } 51 | } 52 | func (op *DiffDocItr) Close() { 53 | op.itr.Close() 54 | } 55 | func (op *DiffDocItr) Next(minId int64) bool { 56 | return op.itr.Next(minId) 57 | } 58 | 59 | func (op *DiffDocItr) SetBounds(min, max float32) bool { 60 | // min is not useful to us right now 61 | target := op.target 62 | return op.itr.SetBounds(target-max, target+max) 63 | } 64 | -------------------------------------------------------------------------------- /docitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "math" 5 | ) 6 | 7 | var PositiveInfinity = float32(math.Inf(1)) 8 | var NegativeInfinity = float32(math.Inf(-1)) 9 | 10 | type DocItr interface { 11 | // An iterator over (document id, score) values. 12 | 13 | Name() string 14 | 15 | // return false if the iterator is now known to not produce any more values 16 | SetBounds(min, max float32) bool 17 | 18 | GetBounds() (min, max float32) 19 | 20 | // Next() skips the iterator ahead to at least as far as the given id. 21 | // It always advances the iterator at least one position. 22 | // It Returns false if there are no remaining values. 
23 | // Iterators need a call to Next(0) to intialize them to a real value; they all initially have a docId of -1 24 | Next(minId int64) bool 25 | 26 | Close() // release resources held by this iterator (if any) 27 | 28 | Cur() (int64, float32) // doc id and score of current result, or (-1, 0.0) if the iterator has not been initialized 29 | 30 | } 31 | -------------------------------------------------------------------------------- /elastic.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "fmt" 7 | "io/ioutil" 8 | "log" 9 | "net/http" 10 | "strconv" 11 | "strings" 12 | ) 13 | 14 | type EsScoreDb struct { 15 | BaseURL, Index string 16 | } 17 | 18 | func (db *EsScoreDb) BulkIndex(records []Record) error { 19 | var jsonbuf bytes.Buffer 20 | for _, rec := range records { 21 | jsonbuf.WriteString(fmt.Sprintf("{\"index\":{\"_id\":\"%s\"}}\n", rec.Id)) 22 | buf, err := json.Marshal(rec.Values) 23 | if err != nil { 24 | return err 25 | } 26 | jsonbuf.Write(buf) 27 | jsonbuf.WriteString("\n") 28 | } 29 | payload := jsonbuf.String() 30 | url := db.BaseURL + db.Index + "/external/_bulk" 31 | //fmt.Printf("Bulk: %v @ %v\n", payload, url) 32 | resp, err := http.Post(url, "application/json", strings.NewReader(payload)) 33 | if err != nil { 34 | panic(err) 35 | } 36 | body, _ := ioutil.ReadAll(resp.Body) 37 | resp.Body.Close() 38 | //fmt.Printf("Bulk resp: %+v\n", string(body)) 39 | var parsedResponse struct{ Errors bool } 40 | err = json.Unmarshal(body, &parsedResponse) 41 | if err != nil { 42 | panic(err) 43 | } 44 | if parsedResponse.Errors { 45 | panic(string(body)) 46 | } 47 | 48 | db.RefreshIndex() 49 | 50 | return nil 51 | } 52 | 53 | type EsQueryResponse struct { 54 | Hits struct { 55 | Hits []struct { 56 | Id string `json:"_id"` 57 | } `json:"hits"` 58 | } `json:"hits"` 59 | } 60 | 61 | func (db *EsScoreDb) LinearQuery(numResults int, weights map[string]float32) 
[]string { 62 | var scorefactors bytes.Buffer 63 | first := true 64 | for key, val := range weights { 65 | if !first { 66 | scorefactors.WriteString(",") 67 | } else { 68 | first = false 69 | } 70 | scorefactors.WriteString(fmt.Sprintf(`{"field_value_factor":{"field":"%s","factor":%f}}`, key, val)) 71 | } 72 | data := fmt.Sprintf(`{ 73 | "size":%d, 74 | "fields":[], 75 | "query":{ 76 | "function_score":{ 77 | "functions":[%s], 78 | "score_mode": "sum" 79 | } 80 | } 81 | }`, numResults, scorefactors.String()) 82 | resp, err := http.Post(db.BaseURL+db.Index+"/external/_search?pretty", "application/json", strings.NewReader(data)) 83 | if err != nil { 84 | panic(err) 85 | } 86 | body, _ := ioutil.ReadAll(resp.Body) 87 | resp.Body.Close() 88 | //fmt.Println(string(body)) 89 | queryResp := EsQueryResponse{} 90 | err = json.Unmarshal(body, &queryResp) 91 | if err != nil { 92 | panic(err) 93 | } 94 | hits := queryResp.Hits.Hits 95 | resultIds := make([]string, len(hits)) 96 | for idx, rec := range hits { 97 | resultIds[idx] = rec.Id 98 | } 99 | return resultIds 100 | } 101 | 102 | func (db *EsScoreDb) DeleteIndex() { 103 | req, _ := http.NewRequest("DELETE", db.BaseURL+db.Index, nil) 104 | resp, _ := http.DefaultClient.Do(req) 105 | body, _ := ioutil.ReadAll(resp.Body) 106 | resp.Body.Close() 107 | fmt.Println("Delete Index: " + string(body)) 108 | } 109 | 110 | func (db *EsScoreDb) CreateIndex() { 111 | payload := "{\"settings\": {\"index\": {\"number_of_shards\" : 1}}}" 112 | req, _ := http.NewRequest("PUT", db.BaseURL+db.Index, strings.NewReader(payload)) 113 | resp, _ := http.DefaultClient.Do(req) 114 | body, _ := ioutil.ReadAll(resp.Body) 115 | resp.Body.Close() 116 | fmt.Println("Create Index: " + string(body)) 117 | } 118 | 119 | func (db *EsScoreDb) RefreshIndex() { 120 | req, _ := http.NewRequest("POST", db.BaseURL+db.Index+"/_refresh", nil) 121 | resp, _ := http.DefaultClient.Do(req) 122 | ioutil.ReadAll(resp.Body) 123 | resp.Body.Close() 124 | 
//fmt.Println("Refresh Index: " + string(body)) 125 | } 126 | 127 | func (db *EsScoreDb) ParseQuery(query string) map[string]float32 { 128 | fields := strings.Split(query, ",") 129 | coefs := make(map[string]float32) 130 | for _, f := range fields { 131 | fieldparts := strings.Split(f, "=") 132 | if len(fieldparts) != 2 { 133 | log.Fatalf("ERROR: malformed query\n") 134 | } 135 | val, _ := strconv.ParseFloat(fieldparts[1], 32) 136 | coefs[fieldparts[0]] = float32(val) 137 | } 138 | return coefs 139 | } 140 | 141 | /* 142 | var ( 143 | deleteflag = flag.Bool("delete", false, "delete data from elasticsearch") 144 | queryflag = flag.String("query", "", "column_name=weighting_factor,...") 145 | urlflag = flag.String("esurl", "http://localhost:9200/", "URL to elasticsearch instance with trailing slash") 146 | indexflag = flag.String("index", "scoredb", "Elasticsearch index name") 147 | ) 148 | 149 | func main() { 150 | flag.Parse() 151 | db := NewEsScoreDb{BaseUrl: *urlflag, Index: *indexflag} 152 | if *deleteflag { 153 | db.DeleteData() 154 | } else if len(*queryflag) > 0 { 155 | db.LinearQuery(10, db.ParseQuery(*queryflag)) 156 | } else { 157 | fmt.Println("need to use --query querystring, or --delete") 158 | } 159 | } 160 | */ 161 | -------------------------------------------------------------------------------- /fielddocitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "container/heap" 5 | //"fmt" 6 | "math" 7 | //"time" 8 | ) 9 | 10 | type FieldDocItr struct { 11 | field string 12 | score float32 13 | docId int64 14 | min, max float32 15 | lists FieldDocItrs 16 | } 17 | 18 | func NewFieldDocItr(field string, lists FieldDocItrs) *FieldDocItr { 19 | itr := &FieldDocItr{ 20 | field: field, 21 | score: 0.0, 22 | docId: -1, 23 | lists: lists, 24 | } 25 | min, max := float32(math.Inf(1)), float32(math.Inf(-1)) 26 | for _, docItr := range lists { 27 | curMin, curMax := docItr.GetBounds() 28 | 
if curMin < min { 29 | min = curMin 30 | } 31 | if curMax > max { 32 | max = curMax 33 | } 34 | } 35 | itr.min, itr.max = min, max 36 | return itr 37 | } 38 | 39 | type FieldDocItrs []DocItr // FieldDocItrs implements heap.Interface 40 | func (so FieldDocItrs) Len() int { return len(so) } 41 | func (so FieldDocItrs) Less(i, j int) bool { 42 | d1, _ := so[i].Cur() 43 | d2, _ := so[j].Cur() 44 | return d1 < d2 45 | } 46 | func (so *FieldDocItrs) Pop() interface{} { 47 | old := *so 48 | n := len(old) 49 | item := old[n-1] 50 | *so = old[0 : n-1] 51 | return item 52 | } 53 | func (so *FieldDocItrs) Push(x interface{}) { 54 | *so = append(*so, x.(DocItr)) 55 | } 56 | func (so FieldDocItrs) Swap(i, j int) { 57 | so[i], so[j] = so[j], so[i] 58 | } 59 | 60 | func (op *FieldDocItr) Name() string { return "FieldDocItr" } 61 | func (op *FieldDocItr) Cur() (int64, float32) { 62 | return op.docId, op.score 63 | } 64 | func (op *FieldDocItr) GetBounds() (min, max float32) { 65 | return op.min, op.max 66 | } 67 | func (op *FieldDocItr) SetBounds(min, max float32) bool { 68 | op.min = min 69 | op.max = max 70 | for { 71 | keepGoing := false 72 | anyMore := false 73 | for idx, subOp := range op.lists { 74 | if subOp.SetBounds(min, max) { 75 | anyMore = true 76 | } else { 77 | subOp.Close() 78 | lists := op.lists 79 | lists[idx] = lists[len(lists)-1] 80 | op.lists = lists[:len(lists)-1] 81 | keepGoing = true 82 | //fmt.Printf("%08d Field elim @doc %08d, %05d remain (%s)\n", time.Now().UnixNano() % 100000000, op.docId, len(op.lists), op.field) 83 | break 84 | } 85 | } 86 | if !keepGoing { 87 | return anyMore 88 | } 89 | heap.Init(&op.lists) 90 | } 91 | } 92 | 93 | func (op *FieldDocItr) Close() { 94 | for _, list := range op.lists { 95 | list.Close() 96 | } 97 | } 98 | 99 | func (op *FieldDocItr) Next(minId int64) bool { 100 | if len(op.lists) == 0 { 101 | return false 102 | } 103 | var docId int64 104 | var score float32 105 | for { 106 | docId, score = op.lists[0].Cur() 107 | if 
docId >= minId { 108 | break 109 | } 110 | if !op.lists[0].Next(minId) { 111 | op.lists[0].Close() 112 | heap.Remove(&op.lists, 0) 113 | if len(op.lists) == 0 { 114 | //fmt.Printf("FieldDocItr Next(%v) %v END\n", minId, op.field) 115 | return false 116 | } 117 | } else { 118 | heap.Fix(&op.lists, 0) 119 | } 120 | } 121 | op.docId = docId 122 | op.score = score 123 | //fmt.Printf("FieldDocItr Next(%v) %v %v %v\n", minId, op.field, op.docId, op.score) 124 | return true 125 | } 126 | -------------------------------------------------------------------------------- /fielddocitr_test.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestFieldOp(t *testing.T) { 8 | l1 := NewMemoryDocItr( 9 | []float32{1.0, 1.0, 0.5, 1.0, 0.5}, 10 | []int64{1, 5, 7, 8, 9}, 11 | ) 12 | l2 := NewMemoryDocItr( 13 | []float32{1.0, 1.0}, 14 | []int64{2, 5}, 15 | ) 16 | fieldop := FieldDocItr{lists: FieldDocItrs{l1, l2}} 17 | if !fieldop.Next(0) { 18 | t.FailNow() 19 | } 20 | docId, _ := fieldop.Cur() 21 | if docId != 1 { 22 | t.FailNow() 23 | } 24 | if !fieldop.Next(2) { 25 | t.FailNow() 26 | } 27 | docId, _ = fieldop.Cur() 28 | if docId != 2 { 29 | t.FailNow() 30 | } 31 | if !fieldop.Next(3) { 32 | t.FailNow() 33 | } 34 | docId, _ = fieldop.Cur() 35 | if docId != 5 { 36 | t.FailNow() 37 | } 38 | if !fieldop.SetBounds(0.75, 1.0) { 39 | t.FailNow() 40 | } 41 | if !fieldop.Next(6) { 42 | t.FailNow() 43 | } 44 | docId, _ = fieldop.Cur() 45 | if docId != 8 { 46 | t.FailNow() 47 | } 48 | if fieldop.Next(9) { 49 | t.FailNow() 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /fsscoredb.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "encoding/binary" 5 | "errors" 6 | "fmt" 7 | "io/ioutil" 8 | "math" 9 | "os" 10 | "path" 11 | "strconv" 12 | //"time" 13 | ) 14 | 15 | func 
NewFsScoreDb(dataDir string) *FsScoreDb { 16 | err := EnsureDirectory(dataDir) 17 | if err != nil { 18 | panic(err) 19 | } 20 | fields := make(map[string]OrderedFileInfos) 21 | 22 | // Load pre-existing file headers 23 | highestId := int64(0) 24 | fieldNames, err := ioutil.ReadDir(dataDir) 25 | if err != nil { 26 | panic(err) 27 | } 28 | for _, fieldName := range fieldNames { 29 | fieldPath := path.Join(dataDir, fieldName.Name()) 30 | fields[fieldPath] = make(OrderedFileInfos, 0) 31 | dataFiles, err := ioutil.ReadDir(fieldPath) 32 | if err != nil { 33 | panic(err) 34 | } 35 | for _, dataFile := range dataFiles { 36 | numVarBits := 32 - len(dataFile.Name()) 37 | prefixVal, err := strconv.ParseInt(dataFile.Name(), 2, 32) 38 | if err != nil { 39 | continue 40 | } 41 | dataFilePath := path.Join(fieldPath, dataFile.Name()) 42 | fd, err := os.OpenFile(dataFilePath, os.O_RDONLY, 0) 43 | if err != nil { 44 | panic(err) 45 | } 46 | var header PostingListHeader 47 | err = binary.Read(fd, binary.LittleEndian, &header) 48 | if err != nil { 49 | panic(err) 50 | } 51 | fd.Close() 52 | if header.LastDocId > highestId { 53 | highestId = header.LastDocId 54 | } 55 | fileInfo := &FileInfo{ 56 | header: &header, 57 | path: dataFilePath, 58 | numVariableBits: uint(numVarBits), 59 | minVal: math.Float32frombits(uint32(prefixVal << uint(numVarBits))), 60 | } 61 | fields[fieldName.Name()] = append(fields[fieldName.Name()], fileInfo) 62 | } 63 | 64 | } 65 | 66 | //fmt.Printf("INIT fs score db %v (highest id %d)\n", dataDir, highestId) 67 | return &FsScoreDb{ 68 | dataDir: dataDir, 69 | fields: fields, 70 | nextId: highestId + 1, 71 | } 72 | } 73 | 74 | type FsScoreDb struct { 75 | dataDir string 76 | fields map[string]OrderedFileInfos 77 | nextId int64 78 | } 79 | 80 | type PostingListHeader struct { 81 | FirstDocId int64 82 | LastDocId int64 83 | NumDocs int64 84 | MinVal float32 85 | MaxVal float32 86 | FirstDocScore float32 87 | Version uint8 88 | // padding to make struct 8-byte 
// EnsureDirectory creates dir, along with any missing parent directories,
// with mode 0755; it is a no-op if dir already exists.
//
// Fix: the previous hand-rolled recursion discarded the error from creating
// parent directories and raced between the existence check and the Mkdir
// call; os.MkdirAll provides the same semantics with proper error
// propagation.
func EnsureDirectory(dir string) error {
	return os.MkdirAll(dir, 0755)
}
var err error 148 | fieldDir := path.Join(db.dataDir, key) 149 | files, ok := db.fields[key] 150 | if !ok { 151 | db.fields[key] = make(OrderedFileInfos, 0) 152 | files = db.fields[key] 153 | EnsureDirectory(fieldDir) 154 | } 155 | var fileInfo *FileInfo = nil 156 | bestVarBits := uint(32) 157 | // TODO idea here is that we should be able to use the ordering of OrderedFileInfos to 158 | // binary search for the right one; right now this is just a simplistic linear scan 159 | for _, curFileInfo := range files { 160 | numVar := curFileInfo.numVariableBits 161 | if math.Float32bits(curFileInfo.minVal)>>numVar == math.Float32bits(value)>>numVar { 162 | if numVar < bestVarBits { 163 | fileInfo = curFileInfo 164 | bestVarBits = numVar 165 | } 166 | } 167 | } 168 | if fileInfo == nil { // no matching posting list found 169 | fileInfo, err = MakeFileInfo(fieldDir, value, INITIAL_VAR_BITS, docId) 170 | if err != nil { 171 | return nil, err 172 | } 173 | files = append(files, fileInfo) 174 | db.fields[key] = files 175 | if err != nil { 176 | return nil, err 177 | } 178 | } else { 179 | if fileInfo.header.NumDocs >= MaxDocsForFile(fileInfo) { 180 | newBits := uint(fileInfo.numVariableBits - 3) 181 | if newBits < 0 { 182 | newBits = 0 183 | } 184 | fileInfo, err = MakeFileInfo(fieldDir, value, newBits, docId) 185 | if err != nil { 186 | return nil, err 187 | } 188 | files = append(files, fileInfo) 189 | db.fields[key] = files 190 | } 191 | } 192 | 193 | if fileInfo.writer == nil { 194 | numOpenFiles += 1 195 | fd, err := os.OpenFile(fileInfo.path, os.O_RDWR, 0666) 196 | if err != nil { 197 | return nil, err 198 | } 199 | var header PostingListHeader 200 | err = binary.Read(fd, binary.LittleEndian, &header) 201 | if err != nil { 202 | return nil, err 203 | } 204 | fileInfo.header = &header 205 | writer, err := NewBitWriter(fd) 206 | if err != nil { 207 | return nil, err 208 | } 209 | fileInfo.writer = writer 210 | } 211 | return fileInfo, nil 212 | } 213 | 214 | func 
MakeFileInfo(fieldDir string, value float32, numVarBits uint, docId int64) (*FileInfo, error) { 215 | var fd *os.File 216 | var err error 217 | var header PostingListHeader 218 | 219 | scoreBits := math.Float32bits(value) 220 | minVal := math.Float32frombits((scoreBits >> numVarBits) << numVarBits) 221 | numFixedBits := 32 - numVarBits 222 | scoreBitString := fmt.Sprintf("%032b", int64(scoreBits)) 223 | fixedBits := scoreBitString[:numFixedBits] 224 | filename := path.Join(fieldDir, fixedBits) 225 | 226 | if Exists(filename) { 227 | numOpenFiles += 1 228 | fd, err = os.OpenFile(filename, os.O_RDWR, 0666) 229 | if err != nil { 230 | return nil, err 231 | } 232 | err = binary.Read(fd, binary.LittleEndian, &header) 233 | if err != nil { 234 | return nil, err 235 | } 236 | fd.Seek(0, 2) // Goto EOF (whence=2 means "relative to end") 237 | } else { 238 | numOpenFiles += 1 239 | fd, err = os.Create(filename) 240 | if err != nil { 241 | return nil, err 242 | } 243 | header = PostingListHeader{ 244 | Version: 1, 245 | MinVal: value, 246 | MaxVal: value, 247 | FirstDocId: docId, 248 | FirstDocScore: value, 249 | LastDocId: docId, 250 | NumDocs: 1, 251 | } 252 | err = binary.Write(fd, binary.LittleEndian, header) 253 | if err != nil { 254 | return nil, err 255 | } 256 | } 257 | if header.Version != 1 { 258 | return nil, errors.New("Incorrect file version") 259 | } 260 | writer, err := NewBitWriter(fd) 261 | if err != nil { 262 | return nil, err 263 | } 264 | return &FileInfo{ 265 | header: &header, 266 | writer: writer, 267 | path: filename, 268 | numVariableBits: numVarBits, 269 | minVal: minVal, 270 | }, nil 271 | } 272 | 273 | func WritePostingListEntry(fileInfo *FileInfo, docId int64, score float32) { 274 | header := fileInfo.header 275 | docIncr := docId - header.LastDocId 276 | 277 | if docIncr == 0 { 278 | // special case for first entry (it exists in the header, so do not write here) 279 | return 280 | } 281 | 282 | // header maintenance 283 | header.LastDocId = 
docId 284 | header.NumDocs += 1 285 | if score < header.MinVal { 286 | header.MinVal = score 287 | } 288 | if score > header.MaxVal { 289 | header.MaxVal = score 290 | } 291 | scoreBits := math.Float32bits(score) 292 | scoreMask := uint32(0xffffffff) >> (32 - fileInfo.numVariableBits) 293 | scoreRemainder := uint64(scoreBits & scoreMask) 294 | 295 | if scoreRemainder == 0 { 296 | fileInfo.writer.WriteVarUInt32(uint32(docIncr << 1)) 297 | } else { 298 | fileInfo.writer.WriteVarUInt32(uint32((docIncr << 1) | 1)) 299 | fileInfo.writer.WriteBits(scoreRemainder, fileInfo.numVariableBits) 300 | } 301 | 302 | } 303 | 304 | func (op *PostingListDocItr) Close() { 305 | if op.reader != nil { 306 | numOpenFiles -= 1 307 | err := op.reader.Close() 308 | if err != nil { 309 | panic(fmt.Sprintf("%v", err)) 310 | } 311 | } 312 | } 313 | 314 | func (op *PostingListDocItr) Next(minId int64) bool { 315 | reader := op.reader 316 | if reader == nil { 317 | if op.docId == -1 && minId <= op.header.FirstDocId { 318 | op.docId = op.header.FirstDocId 319 | op.score = op.header.FirstDocScore 320 | return true 321 | } else { 322 | //fmt.Printf("%08d Open @doc %08d %s\n", time.Now().UnixNano() % 100000000, minId, op.path) 323 | fd, err := os.OpenFile(op.path, os.O_RDONLY, 0) 324 | numOpenFiles += 1 325 | if err != nil { 326 | panic(fmt.Sprintf("%v", err)) 327 | } 328 | _, err = fd.Seek(HEADER_SIZE, 0) 329 | if err != nil { 330 | panic(fmt.Sprintf("%v", err)) 331 | } 332 | reader, err = NewBitReader(fd) 333 | if err != nil { 334 | panic(fmt.Sprintf("%v", err)) 335 | } 336 | op.reader = reader 337 | } 338 | } 339 | docId := op.docId 340 | for { 341 | if docId == op.maxDocId { 342 | return false 343 | } 344 | pair, err := reader.ReadVarUInt32() 345 | if err != nil { 346 | panic(fmt.Sprintf("%v", err)) 347 | } 348 | docIncr := pair >> 1 349 | var valueBits uint64 350 | if pair&1 == 1 { 351 | valueBits, err = reader.ReadBits(op.numVarBits) 352 | if err != nil { 353 | panic(fmt.Sprintf("%v", err)) 
354 | } 355 | } 356 | if docIncr == 0 { 357 | panic(fmt.Sprintf("Inconsistent file data @ %v %v", reader.MmapPtr*8, op.path)) 358 | } 359 | docId += int64(docIncr) 360 | if docId < minId { 361 | continue 362 | } 363 | score := math.Float32frombits(op.rangePrefix | uint32(valueBits)) 364 | op.docId = docId 365 | op.score = score 366 | return true 367 | } 368 | } 369 | 370 | func (db *FsScoreDb) BulkIndex(records []map[string]float32) ([]int64, error) { 371 | ids := make([]int64, len(records)) 372 | for idx, record := range records { 373 | docid := db.nextId 374 | db.nextId += 1 375 | for key, value := range record { 376 | fileInfo, err := FindPostingListFileForWrite(db, docid, key, value) 377 | if err != nil { 378 | return nil, err 379 | } 380 | WritePostingListEntry(fileInfo, docid, value) 381 | ids[idx] = docid 382 | } 383 | } 384 | CloseWriters(db) 385 | return ids, nil 386 | } 387 | 388 | func CloseWriters(db *FsScoreDb) error { 389 | for _, fieldIndex := range db.fields { 390 | for idx, fileInfo := range fieldIndex { 391 | writer := fileInfo.writer 392 | if writer == nil { 393 | continue 394 | } 395 | origPos, err := writer.File.Seek(0, 1) // save position to restore later 396 | if err != nil { 397 | return err 398 | } 399 | _, err = writer.File.Seek(0, 0) 400 | if err != nil { 401 | return err 402 | } 403 | err = binary.Write(writer.File, binary.LittleEndian, fileInfo.header) 404 | if err != nil { 405 | return err 406 | } 407 | _, err = writer.File.Seek(origPos, 0) 408 | if err != nil { 409 | return err 410 | } 411 | err = writer.Close() 412 | if err != nil { 413 | return err 414 | } 415 | numOpenFiles -= 1 416 | fieldIndex[idx].writer = nil 417 | } 418 | } 419 | return nil 420 | } 421 | 422 | func (db *FsScoreDb) Index(record map[string]float32) (int64, error) { 423 | docid := db.nextId 424 | db.nextId += 1 425 | for key, value := range record { 426 | fileInfo, err := FindPostingListFileForWrite(db, docid, key, value) 427 | if err != nil { 428 | return -1, 
err 429 | } 430 | WritePostingListEntry(fileInfo, docid, value) 431 | } 432 | CloseWriters(db) 433 | return docid, nil 434 | } 435 | 436 | func (db *FsScoreDb) FieldDocItr(fieldName string) DocItr { 437 | files, ok := db.fields[fieldName] 438 | if !ok { 439 | return NewMemoryScoreDocItr([]float32{}) 440 | } 441 | itrs := make([]DocItr, len(files)) 442 | for fileIdx, fileInfo := range files { 443 | itrs[fileIdx] = NewPostingListDocItr(math.Float32bits(fileInfo.minVal), fileInfo.path, fileInfo.header, fileInfo.numVariableBits) 444 | } 445 | return NewFieldDocItr(fieldName, itrs) 446 | } 447 | 448 | type PostingListDocItr struct { 449 | score float32 450 | docId int64 451 | maxDocId int64 452 | min, max float32 453 | numVarBits uint 454 | rangePrefix uint32 455 | path string 456 | reader *BitReader 457 | header *PostingListHeader 458 | } 459 | 460 | func NewPostingListDocItr(rangePrefix uint32, path string, header *PostingListHeader, numVarBits uint) DocItr { 461 | itr := &PostingListDocItr{ 462 | score: 0.0, 463 | docId: -1, 464 | maxDocId: header.LastDocId, 465 | min: header.MinVal, 466 | max: header.MaxVal, 467 | numVarBits: numVarBits, 468 | rangePrefix: rangePrefix, 469 | path: path, 470 | header: header, 471 | } 472 | return itr 473 | } 474 | 475 | func (op *PostingListDocItr) Name() string { return "PostingListDocItr" } 476 | func (op *PostingListDocItr) Cur() (int64, float32) { 477 | return op.docId, op.score 478 | } 479 | func (op *PostingListDocItr) GetBounds() (min, max float32) { 480 | return op.min, op.max 481 | } 482 | func (op *PostingListDocItr) SetBounds(min, max float32) bool { 483 | if min > op.min { 484 | op.min = min 485 | } 486 | if max < op.max { 487 | op.max = max 488 | } 489 | if op.min > op.max { 490 | return false 491 | } 492 | return true 493 | } 494 | -------------------------------------------------------------------------------- /fsscoredb_test.go: -------------------------------------------------------------------------------- 1 | 
// serializeIds renders a list of internal doc ids as a JSON array string.
func serializeIds(ids []int64) (string, error) {
	encoded, err := json.Marshal(ids)
	if err != nil {
		return "", err
	}
	return string(encoded), nil
}
else { 44 | return float32(f64), nil 45 | } 46 | } 47 | 48 | func (sds *ScoreDbServer) ServeHTTP(w http.ResponseWriter, req *http.Request) { 49 | p := req.URL.Path 50 | if p[0] == '/' { 51 | p = p[1:] 52 | } 53 | 54 | if req.Method == "PUT" && !sds.ReadOnly { 55 | 56 | b, err := ioutil.ReadAll(req.Body) 57 | if err != nil { 58 | http.Error(w, "Could not read request body", 400) 59 | return 60 | } 61 | var records []Record 62 | if len(p) > 0 { 63 | var values map[string]float32 64 | err = json.Unmarshal(b, &values) 65 | if err == nil { 66 | records = append(records, Record{Id: p, Values: values}) 67 | } 68 | } else { 69 | err = json.Unmarshal(b, &records) 70 | } 71 | if err != nil { 72 | http.Error(w, fmt.Sprintf("Could not parse json: %v", err), 400) 73 | return 74 | } 75 | err = sds.Db.BulkIndex(records) 76 | if err != nil { 77 | http.Error(w, "Could not index data", 500) 78 | return 79 | } 80 | 81 | } else if req.Method == "GET" && len(p) == 0 { 82 | 83 | queryParams := req.URL.Query() 84 | 85 | offset, err := QueryIntVal(queryParams, "offset", 0) 86 | if err != nil { 87 | http.Error(w, "Invalid value for offset", 400) 88 | return 89 | } 90 | 91 | limit, err := QueryIntVal(queryParams, "limit", 10) 92 | if err != nil { 93 | http.Error(w, "Invalid value for limit", 400) 94 | return 95 | } 96 | 97 | minScore, err := QueryFloatVal(queryParams, "minScore", float32(math.Inf(-1))) 98 | if err != nil { 99 | http.Error(w, "Invalid value for minscore", 400) 100 | return 101 | } 102 | 103 | scorerStrings, ok := queryParams["score"] 104 | if !ok || len(scorerStrings) == 0 { 105 | http.Error(w, "No score function was specified", 400) 106 | return 107 | } 108 | scorer := new([]interface{}) 109 | err = json.Unmarshal([]byte(scorerStrings[0]), scorer) 110 | if err != nil { 111 | http.Error(w, "Score parameter is not a valid JSON array", 400) 112 | return 113 | } 114 | 115 | query := Query{ 116 | Offset: offset, 117 | Limit: limit, 118 | MinScore: minScore, 119 | Scorer: 
*scorer, 120 | } 121 | 122 | results, err := sds.Db.Query(query) 123 | if err != nil { 124 | fmt.Printf("Internal error. %+v: %v\n", query, err) 125 | http.Error(w, "Internal Error in ScoreDB; please report", 500) 126 | return 127 | } 128 | response, err := json.Marshal(results) 129 | if err != nil { 130 | fmt.Printf("Internal error. %+v: %v\n", query, err) 131 | http.Error(w, "Internal Error in ScoreDB; please report", 500) 132 | return 133 | } 134 | fmt.Fprintf(w, "%s\n", response) 135 | 136 | } else { 137 | 138 | http.NotFound(w, req) 139 | return 140 | 141 | } 142 | } 143 | 144 | func ServeHttp(addr string, db Db, readOnly bool) error { 145 | scoreDbServer := ScoreDbServer{Db: db, ReadOnly: readOnly} 146 | return http.ListenAndServe(addr, &scoreDbServer) 147 | } 148 | -------------------------------------------------------------------------------- /memorydb.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | ) 7 | 8 | func NewMemoryIdDb() MemoryIdDb { 9 | return MemoryIdDb{make(map[int64]string)} 10 | } 11 | 12 | type MemoryIdDb struct { 13 | bindings map[int64]string 14 | } 15 | 16 | func (db MemoryIdDb) Put(scoreIds []int64, clientIds []string) error { 17 | for idx, scoreId := range scoreIds { 18 | db.bindings[scoreId] = clientIds[idx] 19 | } 20 | return nil 21 | } 22 | 23 | func (db MemoryIdDb) Get(scoreIds []int64) ([]string, error) { 24 | result := make([]string, len(scoreIds)) 25 | for idx, scoreId := range scoreIds { 26 | clientId, ok := db.bindings[scoreId] 27 | if !ok { 28 | return nil, fmt.Errorf("Unable to find client id for internal id %d", scoreId) 29 | 30 | } 31 | result[idx] = clientId 32 | } 33 | return result, nil 34 | } 35 | 36 | type MemoryScoreDb struct { 37 | Fields map[string][]float32 38 | nextId int64 39 | } 40 | 41 | func NewMemoryScoreDb() *MemoryScoreDb { 42 | return &MemoryScoreDb{ 43 | Fields: make(map[string][]float32), 44 | nextId: 1, 45 
| } 46 | } 47 | 48 | func (db *MemoryScoreDb) BulkIndex(records []map[string]float32) ([]int64, error) { 49 | fields := db.Fields 50 | ids := make([]int64, len(records)) 51 | for idx, record := range records { 52 | ids[idx] = db.nextId 53 | db.nextId += 1 54 | for key, value := range record { 55 | _, ok := fields[key] 56 | if !ok { 57 | fields[key] = make([]float32, 0, 64) 58 | } 59 | fields[key] = append(fields[key], value) 60 | } 61 | } 62 | return ids, nil 63 | } 64 | 65 | func (db *MemoryScoreDb) FieldDocItr(fieldName string) DocItr { 66 | scores := db.Fields[fieldName] 67 | return NewMemoryScoreDocItr(scores) 68 | } 69 | 70 | func NewMemoryScoreDocItr(scores []float32) *MemoryScoreDocItr { 71 | min, max := float32(math.Inf(1)), float32(math.Inf(-1)) 72 | for _, score := range scores { 73 | if score < min { 74 | min = score 75 | } 76 | if score > max { 77 | max = score 78 | } 79 | } 80 | return &MemoryScoreDocItr{ 81 | scores: scores, 82 | idx: -1, 83 | min: min, 84 | max: max, 85 | } 86 | } 87 | 88 | type MemoryScoreDocItr struct { 89 | scores []float32 90 | idx int 91 | min, max float32 92 | } 93 | 94 | func (op *MemoryScoreDocItr) Name() string { return "MemoryScoreDocItr" } 95 | func (op *MemoryScoreDocItr) Cur() (int64, float32) { 96 | idx := op.idx 97 | if idx < 0 || idx >= len(op.scores) { 98 | return -1, 0.0 99 | } 100 | return int64(idx + 1), op.scores[idx] 101 | 102 | } 103 | func (op *MemoryScoreDocItr) GetBounds() (min, max float32) { 104 | return op.min, op.max 105 | } 106 | func (op *MemoryScoreDocItr) SetBounds(min, max float32) bool { 107 | op.min = Max(op.min, min) 108 | op.max = Min(op.max, max) 109 | return true 110 | } 111 | 112 | func (op *MemoryScoreDocItr) Close() { 113 | } 114 | 115 | func (op *MemoryScoreDocItr) Next(minId int64) bool { 116 | if minId == 0 { 117 | minId = 1 118 | } 119 | op.idx = int(minId - 1) 120 | return op.idx < len(op.scores) 121 | } 122 | 
// MemoryDocItr iterates a fixed in-memory posting list: docs[i] (ascending
// doc ids) scored by scores[i]. Entries whose score falls outside the
// current [min, max] bounds are skipped during iteration.
type MemoryDocItr struct {
	score    float32
	docId    int64
	min, max float32

	scores []float32
	docs   []int64
	index  int
}

// NewMemoryDocItr creates an uninitialized iterator — Cur reports
// (-1, 0.0) until the first successful Next — with unbounded score limits.
func NewMemoryDocItr(scores []float32, docs []int64) *MemoryDocItr {
	return &MemoryDocItr{
		score:  0.0,
		docId:  -1,
		min:    float32(math.Inf(-1)),
		max:    float32(math.Inf(1)),
		scores: scores,
		docs:   docs,
		index:  -1,
	}
}

// Cur returns the current doc id and score.
func (op *MemoryDocItr) Cur() (int64, float32) {
	return op.docId, op.score
}

func (op *MemoryDocItr) GetBounds() (min, max float32) { return op.min, op.max }

// SetBounds narrows the score window; it returns false (leaving the
// iterator untouched) when the requested window is disjoint from the
// current one.
func (op *MemoryDocItr) SetBounds(min, max float32) bool {
	if min > op.max || max < op.min {
		return false
	}
	op.min = float32(math.Max(float64(op.min), float64(min)))
	op.max = float32(math.Min(float64(op.max), float64(max)))
	return true
}

func (op *MemoryDocItr) Name() string { return "MemoryDocItr" }

func (op *MemoryDocItr) Close() {}

// Next advances to the first entry whose doc id is >= minId and whose score
// lies within the current bounds, returning false once the list is
// exhausted.
func (op *MemoryDocItr) Next(minId int64) bool {
	for op.index++; op.index < len(op.docs); op.index++ {
		id := op.docs[op.index]
		if id < minId {
			continue
		}
		s := op.scores[op.index]
		if s < op.min || s > op.max {
			continue
		}
		op.docId = id
		op.score = s
		return true
	}
	return false
}
| func (op *MinDocItr) Name() string { return "MinDocItr" } 49 | func (op *MinDocItr) Cur() (int64, float32) { 50 | return op.docId, op.score 51 | } 52 | func (op *MinDocItr) GetBounds() (min, max float32) { return op.min, op.max } 53 | func (op *MinDocItr) Close() { 54 | for _, part := range op.parts { 55 | part.Close() 56 | } 57 | } 58 | 59 | func (op *MinDocItr) Next(minId int64) bool { 60 | min, max := op.min, op.max 61 | keepGoing := true 62 | var score float32 63 | for keepGoing { 64 | //fmt.Printf("MinDocItr Next itr (%v) [%v:%v] = %v score:%v\n", minId, op.min, op.max, op.docId, score) 65 | keepGoing = false 66 | score = PositiveInfinity 67 | for _, part := range op.parts { 68 | var curDocId int64 69 | var curScore float32 70 | for { 71 | curDocId, curScore = part.Cur() 72 | if curDocId >= minId { 73 | break 74 | } 75 | if !part.Next(minId) { 76 | return false 77 | } 78 | } 79 | if curDocId > minId { 80 | minId = curDocId 81 | keepGoing = true 82 | break 83 | } 84 | score = Min(score, curScore) 85 | } 86 | if !keepGoing { 87 | if score < min || score > max { 88 | minId += 1 89 | keepGoing = true 90 | } 91 | } 92 | } 93 | op.docId = minId 94 | op.score = score 95 | //fmt.Printf("MinDocItr Next(%v) [%v:%v] = %v score:%v\n", minId, op.min, op.max, op.docId, score) 96 | return true 97 | } 98 | 99 | func (op *MinDocItr) SetBounds(min, max float32) bool { 100 | fmt.Printf("MinDocItr SetBounds %v %v\n", min, max) 101 | op.min = min 102 | for _, component := range op.parts { 103 | curMin, curMax := component.GetBounds() 104 | if curMin < min { 105 | //fmt.Printf("MinDocItr SetBounds for component %v %v\n", min, curMax) 106 | if !component.SetBounds(min, curMax) { 107 | return false 108 | } 109 | } 110 | } 111 | return true 112 | } 113 | -------------------------------------------------------------------------------- /powdocitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "math" 5 | ) 6 
| 7 | // Takes a constant power of a value. 8 | // Important: for bounds caluclation reasons, assumes only positive values are provided as inputs! 9 | type PowDocItr struct { 10 | exp, oneOverExp float32 11 | itr DocItr 12 | } 13 | 14 | func NewPowDocItr(itr DocItr, exp float32) *PowDocItr { 15 | return &PowDocItr{exp: exp, oneOverExp: 1.0 / exp, itr: itr} 16 | } 17 | 18 | func Pow(val, exp float32) float32 { 19 | return float32(math.Pow(float64(val), float64(exp))) 20 | } 21 | 22 | func (op *PowDocItr) Name() string { return "PowDocItr" } 23 | func (op *PowDocItr) Cur() (int64, float32) { 24 | docId, score := op.itr.Cur() 25 | return docId, Pow(score, op.exp) 26 | } 27 | func (op *PowDocItr) Close() { 28 | op.itr.Close() 29 | } 30 | func (op *PowDocItr) Next(minId int64) bool { 31 | ret := op.itr.Next(minId) 32 | return ret 33 | } 34 | func (op *PowDocItr) GetBounds() (min, max float32) { 35 | exp := op.exp 36 | min, max = op.itr.GetBounds() 37 | v1 := Pow(min, exp) 38 | v2 := Pow(max, exp) 39 | if v1 < v2 { 40 | return v1, v2 41 | } else { 42 | return v2, v1 43 | } 44 | } 45 | func (op *PowDocItr) SetBounds(min, max float32) bool { 46 | min = Max(0, min) 47 | max = Max(0, max) 48 | oneOverExp := op.oneOverExp 49 | v1 := Pow(min, oneOverExp) 50 | v2 := Pow(max, oneOverExp) 51 | if v1 < v2 { 52 | return op.itr.SetBounds(v1, v2) 53 | } else { 54 | return op.itr.SetBounds(v2, v1) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /productdocitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "sort" 5 | ) 6 | 7 | type ProductComponents []DocItr 8 | 9 | func (a ProductComponents) Len() int { return len(a) } 10 | func (a ProductComponents) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 11 | func (a ProductComponents) Less(i, j int) bool { 12 | min1, max1 := a[i].GetBounds() 13 | min2, max2 := a[j].GetBounds() 14 | return max1-min1 > max2-min2 15 | } 
// ProductDocItr combines several component iterators by multiplying their
// scores together. Bounds math assumes positive component scores (see the
// note in NewProductDocItr).
type ProductDocItr struct {
	score float32
	docId int64
	min, max float32
	parts ProductComponents
}

// NewProductDocItr builds a product iterator over itrs. Overall bounds are
// the products of the component bounds, and the components are sorted
// (widest score range first) so the most selective iterator is consulted
// first in Next.
func NewProductDocItr(itrs []DocItr) *ProductDocItr {
	min, max := float32(0.0), float32(0.0)
	components := make(ProductComponents, len(itrs))
	for idx, part := range itrs {
		curMin, curMax := part.GetBounds()
		//fmt.Printf("Init %v %v %v\n", idx, curMin, curMax)
		components[idx] = part
		if idx == 0 {
			min, max = curMin, curMax
		} else {
			// assumes positive inputs:
			min *= curMin
			max *= curMax
		}
	}
	sort.Sort(components)
	return &ProductDocItr{
		score: 0.0,
		docId: -1,
		min:   min,
		max:   max,
		parts: components,
	}
}

// Name identifies this iterator type.
func (op *ProductDocItr) Name() string { return "ProductDocItr" }

// Cur returns the current document id and its product score.
func (op *ProductDocItr) Cur() (int64, float32) {
	return op.docId, op.score
}

// GetBounds reports the currently known [min, max] score range.
func (op *ProductDocItr) GetBounds() (min, max float32) { return op.min, op.max }

// Close releases all component iterators.
func (op *ProductDocItr) Close() {
	for _, part := range op.parts {
		part.Close()
	}
}

// Next advances all components to a common document id >= minId and
// multiplies their scores, skipping documents whose product falls outside
// [op.min, op.max]. Returns false once any component is exhausted.
func (op *ProductDocItr) Next(minId int64) bool {
	min, max := op.min, op.max
	keepGoing := true
	var score float32
	for keepGoing {
		//fmt.Printf("ProductDocItr Next itr (%v) [%v:%v] = %v score:%v\n", minId, op.min, op.max, op.docId, score)
		keepGoing = false
		score = float32(1.0)
		for _, part := range op.parts {
			var curDocId int64
			var curScore float32
			for {
				// Advance this component until it reaches or passes minId.
				curDocId, curScore = part.Cur()
				if curDocId >= minId {
					break
				}
				if !part.Next(minId) {
					return false
				}
			}
			if curDocId > minId {
				// This component skipped past minId; restart the scan there.
				minId = curDocId
				keepGoing = true
				break
			}
			score *= curScore
		}
		if !keepGoing {
			if score < min || score > max {
				// Product out of bounds; try the next document id.
				minId += 1
				keepGoing = true
			}
		}
	}
	op.docId = minId
	op.score = score
	//fmt.Printf("ProductDocItr Next(%v) [%v:%v] = %v score:%v\n", minId, op.min, op.max, op.docId, score)
	return true
}

// SetBounds narrows the score range of interest to [min, max] and derives
// per-component bounds by dividing out the other components' extremes
// (newMin/newMax accumulate across the inner loop, i.e. they are divided by
// the product of the others' max/min respectively).
func (op *ProductDocItr) SetBounds(min, max float32) bool {
	//fmt.Printf("ProductDocItr SetBounds %v %v\n", min, max)
	op.min = min
	op.max = max

	for curfield, component := range op.parts {
		newMin, newMax := min, max
		for otherfactor, otherComponent := range op.parts {
			// Then divide by the other maxes or mins
			if curfield != otherfactor {
				otherMin, otherMax := otherComponent.GetBounds()
				if otherMax == 0.0 {
					// Avoid dividing by zero; no lower constraint derivable.
					newMin = 0.0
				} else {
					newMin /= otherMax
				}
				if otherMin == 0.0 {
					// Avoid dividing by zero; no upper constraint derivable.
					newMax = PositiveInfinity
				} else {
					newMax /= otherMin
				}
			}
		}
		// Clamp the derived range to the component's own bounds.
		curMin, curMax := component.GetBounds()
		if newMin < curMin {
			newMin = curMin
		}
		if newMax > curMax {
			newMax = curMax
		}
		if newMin != curMin || newMax != curMax {
			//fmt.Printf("ProductDocItr SetBounds for component %v %v\n", newMin, newMax)
			component.SetBounds(newMin, newMax)
		}
	}
	return true
}
-------------------------------------------------------------------------------- /scale_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pschanely/scoredb/57beea075b4b5a53ee0a27b9752a0ca544c4510d/scale_performance.png -------------------------------------------------------------------------------- /scaledocitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import () 4 | 5 | // Multiplies a value by a constant 6 | type ScaleDocItr struct { 7 | factor float32 8 | docItr DocItr 9 | } 10 | 11 | func (op *ScaleDocItr) Name() string { return "ScaleDocItr" } 12 | func (op *ScaleDocItr) Cur() (int64, float32) { 13 | docId, score := op.docItr.Cur() 14 | return docId, score * op.factor 15 | } 16 | func (op *ScaleDocItr) GetBounds() (min, max float32) { 17 | min, max = op.docItr.GetBounds() 18 | factor := op.factor 19 | if factor >= 0 { 20 | return min * op.factor, max * op.factor 21 | } else { 22 | return max * op.factor, min * op.factor 23 | } 24 | } 25 | func (op *ScaleDocItr) Close() { 26 | op.docItr.Close() 27 | } 28 | func (op *ScaleDocItr) Next(minId int64) bool { 29 | return op.docItr.Next(minId) 30 | } 31 | 32 | func (op *ScaleDocItr) SetBounds(min, max float32) bool { 33 | factor := op.factor 34 | if factor >= 0 { 35 | return op.docItr.SetBounds(min/op.factor, max/op.factor) 36 | } else { 37 | return op.docItr.SetBounds(max/op.factor, min/op.factor) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /scoredb/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "encoding/json" 6 | "flag" 7 | "fmt" 8 | "github.com/pschanely/scoredb" 9 | "log" 10 | "os" 11 | "path" 12 | "runtime" 13 | "strings" 14 | "time" 15 | ) 16 | 17 | func MakeStandardDb(dataDir string, numShards int) 
(*scoredb.BaseDb, error) { 18 | var shards []scoredb.StreamingDb 19 | 20 | if scoredb.Exists(dataDir) && scoredb.Exists(path.Join(dataDir, "shard.0")) { 21 | i := 0 22 | shards = make([]scoredb.StreamingDb, 0, numShards) 23 | for { 24 | shardDir := path.Join(dataDir, fmt.Sprintf("shard.%d", i)) 25 | if scoredb.Exists(shardDir) { 26 | shards = append(shards, scoredb.BaseStreamingDb{scoredb.NewFsScoreDb(shardDir)}) 27 | } else { 28 | break 29 | } 30 | i += 1 31 | } 32 | } else { 33 | shards = make([]scoredb.StreamingDb, numShards) 34 | for i := range shards { 35 | shardDir := path.Join(dataDir, fmt.Sprintf("shard.%d", i)) 36 | shards[i] = scoredb.BaseStreamingDb{scoredb.NewFsScoreDb(shardDir)} 37 | } 38 | } 39 | idDb, err := scoredb.NewBoltIdDb(path.Join(dataDir, "iddb")) 40 | if err != nil { 41 | return nil, err 42 | } 43 | return &scoredb.BaseDb{ 44 | StreamingDb: scoredb.ShardedDb{ 45 | Shards: shards, 46 | }, 47 | IdDb: idDb, 48 | }, nil 49 | } 50 | 51 | func watchDir(db *scoredb.MigratableDb, baseDir string, namePrefix string) { 52 | log.Printf("Watching for databases at %s%s*\n", baseDir, namePrefix) 53 | var lastName = "" 54 | for { 55 | dir, err := os.Open(baseDir) 56 | var fileInfos []os.FileInfo 57 | if err == nil { 58 | fileInfos, err = dir.Readdir(0) 59 | dir.Close() 60 | } 61 | if err != nil { 62 | log.Printf("Unable to read %v: %v\n", dir, err) 63 | time.Sleep(55 * time.Second) 64 | } else { 65 | var newDbName = "" 66 | for _, fileInfo := range fileInfos { 67 | name := fileInfo.Name() 68 | if strings.HasPrefix(name, namePrefix) { 69 | if name > newDbName { 70 | newDbName = name 71 | } 72 | } 73 | } 74 | if newDbName > lastName { 75 | fmt.Printf("Detected database at %s%s\n", baseDir, newDbName) 76 | fullDbName := path.Join(baseDir, newDbName) 77 | newDb, err := MakeStandardDb(fullDbName, 1) 78 | if err != nil { 79 | log.Printf("Unable to load database at %s%s (%v); ignoring\n", dir, fullDbName, err) 80 | } else { 81 | fmt.Printf("The database at %s%s is 
live at %v\n", baseDir, fullDbName, time.Now().Unix()) 82 | db.Current = newDb 83 | lastName = newDbName 84 | } 85 | } 86 | } 87 | time.Sleep(10 * time.Second) 88 | } 89 | } 90 | 91 | func SetupDirLoading(databaseDir string) *scoredb.MigratableDb { 92 | migratable := scoredb.MigratableDb{Current: nil} 93 | baseDir, namePrefix := path.Split(databaseDir) 94 | fmt.Printf("Watching for new databases named %s* in %s\n", namePrefix, baseDir) 95 | go watchDir(&migratable, baseDir, namePrefix) 96 | return &migratable 97 | } 98 | 99 | func main() { 100 | 101 | serveCommand := flag.NewFlagSet("serve", flag.ExitOnError) 102 | servePort := serveCommand.Int("port", 11625, "listening port in http mode, defaults to 11625") 103 | serveIntf := serveCommand.String("interface", "", "network interface to listen on in http mode, defaults to empty string (any interface)") 104 | serveDataDir := serveCommand.String("datadir", "./data", "Storage directory for database") 105 | serveNumShards := serveCommand.Int("numshards", 4, "Number of shards") 106 | serveReadOnly := serveCommand.Bool("readonly", false, "Only allow GET requests") 107 | serveAutoMigrate := serveCommand.Bool("automigrate", false, "When new directories appear matching *, atomically swap in the database at that directory. 
(lexigraphically last)") 108 | 109 | loadCommand := flag.NewFlagSet("load", flag.ExitOnError) 110 | loadDataDir := loadCommand.String("datadir", "./data", "Storage directory for database") 111 | loadNumShards := loadCommand.Int("numshards", 4, "Number of shards (ignored if db already exists)") 112 | 113 | benchCommand := flag.NewFlagSet("benchmark", flag.ExitOnError) 114 | benchCsvFilename := benchCommand.String("csv", "", "csv filename of census data") 115 | benchMaxRecords := benchCommand.Int64("maxrecords", 1000*1000, "Maximum size of database to benchmark (in # of records)") 116 | benchCsvOutput := benchCommand.String("out", "output.csv", "csv of performance data to output") 117 | benchEsUrl := benchCommand.String("esurl", "http://localhost:9200/", "URL of elasticsearch instance") 118 | benchEsIndex := benchCommand.String("esindex", "benchmark_scoredb", "Index name to use for elasticsearch") 119 | benchFsDataDir := benchCommand.String("fsdatadir", "./benchmark_data", "Storage directory for native scoredb database") 120 | 121 | /* 122 | for cmd := range([]*flag.FlagSet{serveCommand, benchCommand}) { 123 | // common args here 124 | } 125 | */ 126 | 127 | if len(os.Args) <= 1 { 128 | fmt.Println("usage: scoredb []") 129 | fmt.Println("Commands:") 130 | fmt.Println(" serve Run a scoredb server") 131 | fmt.Println(" load Load json lines from stdin") 132 | fmt.Println(" benchmark Run performance benchmarks") 133 | fmt.Println("For more help, run scoredb -h") 134 | os.Exit(1) 135 | } 136 | var db scoredb.Db 137 | var err error 138 | switch os.Args[1] { 139 | case "serve": 140 | serveCommand.Parse(os.Args[2:]) 141 | if *serveAutoMigrate { 142 | db = SetupDirLoading(*serveDataDir) 143 | } else { 144 | db, err = MakeStandardDb(*serveDataDir, *serveNumShards) 145 | if err != nil { 146 | log.Fatalf("Failed to initialize database at %v: %v\n", *serveDataDir, err) 147 | } 148 | } 149 | addr := fmt.Sprintf("%s:%d", *serveIntf, *servePort) 150 | fmt.Printf("Serving on %s\n", 
addr) 151 | log.Fatal(scoredb.ServeHttp(addr, db, *serveReadOnly)) 152 | case "load": 153 | loadCommand.Parse(os.Args[2:]) 154 | db, err := MakeStandardDb(*loadDataDir, *loadNumShards) 155 | if err != nil { 156 | log.Fatal(fmt.Sprintf("Failed to initialize database at %v: %v\n", *loadDataDir, err)) 157 | } 158 | scanner := bufio.NewScanner(os.Stdin) 159 | batchSize := 200 160 | batchIndex := 0 161 | var batch = make([]scoredb.Record, batchSize) 162 | for scanner.Scan() { 163 | record := scoredb.Record{} 164 | line := scanner.Bytes() 165 | json.Unmarshal(line, &record) 166 | batch[batchIndex] = record 167 | batchIndex += 1 168 | if batchIndex >= batchSize { 169 | db.BulkIndex(batch) 170 | batchIndex = 0 171 | batch = make([]scoredb.Record, batchSize) 172 | } 173 | } 174 | if batchIndex > 0 { 175 | db.BulkIndex(batch[:batchIndex]) 176 | } 177 | case "benchmark": 178 | outputFd, err := os.Create(*benchCsvOutput) 179 | if err != nil { 180 | log.Fatal(fmt.Sprintf("Failed to output output csv file at %v: %v\n", *benchCsvOutput, err)) 181 | } 182 | 183 | runtime.GOMAXPROCS(runtime.NumCPU()) 184 | benchCommand.Parse(os.Args[2:]) 185 | esDb := &scoredb.EsScoreDb{BaseURL: *benchEsUrl, Index: *benchEsIndex} 186 | fsDb, err := MakeStandardDb(*benchFsDataDir, 4) 187 | if err != nil { 188 | log.Fatal(fmt.Sprintf("Failed to initialize database at %v: %v\n", *benchFsDataDir, err)) 189 | } 190 | if !scoredb.Exists(*benchCsvFilename) { 191 | log.Fatal(fmt.Sprintf("Cannot find source csv data file at '%s'", *benchCsvFilename)) 192 | } 193 | 194 | fmt.Printf("Running es benchmarks\n") 195 | esDb.DeleteIndex() 196 | esDb.CreateIndex() 197 | counts, esIndexTimes, esQueryTimes, err := scoredb.RunBenchmark(esDb, *benchCsvFilename, *benchMaxRecords) 198 | //esDb.DeleteIndex() 199 | if err != nil { 200 | log.Fatal(fmt.Sprintf("Failed to run es benchmark: %v\n", err)) 201 | } 202 | 203 | fmt.Printf("Running native benchmarks\n") 204 | _, fsIndexTimes, fsQueryTimes, err := 
scoredb.RunBenchmark(fsDb, *benchCsvFilename, *benchMaxRecords) 205 | if err != nil { 206 | log.Fatal(fmt.Sprintf("Failed to run native benchmark: %v\n", err)) 207 | } 208 | 209 | fmt.Fprintf(outputFd, "records,es_index,native_index,es_query_1,native_query_1,es_query_2,native_query_2\n") 210 | for idx := 0; idx < len(esIndexTimes); idx++ { 211 | fmt.Fprintf(outputFd, "%v,%v,%v", counts[idx], esIndexTimes[idx], fsIndexTimes[idx]) 212 | for idx2 := 0; idx2 < len(esQueryTimes[idx]); idx2++ { 213 | fmt.Fprintf(outputFd, ",%v,%v", esQueryTimes[idx][idx2], fsQueryTimes[idx][idx2]) 214 | } 215 | fmt.Fprintf(outputFd, "\n") 216 | } 217 | outputFd.Close() 218 | default: 219 | fmt.Printf("%q is not valid command.\n", os.Args[1]) 220 | os.Exit(2) 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /shardeddb.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "math/rand" 7 | ) 8 | 9 | type ShardedDb struct { 10 | Shards []StreamingDb 11 | } 12 | 13 | var reservedShardBits = uint(14) 14 | 15 | func NewShardedDb(shards []StreamingDb) (*ShardedDb, error) { 16 | maxShards := (1 << reservedShardBits) - 1 17 | if len(shards) >= 1<= bounds.max { 82 | continue 83 | } 84 | resultChannel <- CandidateResult{DocId: docId, Score: score, WorkerNum: myWorkerNum} 85 | /* 86 | select { 87 | case newBounds, ok := <- boundsChannel: 88 | if ok { 89 | if bounds != newBounds { 90 | bounds = newBounds 91 | itr.SetBounds(bounds.min, bounds.max) 92 | } 93 | } 94 | } 95 | */ 96 | 97 | newBounds := <-boundsChannel 98 | 99 | if bounds != newBounds { 100 | bounds = newBounds 101 | itr.SetBounds(bounds.min, bounds.max) 102 | } 103 | 104 | } 105 | itr.Close() 106 | resultChannel <- CandidateResult{DocId: -1} 107 | } 108 | 109 | func NewParallelDocItr(parts []DocItr) *ParallelDocItr { 110 | op := ParallelDocItr{ 111 | score: 0.0, 112 | docId: -1, 113 | NumAlive: 
// Name identifies this iterator type.
func (op *ParallelDocItr) Name() string {
	return "ParallelDocItr"
}

// SetBounds records the new score range; each worker goroutine picks it up
// the next time it reports a candidate (see Next, which echoes op.Bounds
// back on the worker's channel).
func (op *ParallelDocItr) SetBounds(min, max float32) bool {
	op.Bounds.min, op.Bounds.max = min, max
	return true
}

// GetBounds reports the currently known [min, max] score range.
func (op *ParallelDocItr) GetBounds() (min, max float32) {
	return op.Bounds.min, op.Bounds.max
}

// Next receives candidate results from the worker goroutines until one
// falls strictly inside the current bounds. A DocId of -1 is a worker's
// exhaustion signal; Next returns false once all workers have finished.
// After every candidate (accepted or not), the current bounds are sent back
// to the reporting worker so it can tighten its own iterator. Note the
// minId argument is not used here: ordering is driven by the workers.
func (op *ParallelDocItr) Next(minId int64) bool {
	for {
		result := <-op.ResultChannel
		if result.DocId == -1 {
			op.NumAlive -= 1
			if op.NumAlive <= 0 {
				return false
			}
		} else {
			workerNum := result.WorkerNum
			if result.Score > op.Bounds.min && result.Score < op.Bounds.max {
				// Translate the shard-local doc id into the external id space.
				op.docId = ShardIdToExt(result.DocId, workerNum)
				op.score = result.Score
				op.Comms[workerNum] <- op.Bounds
				return true
			} else {
				// Out-of-bounds candidate: just release the worker.
				op.Comms[workerNum] <- op.Bounds
			}
		}
	}
}

// Close is a no-op: the worker goroutines exit on their own when their
// iterators are exhausted. NOTE(review): workers blocked sending on
// ResultChannel appear to leak if the consumer stops calling Next early --
// confirm.
func (op *ParallelDocItr) Close() {} // unsure...
166 | 167 | func (op *ParallelDocItr) Cur() (int64, float32) { 168 | return op.docId, op.score 169 | } 170 | -------------------------------------------------------------------------------- /shardeddb_test.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestShardedDb(t *testing.T) { 8 | pathmaker := RmAllTestData() 9 | defer RmAllTestData() 10 | idDb, err := NewBoltIdDb(pathmaker("shard_ids")) 11 | if err != nil { 12 | t.Fatal(err) 13 | } 14 | db := BaseDb{ 15 | StreamingDb: ShardedDb{ 16 | Shards: []StreamingDb{ 17 | BaseStreamingDb{NewFsScoreDb(pathmaker("shard_1"))}, 18 | BaseStreamingDb{NewFsScoreDb(pathmaker("shard_2"))}, 19 | }, 20 | }, 21 | IdDb: idDb, 22 | } 23 | DbBasicsTest(db, t) 24 | } 25 | -------------------------------------------------------------------------------- /stub.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | type StubDb struct { 4 | idx int64 5 | } 6 | 7 | func (sdb *StubDb) Index(record map[string]float32) (int64, error) { 8 | sdb.idx += 1 9 | return sdb.idx, nil 10 | } 11 | 12 | func (sdb *StubDb) BulkIndex(records []map[string]float32) ([]int64, error) { 13 | ids := make([]int64, len(records)) 14 | for i, _ := range records { 15 | sdb.idx++ 16 | ids[i] = sdb.idx 17 | } 18 | return ids, nil 19 | } 20 | 21 | func (db *StubDb) Query(query Query) (QueryResult, error) { 22 | return QueryResult{Ids: []string{"7", "42"}}, nil 23 | } 24 | 25 | func (db *StubDb) LinearQuery(numResults int, coefs map[string]float32) []string { 26 | return []string{"7", "42"} 27 | } 28 | -------------------------------------------------------------------------------- /sumdocitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "math" 5 | "sort" 6 | ) 7 | 8 | type SumComponent struct { 9 | docItr DocItr 10 | scoreRange 
// SumComponents sorts components by descending score range so the
// component with the most influence on the total is visited first.
type SumComponents []SumComponent

func (a SumComponents) Len() int           { return len(a) }
func (a SumComponents) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a SumComponents) Less(i, j int) bool { return a[i].scoreRange > a[j].scoreRange }

// SumDocItr combines several component iterators by summing their scores.
type SumDocItr struct {
	score    float32
	docId    int64
	min, max float32
	parts    []SumComponent
}

// NewSumDocItr builds a sum iterator over itrs. Overall bounds are the sums
// of the component bounds, and components are sorted widest-range first.
func NewSumDocItr(itrs []DocItr) *SumDocItr {
	min, max := float32(0.0), float32(0.0)
	components := make(SumComponents, len(itrs))
	for idx, part := range itrs {
		curMin, curMax := part.GetBounds()
		components[idx].docItr = part
		components[idx].scoreRange = float32(math.Abs(float64(curMax - curMin)))
		min += curMin
		max += curMax
	}
	sort.Sort(components)
	return &SumDocItr{
		score: 0.0,
		docId: -1,
		min:   min,
		max:   max,
		parts: components,
	}
}

// Name identifies this iterator type.
func (op *SumDocItr) Name() string { return "SumDocItr" }

// Cur returns the current document id and its summed score.
func (op *SumDocItr) Cur() (int64, float32) {
	return op.docId, op.score
}

// GetBounds reports the currently known [min, max] score range.
func (op *SumDocItr) GetBounds() (min, max float32) { return op.min, op.max }

// Close releases all component iterators.
func (op *SumDocItr) Close() {
	for _, part := range op.parts {
		part.docItr.Close()
	}
}

// Next advances all components to a common document id >= minId and sums
// their scores, skipping documents whose total falls outside [op.min,
// op.max]. Returns false once any component is exhausted.
func (op *SumDocItr) Next(minId int64) bool {
	min, max := op.min, op.max
	keepGoing := true
	var score float32
	for keepGoing {
		keepGoing = false
		score = float32(0.0)
		for _, part := range op.parts {
			var curDocId int64
			var curScore float32
			for {
				// Advance this component until it reaches or passes minId.
				curDocId, curScore = part.docItr.Cur()
				if curDocId >= minId {
					break
				}
				if !part.docItr.Next(minId) {
					return false
				}
			}
			if curDocId > minId {
				// This component skipped past minId; restart the scan there.
				minId = curDocId
				keepGoing = true
				break
			}
			score += curScore
		}
		if !keepGoing {
			if score < min || score > max {
				// Total out of bounds; try the next document id.
				minId += 1
				keepGoing = true
			}
		}
	}
	op.docId = minId
	op.score = score
	//fmt.Printf("SumDocItr Next(%v) [%v:%v] = %v score:%v\n", minId, op.min, op.max, op.docId, score)
	return true
}

// SetBounds narrows the score range of interest to [min, max] and derives
// per-component bounds by subtracting the other components' extremes.
func (op *SumDocItr) SetBounds(min, max float32) bool {
	//fmt.Printf("SumDocItr SetBounds %v %v\n", min, max)
	op.min = min
	op.max = max

	for curfield, component := range op.parts {
		newMin, newMax := min, max
		// subtract out the ranges of all the other components (the remaining range will be mine)
		for otherfactor, otherComponent := range op.parts {
			//Then subtract the other maxes or mins
			if curfield != otherfactor {
				otherMin, otherMax := otherComponent.docItr.GetBounds()
				newMin -= otherMax
				newMax -= otherMin
			}
		}
		// Clamp the derived range to the component's own bounds.
		curMin, curMax := component.docItr.GetBounds()
		if newMin < curMin {
			newMin = curMin
		}
		if newMax > curMax {
			newMax = curMax
		}
		if newMin != curMin || newMax != curMax {
			//fmt.Printf("SumDocItr SetBounds for component %v %v\n", newMin, newMax)
			component.docItr.SetBounds(newMin, newMax)
		}
	}
	return true
}