├── .gitignore
├── LICENSE
├── README.md
├── bench.go
├── bitreader.go
├── bitreader_test.go
├── boltiddb.go
├── bucket_execution.png
├── customlineardocitr.go
├── customlineardocitr_test.go
├── custommapdocitr.go
├── custommapdocitr_test.go
├── dataset_tools
│   ├── census_p_rec_gen.sh
│   └── sample.csv
├── db.go
├── db_test.go
├── diffdocitr.go
├── docitr.go
├── elastic.go
├── fielddocitr.go
├── fielddocitr_test.go
├── fsscoredb.go
├── fsscoredb_test.go
├── http.go
├── memorydb.go
├── memorydb_test.go
├── memorydocitr.go
├── migratabledb.go
├── mindocitr.go
├── powdocitr.go
├── productdocitr.go
├── productdocitr_test.go
├── scale_performance.png
├── scaledocitr.go
├── scoredb
│   └── main.go
├── shardeddb.go
├── shardeddb_test.go
├── stub.go
└── sumdocitr.go
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
2 | *.o
3 | *.a
4 | *.so
5 |
6 | # Folders
7 | _obj
8 | _test
9 | data
10 |
11 | # Architecture specific extensions/prefixes
12 | *.[568vq]
13 | [568vq].out
14 |
15 | *.cgo1.go
16 | *.cgo2.c
17 | _cgo_defun.c
18 | _cgo_gotypes.go
19 | _cgo_export.*
20 |
21 | _testmain.go
22 |
23 | *.exe
24 | *.test
25 | *.prof
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Phillip Schanely
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # scoredb
2 |
3 | A simple database index optimized for returning results by custom scoring functions.
4 |
5 | To my knowledge, it is the only open source system with an algorithm designed for this purpose; in some cases, it is faster than elasticsearch's implementation by an order of magnitude. (see below)
6 |
7 | # Why?
8 |
9 | Scoredb is optimized for systems that want to find the top scoring results, where the scoring function is specified by the client,
10 | and may depend on more than one field.
11 | It may be a good choice for any system that needs to incorporate multiple factors when returning results.
12 | For instance, it might power a used car website to produce results based on factors like mileage, year, and distance.
13 |
14 |
15 | # Run It
16 |
17 | Though Scoredb has a straightforward programmatic interface, you can run a simple standalone HTTP server like so:
18 |
19 | ```
20 | $ go get github.com/pschanely/scoredb
21 | $ go install github.com/pschanely/scoredb/...
22 | $ ${GOPATH}/bin/scoredb serve -datadir my_data_directory -port 11625
23 | ```
24 | ... and in another shell:
25 | ```
26 | # insert some people with ages and weights
27 | $ curl -XPUT http://localhost:11625/jim -d '{"age":21, "weight":170}'
28 | $ curl -XPUT http://localhost:11625/bob -d '{"age":34, "weight":150}'
29 |
30 | # get people by age
31 | $ curl -G 'http://localhost:11625' --data-urlencode 'score=["field", "age"]'
32 | {"Ids":["bob","jim"]}
33 |
34 | # get people by the sum of their age and weight:
35 | $ curl -G 'http://localhost:11625' --data-urlencode 'score=["sum", ["field", "age"], ["field", "weight"]]'
36 | {"Ids":["jim","bob"]}
37 | ```
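
Scoredb can also be embedded in a Go program. A minimal sketch against the `Db` interface defined in `db.go` (any implementation satisfies it; constructor wiring omitted):

```go
package example

import "github.com/pschanely/scoredb"

// IndexAndQuery indexes two records, then asks for the top ten ids
// ranked by the sum of age and weight.
func IndexAndQuery(db scoredb.Db) ([]string, error) {
	if err := db.Index("jim", map[string]float32{"age": 21, "weight": 170}); err != nil {
		return nil, err
	}
	if err := db.Index("bob", map[string]float32{"age": 34, "weight": 150}); err != nil {
		return nil, err
	}
	result, err := db.Query(scoredb.Query{
		Limit: 10,
		Scorer: []interface{}{"sum",
			[]interface{}{"field", "age"},
			[]interface{}{"field", "weight"}},
	})
	return result.Ids, err
}
```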
38 |
39 |
40 | # The Algorithm
41 |
42 | Scoredb uses a format on disk that is very similar to that used by text search systems like solr and elasticsearch.
43 | We divide each field into ranges of values (buckets) and, for each bucket, maintain a file containing the IDs of objects that have their value inside that range.
44 |
45 | The IDs in each file are strictly increasing; this means we can traverse many buckets at once efficiently, using a heap of buckets to repeatedly find the next smallest ID.
46 |
47 | As we traverse the buckets, we score the objects produced and put them into a candidate result set. The result set is capped at the `&limit=` parameter specified by the user. As poorly scoring results get kicked out of the candidate result set, we can infer a lower bound on the final score. With some math, we can propagate that lower bound backwards through the scoring function to infer bounds on the individual fields. These bounds may then be used to stop traversing very poorly scoring buckets that could not produce a good enough final score. In this manner, as the candidate result set gets better and better, the system can eliminate more and more buckets to arrive at a result very quickly.
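
For a score that is a sum of fields, that backwards propagation is simple arithmetic. A sketch of the idea (not the actual implementation):

```go
// fieldLowerBounds: given the worst score still held in the candidate
// result set (minTotal) and each field's known maximum value (maxes),
// return the smallest value each field could take while still letting
// the total reach minTotal. Buckets entirely below a field's bound
// cannot contribute a qualifying result and can be skipped.
func fieldLowerBounds(minTotal float32, maxes []float32) []float32 {
	var sumOfMaxes float32
	for _, m := range maxes {
		sumOfMaxes += m
	}
	bounds := make([]float32, len(maxes))
	for i, m := range maxes {
		// field i must supply whatever the other fields cannot.
		bounds[i] = minTotal - (sumOfMaxes - m)
	}
	return bounds
}
```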
48 |
49 | The following graph shows bucket elimination over the course of an example query combining two fields, "age" and "wages":
50 |
51 |
52 | ![Bucket elimination during an example query](bucket_execution.png)
53 |
54 | # Performance
55 |
56 | Few database systems support custom scoring functions, and fewer (possibly none?) use algorithms designed for that purpose.
57 | In practice, I've found elasticsearch's
58 | [custom scoring functions](https://www.elastic.co/guide/en/elasticsearch/reference/0.90/query-dsl-function-score-query.html#query-dsl-function-score-query)
59 | to be quite fast, so I've benchmarked against it here. Please let me know about other systems I might benchmark against!
60 |
61 | This is a graph of how 5 different queries perform with varying database sizes (yellow is elasticsearch and blue is scoredb):
62 |
63 | ![Query times for elasticsearch (yellow) and scoredb (blue) across database sizes](scale_performance.png)
64 |
65 | The elasticsearch query times (yellow) look like they're rising exponentially, but the growth is actually linear; it only looks exponential because the X-axis has a logarithmic scale.
66 |
67 | The dataset is anonymized US census data, each object representing an individual. These are the scoring functions used for benchmarking, ordered from fastest to slowest (for scoredb):
68 |
69 | ```
70 | 10 * number_of_children + age
71 | 10000 * age + yearly_wages
72 | 100 * age + yearly_wages
73 | 40 * gender + weekly_work_hours
74 | 100.0 * gender + 9 * num_children + age + weekly_work_hours
75 | 5 * num_children + age + weekly_work_hours
76 | ```
77 |
78 | This is an unscientific test! Just my personal laptop, [this datafile](http://millstonecw.com/censusdata.csv.bz2) repeated a few times over for the biggest datasets, and `scoredb benchmark -maxrecords 10000000 -csv censusdata.csv`. There's no substitute for testing with your own data, queries, and hardware.
79 |
80 | It's clear from the graph that scoredb's performance can vary significantly based on the scoring function.
81 | Some guidance on scoring:
82 |
83 | * Prefer to combine fields with addition, multiplication, and, in particular, minimum, because they allow the computation of useful lower bounds. Combining fields with a max() function does not, because a bad value in one field can be completely overcome by a good value in another; see the worked example after this list.
84 | * Combining many fields instead of a few will make the query take longer, because it takes longer to determine useful lower bounds on each field.
85 | * Prefer to engineer weights so that the contributions from each of your fields are similar in scale. Scoredb may never be able to find useful bounds on fields that only tweak the final score slightly.
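
As a worked example of the first point: suppose the candidate result set is full and its worst score is 50. If the score is `["min", ["field", "age"], ["field", "height"]]`, any result that still qualifies must have *both* age ≥ 50 and height ≥ 50, so every bucket below 50 in either field can be skipped. Under a hypothetical max-based score, a result with age 0 could still qualify via a large height value, so neither field's buckets can be ruled out.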
86 |
87 |
88 | # Limitations
89 |
90 | Scoredb is minimalistic and highly specialized; it is intended to act as just one piece of a larger system:
91 | * Scoredb **has no delete or update operation**. To remove or change an object, you must build a new index. See below for how to swap a new index in under a running instance without downtime.
92 | * It stores objects as a flat set of key-value pairs with string keys and numeric values only. (internally, all values are 32 bit floating point values)
93 | * Scoredb can only respond to queries with lists of identifiers; scoredb's indexes do not provide efficient access to the original field data.
94 | * Scoredb has no built-in clustering, redundancy, or backup functions.
95 | * Adding objects to scoredb is slow if you add them one at a time. Bulk insertion should be used whenever possible.
96 | * Scoredb requires many open files; sometimes thousands of them. You will need to increase default filehandle limits on your system (see "ulimit" on linux).
97 | * Scoredb expects you to provide every field for every object; objects that are missing a field cannot be returned from queries that use the missing fields.
98 | * Scoredb data files are endian specific; most modern CPUs are little endian, so you won't normally have to worry about this.
99 |
100 | # Index Bulk Load
101 |
102 | You can create a database without running a server using the `scoredb load` command, which expects newline separated json records on stdin.
103 | So, for instance:
104 | ```
105 | printf '{"id":"person_1", "values":{"age":10, "height":53}}\n' > data.jsonl
106 | printf '{"id":"person_2", "values":{"age":32, "height":68}}\n' >> data.jsonl
107 | cat data.jsonl | scoredb load
108 | ```
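
If you are generating that file from Go, here is a small hypothetical helper (not part of scoredb) that writes records in the shape `scoredb load` expects:

```go
import (
	"encoding/json"
	"fmt"
	"io"
)

// writeRecord emits one newline-terminated JSON record per call.
func writeRecord(w io.Writer, id string, values map[string]float32) error {
	line, err := json.Marshal(map[string]interface{}{"id": id, "values": values})
	if err != nil {
		return err
	}
	_, err = fmt.Fprintf(w, "%s\n", line)
	return err
}
```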
109 |
110 | # Index Swapping
111 |
112 | If you need deletes or updates, you'll have to periodically rebuild your database and swap in updated versions.
113 | If you specify the -automigrate option to the server, it will look for new database directories that begin with the given data directory
114 | and keep the (lexicographically largest) one live. Use an atomic mv command to put it in place like so:
115 |
116 | ```
117 | $ cat new_data.jsonlines | scoredb load -datadir ./live_db_v00001 # Load initial data
118 | $ scoredb serve -readonly -automigrate -datadir ./live_db_v # Start server
119 |
120 | # when ready for a new version of the database,
121 |
122 | $ cat new_data.jsonlines | scoredb load -datadir ./tmp_db # Create the database
123 | $ mv ./tmp_db ./live_db_v00002 # Rename to match the watched prefix
124 |
125 | # The server should detect and load the new database here.
126 |
127 | $ rm -rf ./live_db_v00001 # Now, remove the old database
128 | ```
129 |
130 | # Supported Query Functions
131 |
132 | As shown above, queries are expressed as JSON expressions and then url encoded into the "score" query parameter.
133 | Each expression takes a lisp-like form: `["function_name", <argument>, <argument>, ...]`. These are the supported functions:
134 |
135 | #### `["field", <field_name>]`
136 | Simply produces the value of `<field_name>` as a score.
137 | * Example: `["field", "age"]` (return the age value as a score)
138 |
139 | #### `["scale", <factor>, <subexpression>]`
140 | Takes the result of `<subexpression>` and multiplies it by `<factor>`. `<factor>` may be negative.
141 | * Example: `["scale", 2.0, ["field", "age"]]` (age, doubled)
142 |
143 | #### `["sum", <subexpression>, <subexpression>, ...]`
144 | Sums the results of each `<subexpression>`.
145 | * Example: `["sum", ["field", "age"], ["field", "height"]]` (add age and height together)
146 |
147 | #### `["product", <subexpression>, <subexpression>, ...]`
148 | Multiplies the results of all `<subexpression>`s together. For bounding reasons, negative inputs are not allowed.
149 | * Example: `["product", ["field", "age"], ["field", "height"]]` (multiply age by height)
150 |
151 | #### `["min", <subexpression>, <subexpression>, ...]`
152 | Takes the least of the scores resulting from all `<subexpression>`s.
153 | * Example: `["min", ["field", "age"], ["field", "height"]]` (Take age or height, whichever is smaller)
154 |
155 | #### `["diff", <number>, <subexpression>]`
156 | Returns the absolute difference between the constant `<number>` and the value produced by `<subexpression>`.
157 | * Example: `["diff", 30, ["field", "age"]]` (how far each age is from 30)
158 |
159 | #### `["pow", <subexpression>, <exponent>]`
160 | Raises the result of the given `<subexpression>` to the `<exponent>` power.
161 | `<exponent>` may be fractional (for Nth roots) or negative.
162 | However, for bounding reasons, the subexpression may not produce negative values.
163 | * Example: `["pow", ["field", "age"], 2.0]` (age, squared)
164 |
165 | #### `["custom_linear", [[<x1>, <y1>], [<x2>, <y2>], ...], <subexpression>]`
166 | Establishes a user-defined function using a set of linearly interpolated [x, y] points.
167 | Inputs smaller than the smallest X value or larger than the largest X value get the closest specified Y value.
168 | * Example: `["custom_linear", [[0, 0.0], [30, 1.0], [80, 0.0]], ["field", "age"]]` Mapping ages to scores: 30 year-olds get a score of one, gradually declining to a score of zero for infants and the elderly.
169 |
170 | #### `["geo_distance", <latitude>, <longitude>, <lat_field_name>, <lng_field_name>]`
171 | Returns, as a score, the distance in kilometers from the fixed point (`<latitude>`, `<longitude>`) to the point given by each object's `<lat_field_name>` and `<lng_field_name>` fields.
172 | This is experimental: it may be inaccurate for large distances, and it fails across the antimeridian (±180° longitude).
173 | Since you typically want smaller distances to have higher scores, you'll probably want to wrap the "scale" or "custom_linear" functions around this one to invert it.
174 | * Example: `["geo_distance", 40.7, -74.0, "home_lat", "home_lng"]` Scores each result by how far its home_lat and home_lng fields put it from New York City.
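
For example, to turn distance into a closeness score that is 1.0 at the point itself and fades to 0.0 at 100 km (wrapping the query from the example above):

```
["custom_linear", [[0, 1.0], [100, 0.0]],
    ["geo_distance", 40.7, -74.0, "home_lat", "home_lng"]]
```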
175 |
176 |
177 | # Status
178 |
179 | Though it has reasonable test coverage and a small, straightforward codebase, scoredb is certainly alpha-quality software.
180 |
181 | Your bug reports are greatly appreciated.
182 |
183 |
184 | # Thanks
185 |
186 | Thanks are due to the [Samsung Accelerator](http://samsungaccelerator.com) which let us start this project as a hackathon proof of concept. Scoredb was built with this awesome team (in github lexicographic order!):
187 |
188 | * https://github.com/davidgljay
189 | * https://github.com/ploxiln
190 | * https://github.com/pschanely
191 | * https://github.com/rmarianski
192 | * https://github.com/sleepylemur
193 |
194 |
195 | # Plugs
196 |
197 | Check out some of our other side projects too:
198 |
199 | * [wildflower-touch](https://github.com/pschanely/wildflower-touch) is a proof-of-concept programming IDE and language for touch devices.
200 | * [music-tonight](http://musictonightapp.com) makes playlists of bands playing near you, tonight.
201 |
--------------------------------------------------------------------------------
/bench.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "bufio"
5 | "encoding/csv"
6 | "fmt"
7 | "io"
8 | "os"
9 | "strconv"
10 | "time"
11 | )
12 |
13 | type LinearCombinationBackend interface {
14 | BulkIndex(records []Record) error
15 | LinearQuery(numResults int, coefs map[string]float32) []string
16 | }
17 |
18 | func (db BaseDb) LinearQuery(numResults int, weights map[string]float32) []string {
19 | scorer := make([]interface{}, len(weights)+1)
20 | scorer[0] = "sum"
21 | idx := 1
22 | for key, weight := range weights {
23 | scorer[idx] = []interface{}{"scale", weight, []interface{}{"field", key}}
24 | idx += 1
25 | }
26 | result, _ := db.Query(Query{
27 | Limit: numResults,
28 | Scorer: scorer,
29 | })
30 | return result.Ids
31 | }
32 |
33 | func RunBenchmark(db LinearCombinationBackend, csvFilename string, maxRecords int64) ([]int64, []int64, [][]int64, error) {
34 | fp, err := os.Open(csvFilename)
35 | if err != nil {
36 | return nil, nil, nil, err
37 | }
38 | defer fp.Close()
39 |
40 | bufReader := bufio.NewReader(fp)
41 | csvReader := csv.NewReader(bufReader)
42 |
43 | header, err := csvReader.Read()
44 | if err == io.EOF {
45 | return nil, nil, nil, fmt.Errorf("Missing csv header")
46 | } else if err != nil {
47 | return nil, nil, nil, fmt.Errorf("Error reading csv header: %v", err)
48 | }
49 |
50 | // TODO ensure we have at least one value?
51 |
52 | colMap := make(map[int]string, len(header))
53 | for colIdx, colName := range header {
54 | colMap[colIdx] = colName
55 | }
56 |
57 | totalRecs := []int64{}
58 | indexTimes := []int64{}
59 | queryTimes := [][]int64{}
60 | nResults := 10
61 | weights := []map[string]float32{
62 | map[string]float32{
63 | "age": 100.0,
64 | "wages": 1.0,
65 | },
66 | map[string]float32{
67 | "age": 10000.0,
68 | "wages": 1.0,
69 | },
70 | map[string]float32{
71 | "sex": 40.0,
72 | "weekly_work_hours": 1.0,
73 | },
74 | map[string]float32{
75 | "fertility": 10.0,
76 | "age": 1.0,
77 | },
78 | map[string]float32{
79 | "fertility": 5.0,
80 | "age": 1.0,
81 | "weekly_work_hours": 1.0,
82 | },
83 | map[string]float32{
84 | "sex": 100.0,
85 | "fertility": 9.0,
86 | "age": 1.0,
87 | "weekly_work_hours": 1.0,
88 | },
89 | }
90 |
91 | bucketSize := 1000
92 | recordGroup := make([]Record, bucketSize)
93 | totalCount := int64(0)
94 | curGroupSize := 0
95 |
96 | for {
97 | row, err := csvReader.Read()
98 | if err == io.EOF {
99 | break
100 | } else if err != nil {
101 | return nil, nil, nil, fmt.Errorf("Error reading csv contents: %v", err)
102 | }
103 | record := make(map[string]float32, len(row))
104 | for fieldIdx, fieldValue := range row {
105 | recordKey, ok := colMap[fieldIdx]
106 | if !ok {
107 | // if we don't have header mappings, skip
108 | break
109 | }
110 | val64, err := strconv.ParseFloat(fieldValue, 32)
111 | if err != nil {
112 | continue
113 | }
114 | val32 := float32(val64)
115 | record[recordKey] = val32
116 | }
117 | if len(record) > 0 {
118 | // indexing one at a time
119 | // id := db.Index(record)
120 | // recordIndexIds = append(recordIndexIds, id)
121 |
122 | totalCount++
123 | recordGroup[curGroupSize] = Record{Id: fmt.Sprintf("%d", totalCount), Values: record}
124 | curGroupSize++
125 | if curGroupSize == bucketSize {
126 | t0 := time.Now().UnixNano()
127 | db.BulkIndex(recordGroup)
128 | totalRecs = append(totalRecs, totalCount)
129 | indexTimes = append(indexTimes, time.Now().UnixNano()-t0)
130 | queryRoundTimes := make([]int64, len(weights))
131 |
132 | for idx, query := range weights {
133 | //fmt.Printf("%08d Q start\n", time.Now().UnixNano() % 100000000)
134 | t0 := time.Now().UnixNano()
135 | results := db.LinearQuery(nResults, query)
136 | queryTime := time.Now().UnixNano() - t0
137 | fmt.Printf("%08d Q results: %v\n", time.Now().UnixNano()%100000000, results)
138 | queryRoundTimes[idx] = queryTime
139 | }
140 | curGroupSize = 0
141 | queryTimes = append(queryTimes, queryRoundTimes)
142 | bucketSize += bucketSize * 2 // grow the batch size geometrically (capped at 100000 below)
143 | if totalCount >= maxRecords {
144 | break
145 | }
146 | if bucketSize > 100000 {
147 | bucketSize = 100000
148 | }
149 | recordGroup = make([]Record, bucketSize)
150 | }
151 | }
152 | }
153 | if curGroupSize > 0 {
154 | finalRecords := make([]Record, curGroupSize)
155 | copy(finalRecords, recordGroup)
156 | db.BulkIndex(finalRecords)
157 | }
158 |
159 | return totalRecs, indexTimes, queryTimes, nil
160 | }
161 |
--------------------------------------------------------------------------------
/bitreader.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "bufio"
5 | "fmt"
6 | "github.com/edsrzf/mmap-go"
7 | "io"
8 | "os"
9 | "unsafe"
10 | )
11 |
12 | type BitWriter struct {
13 | BufferedWriter *bufio.Writer
14 | File *os.File
15 | Cur uint64
16 | CurBitsUsed uint
17 | }
18 |
19 | func FileIsAtEnd(file *os.File) bool {
20 | stat, _ := file.Stat()
21 | pos, _ := file.Seek(0, 1)
22 | return pos == stat.Size()
23 | }
24 |
25 | func WriteNativeLong(val uint64, writer io.Writer) error {
26 | byteSlice := (*((*[8]byte)(unsafe.Pointer(&val))))[:]
27 | _, err := writer.Write(byteSlice)
28 | return err
29 | }
30 |
31 | func ReadNativeLong(buf []byte) uint64 {
32 | return *((*uint64)(unsafe.Pointer(&buf[0])))
33 | }
34 |
35 | func NewBitWriter(file *os.File) (*BitWriter, error) {
36 | writer := BitWriter{File: file}
37 | if !FileIsAtEnd(file) {
38 | buf := make([]byte, 16)
39 |
40 | file.Seek(-16, 2) // seek to 16 bytes before EOF (whence=2 means "relative to end")
41 | nRead, err := file.Read(buf)
42 | if nRead != 16 {
43 | return nil, err
44 | }
45 | writer.CurBitsUsed = uint(ReadNativeLong(buf[8:]))
46 | writer.Cur = ReadNativeLong(buf) >> (64 - writer.CurBitsUsed)
47 |
48 | file.Seek(-16, 2) // seek back again so the 16-byte trailer gets overwritten by new writes
49 | }
50 | writer.BufferedWriter = bufio.NewWriter(file)
51 | return &writer, nil
52 | }
53 |
54 | func (writer *BitWriter) Close() error {
55 | bitsUsed := writer.CurBitsUsed
56 | WriteNativeLong(writer.Cur<<(64-bitsUsed), writer.BufferedWriter)
57 | WriteNativeLong(uint64(bitsUsed), writer.BufferedWriter)
58 | err := writer.BufferedWriter.Flush()
59 | if err != nil {
60 | return err
61 | }
62 | return writer.File.Close()
63 | }
64 |
65 | func (writer *BitWriter) WriteBits(val uint64, numBits uint) error { // assumes val is all zeros above numBits
66 | cur, bitsUsed := writer.Cur, writer.CurBitsUsed
67 | overflow := int(bitsUsed+numBits) - 64
68 | if overflow >= 0 { // split the write
69 | initialBits := numBits - uint(overflow)
70 | cur = (cur << initialBits) | (val >> uint(overflow))
71 | err := WriteNativeLong(cur, writer.BufferedWriter)
72 | if err != nil {
73 | return err
74 | }
75 | writer.Cur = val
76 | writer.CurBitsUsed = uint(overflow)
77 | } else {
78 | writer.Cur = (cur << numBits) | val
79 | writer.CurBitsUsed += numBits
80 | }
81 | return nil
82 | }
83 |
84 | func (writer *BitWriter) WriteVarUInt32(val uint32) error {
85 | var sizeFactor uint64
86 | if val&0xfffffff0 == 0 {
87 | sizeFactor = 0
88 | } else if val&0xffffff00 == 0 {
89 | sizeFactor = 1
90 | } else if val&0xffff0000 == 0 {
91 | sizeFactor = 2
92 | } else {
93 | sizeFactor = 3
94 | }
95 | writer.WriteBits(sizeFactor, 2) // 2-bit tag selecting the payload width
96 | numBits := uint(4 << sizeFactor) // payload is 4, 8, 16, or 32 bits
97 | writer.WriteBits(uint64(val), numBits)
98 | return nil
99 | }
100 |
101 | type BitReader struct {
102 | OrigMmap *mmap.MMap
103 | Mmap []uint64
104 | MmapPtr uint
105 | MmapPtrBitsLeft uint
106 | File *os.File
107 | Cur uint64
108 | CurBitsLeft uint
109 | }
110 |
111 | func NewBitReader(file *os.File) (*BitReader, error) {
112 | mapSlice, err := mmap.Map(file, mmap.RDONLY, 0)
113 | if err != nil {
114 | panic(err)
115 | }
116 | curPos, err := file.Seek(0, 1)
117 | if curPos%8 != 0 {
118 | panic(fmt.Sprintf("BitReader started at byte %v; must be 8 byte aligned", curPos))
119 | }
120 | return &BitReader{
121 | File: file,
122 | OrigMmap: &mapSlice,
123 | Mmap: (*((*[10000000]uint64)(unsafe.Pointer(&mapSlice[0]))))[:], // view the mmap as []uint64 (assumes files under ~80MB)
124 | MmapPtr: uint(curPos / 8),
125 | MmapPtrBitsLeft: 64,
126 | }, nil
127 | }
128 |
129 | func (reader *BitReader) Close() error {
130 | reader.Mmap = []uint64{}
131 | err := reader.OrigMmap.Unmap()
132 | if err != nil {
133 | return err
134 | }
135 | return reader.File.Close()
136 | }
137 |
138 | func (reader *BitReader) Refill(cur uint64, bitsLeft uint, numNeeded uint) (uint64, uint, error) {
139 | wanted := 64 - bitsLeft
140 | if wanted >= reader.MmapPtrBitsLeft {
141 | bits := reader.Mmap[reader.MmapPtr] << (64 - reader.MmapPtrBitsLeft)
142 | cur = cur | (bits >> bitsLeft)
143 | bitsLeft += reader.MmapPtrBitsLeft
144 | wanted -= reader.MmapPtrBitsLeft
145 | reader.MmapPtrBitsLeft = 64
146 | reader.MmapPtr += 1
147 | if wanted == 0 {
148 | return cur, bitsLeft, nil
149 | }
150 | }
151 | bits := reader.Mmap[reader.MmapPtr] << (64 - reader.MmapPtrBitsLeft)
152 | cur = cur | (bits >> bitsLeft)
153 | reader.MmapPtrBitsLeft -= wanted
154 | bitsLeft = 64
155 | return cur, bitsLeft, nil
156 | }
157 |
158 | func (reader *BitReader) ReadBits(numBits uint) (uint64, error) {
159 | cur, bitsLeft := reader.Cur, reader.CurBitsLeft
160 | var err error
161 | if bitsLeft < numBits {
162 | cur, bitsLeft, err = reader.Refill(cur, bitsLeft, numBits)
163 | if err != nil {
164 | return 0, err
165 | }
166 | }
167 | val := cur >> (64 - numBits)
168 | cur = cur << numBits
169 | bitsLeft -= numBits
170 | reader.Cur, reader.CurBitsLeft = cur, bitsLeft
171 | return val, nil
172 | }
173 |
174 | func (reader *BitReader) ReadVarUInt32() (uint32, error) {
175 | sizeFactor, err := reader.ReadBits(2)
176 | if err != nil {
177 | return 0, err
178 | }
179 | numNeeded := uint(4 << sizeFactor)
180 | val, err := reader.ReadBits(numNeeded)
181 | if err != nil {
182 | return 0, err
183 | }
184 | return uint32(val), nil
185 | }
186 |
--------------------------------------------------------------------------------
/bitreader_test.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "os"
5 | "testing"
6 | )
7 |
8 | func TestBitReader(t *testing.T) {
9 | filename := RmAllTestData()("bitreader")
10 | defer RmAllTestData()
11 |
12 | file, err := os.Create(filename)
13 | if err != nil {
14 | t.Fatalf("%v", err)
15 | }
16 |
17 | wtr, err := NewBitWriter(file)
18 | if err != nil {
19 | t.Fatalf("%v", err)
20 | }
21 | wtr.WriteVarUInt32(7)
22 | wtr.WriteBits(42, 21)
23 | wtr.WriteVarUInt32(0)
24 | wtr.WriteVarUInt32(1)
25 | wtr.WriteVarUInt32(2)
26 | wtr.WriteVarUInt32(123)
27 | wtr.WriteVarUInt32(12345)
28 | wtr.WriteVarUInt32(1234567)
29 | wtr.WriteVarUInt32(123456789)
30 | err = wtr.Close()
31 | if err != nil {
32 | t.Fatalf("%v", err)
33 | }
34 |
35 | // try adding more stuff at the end
36 | file, err = os.OpenFile(filename, os.O_RDWR, 0666)
37 | if err != nil {
38 | t.Fatalf("%v", err)
39 | }
40 | wtr, err = NewBitWriter(file)
41 | if err != nil {
42 | t.Fatalf("%v", err)
43 | }
44 | wtr.WriteVarUInt32(7654321)
45 | err = wtr.Close()
46 | if err != nil {
47 | t.Fatalf("%v", err)
48 | }
49 |
50 | fd, err := os.OpenFile(filename, os.O_RDWR, 0666)
51 | if err != nil {
52 | t.Fatalf("%v", err)
53 | }
54 | rdr, err := NewBitReader(fd)
55 | if err != nil {
56 | t.Fatalf("%v", err)
57 | }
58 | val, err := rdr.ReadVarUInt32()
59 | if err != nil || val != 7 {
60 | t.Fatalf("val:%v, err:%v", val, err)
61 | }
62 | fixedval, err := rdr.ReadBits(21)
63 | if err != nil || fixedval != 42 {
64 | t.Fatalf("val:%v, err:%v", fixedval, err)
65 | }
66 | val, err = rdr.ReadVarUInt32()
67 | if err != nil || val != 0 {
68 | t.Fatalf("val:%v, err:%v", val, err)
69 | }
70 | val, err = rdr.ReadVarUInt32()
71 | if err != nil || val != 1 {
72 | t.Fatalf("val:%v, err:%v", val, err)
73 | }
74 | val, err = rdr.ReadVarUInt32()
75 | if err != nil || val != 2 {
76 | t.Fatalf("val:%v, err:%v", val, err)
77 | }
78 | val, err = rdr.ReadVarUInt32()
79 | if err != nil || val != 123 {
80 | t.Fatalf("val:%v, err:%v", val, err)
81 | }
82 | val, err = rdr.ReadVarUInt32()
83 | if err != nil || val != 12345 {
84 | t.Fatalf("val:%v, err:%v", val, err)
85 | }
86 | val, err = rdr.ReadVarUInt32()
87 | if err != nil || val != 1234567 {
88 | t.Fatalf("val:%v, err:%v", val, err)
89 | }
90 | val, err = rdr.ReadVarUInt32()
91 | if err != nil || val != 123456789 {
92 | t.Fatalf("val:%v, err:%v", val, err)
93 | }
94 | val, err = rdr.ReadVarUInt32()
95 | if err != nil || val != 7654321 {
96 | t.Fatalf("val:%v, err:%v", val, err)
97 | }
98 | err = rdr.Close()
99 | if err != nil {
100 | t.Fatalf("%v", err)
101 | }
102 |
103 | }
104 |
105 | func TestBitReaderVolume(t *testing.T) {
106 | filename := RmAllTestData()("bitreader.volume")
107 | defer RmAllTestData()
108 |
109 | file, err := os.Create(filename)
110 | if err != nil {
111 | t.Fatalf("%v", err)
112 | }
113 |
114 | wtr, err := NewBitWriter(file)
115 | if err != nil {
116 | t.Fatalf("%v", err)
117 | }
118 |
119 | for i := 0; i < 200; i++ {
120 | wtr.WriteVarUInt32(uint32(i * i))
121 | wtr.WriteBits(uint64(i), uint(i%23)+10)
122 | }
123 | err = wtr.Close()
124 | if err != nil {
125 | t.Fatalf("%v", err)
126 | }
127 |
128 | fd, err := os.OpenFile(filename, os.O_RDWR, 0666)
129 | if err != nil {
130 | t.Fatalf("%v", err)
131 | }
132 | rdr, err := NewBitReader(fd)
133 | if err != nil {
134 | t.Fatalf("%v", err)
135 | }
136 | for i := 0; i < 200; i++ {
137 | val, err := rdr.ReadVarUInt32()
138 | if err != nil || int(val) != i*i {
139 | t.Fatalf("val:%v, err:%v", val, err)
140 | }
141 | fixedval, err := rdr.ReadBits(uint(i%23) + 10)
142 | if err != nil || int(fixedval) != i {
143 | t.Fatalf("val:%v, err:%v", fixedval, err)
144 | }
145 | }
146 | err = rdr.Close()
147 | if err != nil {
148 | t.Fatalf("%v", err)
149 | }
150 | }
151 |
--------------------------------------------------------------------------------
/boltiddb.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "encoding/binary"
5 | "fmt"
6 | "github.com/boltdb/bolt"
7 | )
8 |
9 | func NewBoltIdDb(file string) (*BoltIdDb, error) {
10 | db, err := bolt.Open(file, 0600, nil)
11 | if err != nil {
12 | return nil, err
13 | }
14 | return &BoltIdDb{Db: db}, nil
15 | }
16 |
17 | type BoltIdDb struct {
18 | Db *bolt.DB
19 | }
20 |
21 | func encodeScoreId(id int64) []byte {
22 | var buf [9]byte
23 | slice := buf[:]
24 | sz := binary.PutVarint(slice, id)
25 | return slice[:sz]
26 | }
27 |
28 | var boltBucketName []byte = []byte("ScoreDbIds")
29 |
30 | func (db *BoltIdDb) Put(scoreIds []int64, clientIds []string) error {
31 | return db.Db.Update(func(tx *bolt.Tx) error {
32 | b, err := tx.CreateBucketIfNotExists([]byte(boltBucketName))
33 | if err != nil {
34 | return err
35 | }
36 | for idx, scoreId := range scoreIds {
37 | err = b.Put(encodeScoreId(scoreId), []byte(clientIds[idx]))
38 | if err != nil {
39 | return err
40 | }
41 | }
42 | return nil
43 | })
44 | }
45 |
46 | func (db *BoltIdDb) Get(scoreIds []int64) ([]string, error) {
47 | result := make([]string, len(scoreIds))
48 |
49 | err := db.Db.View(func(tx *bolt.Tx) error {
50 | b := tx.Bucket([]byte(boltBucketName))
51 | for idx, scoreId := range scoreIds {
52 | clientIdBytes := b.Get(encodeScoreId(scoreId))
53 | if clientIdBytes == nil {
54 | return fmt.Errorf("Unable to find client id for internal id %d", scoreId)
55 | }
56 | result[idx] = string(clientIdBytes[:])
57 | //fmt.Printf(" ID %v %v %v %v\n", idx, scoreId, clientIdBytes, result[idx])
58 | }
59 | return nil
60 | })
61 |
62 | return result, err
63 | }
64 |
--------------------------------------------------------------------------------
/bucket_execution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pschanely/scoredb/57beea075b4b5a53ee0a27b9752a0ca544c4510d/bucket_execution.png
--------------------------------------------------------------------------------
/customlineardocitr.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "sort"
5 | )
6 |
7 | type CustomPoint struct {
8 | X, Y float32
9 | }
10 |
11 | // Remaps a value according to a user-specified function that linearly interpolates
12 | // among a set of (x, y) points.
13 | type CustomLinearDocItr struct {
14 | points []CustomPoint
15 | docItr DocItr
16 | }
17 |
18 | func ComputeCustomFunc(x float32, points []CustomPoint) float32 {
19 | numPoints := len(points)
20 | idx := sort.Search(numPoints, func(i int) bool {
21 | return points[i].X >= x
22 | })
23 | if idx == 0 {
24 | return points[0].Y
25 | } else if idx == numPoints {
26 | return points[numPoints-1].Y
27 | } else {
28 | p1 := points[idx-1]
29 | p2 := points[idx]
30 | pctInto := (x - p1.X) / (p2.X - p1.X)
31 | return p2.Y*pctInto + p1.Y*(1.0-pctInto)
32 | }
33 | }
34 |
35 | func (op *CustomLinearDocItr) Name() string { return "CustomLinearDocItr" }
36 | func (op *CustomLinearDocItr) Cur() (int64, float32) {
37 | docId, score := op.docItr.Cur()
38 | return docId, ComputeCustomFunc(score, op.points)
39 | }
40 | func (op *CustomLinearDocItr) GetBounds() (min, max float32) {
41 | insideMin, insideMax := op.docItr.GetBounds()
42 | outsideMin := ComputeCustomFunc(insideMin, op.points)
43 | outsideMax := ComputeCustomFunc(insideMax, op.points)
44 | if outsideMin > outsideMax { // swap if required
45 | outsideMin, outsideMax = outsideMax, outsideMin
46 | }
47 | // functions need not be monotonic, check for peaks inside the X range
48 | for _, point := range op.points {
49 | if point.X <= insideMin {
50 | continue
51 | } else if point.X >= insideMax {
52 | break
53 | } else {
54 | y := point.Y
55 | outsideMax = Max(outsideMax, y)
56 | outsideMin = Min(outsideMin, y)
57 | }
58 | }
59 | return outsideMin, outsideMax
60 | }
61 | func (op *CustomLinearDocItr) Close() {
62 | op.docItr.Close()
63 | }
64 | func (op *CustomLinearDocItr) Next(minId int64) bool {
65 | return op.docItr.Next(minId)
66 | }
67 |
68 | func CheckIntersection(yValue float32, p1, p2 CustomPoint, insideMin, insideMax *float32) {
69 | var xIntersect float32
70 | // find where the segment from p1 to p2 crosses the horizontal line y = yValue, if it does
71 | if p1.Y <= yValue && yValue <= p2.Y { // intersect while function is ascending
72 | earliness := (p2.Y - yValue) / (p2.Y - p1.Y)
73 | xIntersect = p1.X*earliness + p2.X*(1.0-earliness)
74 | } else if p1.Y >= yValue && yValue >= p2.Y { // intersect while function is descending
75 | lateness := (p1.Y - yValue) / (p1.Y - p2.Y)
76 | xIntersect = p2.X*lateness + p1.X*(1.0-lateness)
77 | } else {
78 | return
79 | }
80 | *insideMin = Min(xIntersect, *insideMin)
81 | *insideMax = Max(xIntersect, *insideMax)
82 | }
83 |
84 | func (op *CustomLinearDocItr) SetBounds(outsideMin, outsideMax float32) bool {
85 | insideMin, insideMax := PositiveInfinity, NegativeInfinity // start with impossible (inverted) range
86 | for idx := len(op.points) - 1; idx > 0; idx-- {
87 | p1 := op.points[idx-1]
88 | p2 := op.points[idx]
89 | CheckIntersection(outsideMin, p1, p2, &insideMin, &insideMax)
90 | CheckIntersection(outsideMax, p1, p2, &insideMin, &insideMax)
91 | if outsideMin <= p2.Y && p2.Y <= outsideMax {
92 | insideMin = Min(insideMin, p2.X)
93 | insideMax = Max(insideMax, p2.X)
94 | }
95 | }
96 | firstPoint := op.points[0]
97 | if outsideMin <= firstPoint.Y && firstPoint.Y <= outsideMax {
98 | insideMin = NegativeInfinity
99 | }
100 | lastPoint := op.points[len(op.points)-1]
101 | if outsideMin <= lastPoint.Y && lastPoint.Y <= outsideMax {
102 | insideMax = PositiveInfinity
103 | }
104 | return op.docItr.SetBounds(insideMin, insideMax)
105 | }
106 |
--------------------------------------------------------------------------------
/customlineardocitr_test.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "math"
5 | "testing"
6 | )
7 |
8 | func BoundsEqualish(actualMin, actualMax, expectedMin, expectedMax float32) bool {
9 | tolerance := 0.0000001
10 | if math.Abs(float64(actualMin-expectedMin)) > tolerance {
11 | return false
12 | }
13 | if math.Abs(float64(actualMax-expectedMax)) > tolerance {
14 | return false
15 | }
16 | return true
17 | }
18 |
19 | func TestComputeCustomFunc(t *testing.T) {
20 | v := ComputeCustomFunc(1.0, []CustomPoint{CustomPoint{0.0, 0.0}, CustomPoint{3.0, 3.0}})
21 | if v != 1.0 {
22 | t.Fatalf("%v", v)
23 | }
24 | v = ComputeCustomFunc(-1, []CustomPoint{CustomPoint{0, 0}, CustomPoint{3, 3}})
25 | if v != 0.0 {
26 | t.Fatalf("%v", v)
27 | }
28 | v = ComputeCustomFunc(3, []CustomPoint{CustomPoint{0, 0}, CustomPoint{3, 3}})
29 | if v != 3.0 {
30 | t.Fatalf("%v", v)
31 | }
32 | }
33 |
34 | func TestCustomLinearDocItr(t *testing.T) {
35 | inside := NewMemoryScoreDocItr([]float32{-1, 0, 2, 8, 5, 9, 12})
36 | outside := CustomLinearDocItr{
37 | docItr: inside,
38 | points: []CustomPoint{
39 | CustomPoint{0, 0}, // kind of a zig-zag function...
40 | CustomPoint{3, 3},
41 | CustomPoint{6, 1},
42 | CustomPoint{9, 2},
43 | },
44 | }
45 |
46 | min, max := inside.GetBounds()
47 | if !BoundsEqualish(min, max, -1, 12) {
48 | t.Fatalf("%v:%v", min, max)
49 | }
50 | min, max = outside.GetBounds()
51 | if !BoundsEqualish(min, max, 0.0, 3.0) {
52 | t.Fatalf("%v:%v", min, max)
53 | }
54 |
55 | // should leave unchanged
56 | outside.SetBounds(0, 4)
57 | min, max = inside.GetBounds()
58 | if !BoundsEqualish(min, max, -1, 12) {
59 | t.Fatalf("%v:%v", min, max)
60 | }
61 | min, max = outside.GetBounds()
62 | if !BoundsEqualish(min, max, 0.0, 3.0) {
63 | t.Fatalf("%v:%v", min, max)
64 | }
65 |
66 | // nudge the start up some
67 | outside.SetBounds(0.5, 3)
68 | min, max = inside.GetBounds()
69 | if !BoundsEqualish(min, max, 0.5, 12) {
70 | t.Fatalf("%v:%v", min, max)
71 | }
72 | min, max = outside.GetBounds()
73 | if !BoundsEqualish(min, max, 0.5, 3.0) {
74 | t.Fatalf("%v:%v", min, max)
75 | }
76 |
77 | // chop off the end (leaves a hole in the middle of the function)
78 | outside.SetBounds(0.5, 1.5)
79 | min, max = inside.GetBounds()
80 | if !BoundsEqualish(min, max, 0.5, 7.5) {
81 | t.Fatalf("%v:%v", min, max)
82 | }
83 | min, max = outside.GetBounds()
84 | if !BoundsEqualish(min, max, 0.5, 3.0) {
85 | t.Fatalf("%v:%v", min, max)
86 | }
87 |
88 | // chop off most of the end
89 | outside.SetBounds(0.5, 0.9)
90 | min, max = inside.GetBounds()
91 | if !BoundsEqualish(min, max, 0.5, 0.9) {
92 | t.Fatalf("%v:%v", min, max)
93 | }
94 | min, max = outside.GetBounds()
95 | if !BoundsEqualish(min, max, 0.5, 0.9) {
96 | t.Fatalf("%v:%v", min, max)
97 | }
98 |
99 | }
100 |
--------------------------------------------------------------------------------
/custommapdocitr.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 |
4 |
5 | // Remaps a value according to a user-specified mapping of values to scores
6 | type CustomMapDocItr struct {
7 | points map[float32]float32
8 | deflt float32
9 | docItr DocItr
10 | }
11 |
12 | func (op *CustomMapDocItr) ComputeCustomFunc(val float32) float32 {
13 | score, ok := op.points[val]
14 | if ok {
15 | return score
16 | } else {
17 | return op.deflt
18 | }
19 | }
20 |
21 | func (op *CustomMapDocItr) Name() string { return "CustomMapDocItr" }
22 | func (op *CustomMapDocItr) Cur() (int64, float32) {
23 | docId, score := op.docItr.Cur()
24 | return docId, op.ComputeCustomFunc(score)
25 | }
26 | func (op *CustomMapDocItr) GetBounds() (min, max float32) {
27 | insideMin, insideMax := op.docItr.GetBounds()
28 | outsideMin := op.deflt
29 | outsideMax := op.deflt
30 | for input, output := range op.points {
31 | if insideMin <= input && input <= insideMax {
32 | outsideMin = Min(outsideMin, output)
33 | outsideMax = Max(outsideMax, output)
34 | }
35 | }
36 | return outsideMin, outsideMax
37 | }
38 | func (op *CustomMapDocItr) Close() {
39 | op.docItr.Close()
40 | }
41 | func (op *CustomMapDocItr) Next(minId int64) bool {
42 | return op.docItr.Next(minId)
43 | }
44 |
45 | func (op *CustomMapDocItr) SetBounds(outsideMin, outsideMax float32) bool {
46 | if outsideMin <= op.deflt && op.deflt <= outsideMax {
47 | return true
48 | }
49 |
50 | insideMin, insideMax := PositiveInfinity, NegativeInfinity // start with impossible (inverted) range
51 | for input, output := range op.points {
52 | if outsideMin <= output && output <= outsideMax {
53 | insideMin = Min(insideMin, input)
54 | insideMax = Max(insideMax, input)
55 | }
56 | }
57 | return op.docItr.SetBounds(insideMin, insideMax)
58 | }
59 |
--------------------------------------------------------------------------------
/custommapdocitr_test.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestCustomMapDocItr(t *testing.T) {
8 | inside := NewMemoryScoreDocItr([]float32{-1, 0, 2, 8, 5, 9, 12})
9 | outside := CustomMapDocItr{
10 | docItr: inside,
11 | deflt: 0.0,
12 | points: map[float32]float32{ // kind of a zig-zag function...
13 | -2: -2,
14 | 2: 2,
15 | 5: 3,
16 | 6: 1,
17 | },
18 | }
19 |
20 | min, max := inside.GetBounds()
21 | if !BoundsEqualish(min, max, -1, 12) {
22 | t.Fatalf("%v:%v", min, max)
23 | }
24 | min, max = outside.GetBounds()
25 | if !BoundsEqualish(min, max, 0.0, 3.0) {
26 | t.Fatalf("%v:%v", min, max)
27 | }
28 |
29 | // should leave unchanged
30 | outside.SetBounds(-2, 4)
31 | min, max = inside.GetBounds()
32 | if !BoundsEqualish(min, max, -1, 12) {
33 | t.Fatalf("%v:%v", min, max)
34 | }
35 | min, max = outside.GetBounds()
36 | if !BoundsEqualish(min, max, 0.0, 3.0) {
37 | t.Fatalf("%v:%v", min, max)
38 | }
39 |
40 | // nudge the start up some
41 | outside.SetBounds(0.25, 3)
42 | min, max = inside.GetBounds()
43 | if !BoundsEqualish(min, max, 2, 6) {
44 | t.Fatalf("%v:%v", min, max)
45 | }
46 | min, max = outside.GetBounds()
47 | if !BoundsEqualish(min, max, 0.0, 3.0) {
48 | t.Fatalf("%v:%v", min, max)
49 | }
50 |
51 | outside.SetBounds(0.5, 1.5)
52 | min, max = inside.GetBounds()
53 | if !BoundsEqualish(min, max, 6, 6) {
54 | t.Fatalf("%v:%v", min, max)
55 | }
56 | min, max = outside.GetBounds()
57 | if !BoundsEqualish(min, max, 0, 1.0) {
58 | t.Fatalf("%v:%v", min, max)
59 | }
60 |
61 | }
62 |
--------------------------------------------------------------------------------
/dataset_tools/census_p_rec_gen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #for ZIPFILE in /mnt/census1990/census_1990/1990_PUMS_A/*.zip ; do
4 | # unzip -c $ZIPFILE
5 | #done | grep '^P' >census1990_people.dat
6 |
7 | INPUT=census1990_people.dat
8 | OUTPUT=census1990_people.csv
9 |
10 | # for this dataset, gawk output is different than mawk or nawk, for 7 records (out of millions)
11 | AWK=${AWK:-awk}
12 |
13 | COLUMNS="
14 | age
15 | children
16 | depart_for_work
17 | traveltime_to_work
18 | weekly_work_hours
19 | last_week_work_hours
20 | carpool_riders
21 | income
22 | wages
23 | poverty_percentage
24 | sex
25 | military_service_years
26 | "
27 |
28 | (printf 'id'
29 | for COL in $COLUMNS ; do
30 | printf ',%s' "$COL"
31 | done
32 | printf '\n' ) >$OUTPUT
33 |
34 | $AWK '{
35 | children = substr($0,89,2);
36 | children = (children == "00") ? 0 : int(children) - 1;
37 |
38 | printf("r%d,", NR);
39 | printf("%s,", substr($0, 15,2)); # age
40 | printf("%s,", children ); # children
41 | printf("%s,", substr($0,105,4)); # depart_for_work
42 | printf("%s,", substr($0,109,2)); # traveltime_to_work
43 | printf("%s,", substr($0,125,2)); # weekly_work_hours
44 | printf("%s,", substr($0, 93,2)); # last_week_work_hours
45 | printf("%s,", substr($0,104,1)); # carpool_riders
46 | printf("%s,", substr($0,133,6)); # income
47 | printf("%s,", substr($0,139,6)); # wages
48 | printf("%s,", substr($0, 41,3)); # poverty_percentage
49 | printf("%s,", substr($0, 11,1)); # sex
50 | printf("%s\n", substr($0, 83,2)); # military_service_years
51 | } ' <$INPUT >>$OUTPUT
52 |
--------------------------------------------------------------------------------
/dataset_tools/sample.csv:
--------------------------------------------------------------------------------
1 | id,first,second,third
2 | r1,1,2,3
3 | r2,0.1,11.234,01.23
4 | r3,000,03,001
5 |
--------------------------------------------------------------------------------
/db.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "container/heap"
5 | "errors"
6 | "fmt"
7 | "math"
8 | )
9 |
10 | type Query struct {
11 | Offset int
12 | Limit int
13 | MinScore float32
14 |
15 | // mixed, nested arrays of strings and numbers describing a function; for example: ["sum", ["field", "age"], ["field", "height"]]
16 | Scorer []interface{}
17 | }
18 |
19 | type DocScore struct {
20 | DocId int64
21 | Score float32
22 | }
23 |
24 | type Record struct {
25 | Id string
26 | Values map[string]float32
27 | }
28 |
29 | type QueryResult struct {
30 | Ids []string
31 | Scores []float32
32 | }
33 |
34 | // Three layers of database interfaces, each one wrapping the next:
35 |
36 | type Db interface { // Outermost interface; clients use this
37 | BulkIndex(records []Record) error
38 | Index(id string, values map[string]float32) error
39 | Query(query Query) (QueryResult, error)
40 | }
41 |
42 | type StreamingDb interface { // Uses a DocItr based query, useful for middleware that alters or combines result streams
43 | BulkIndex(records []map[string]float32) ([]int64, error)
44 | QueryItr(Scorer []interface{}) (DocItr, error)
45 | }
46 |
47 | type DbBackend interface { // the minimal interface to implement storage (filesystem, memory, etc)
48 | BulkIndex(records []map[string]float32) ([]int64, error)
49 | FieldDocItr(field string) DocItr
50 | }
51 |
52 | type IdBackend interface { // stores a mapping from scoredb's identifiers to the clients'
53 | Put(scoreIds []int64, clientIds []string) error
54 | Get(scoreIds []int64) ([]string, error)
55 | }
56 |
57 | type BaseDb struct {
58 | StreamingDb StreamingDb
59 | IdDb IdBackend
60 | }
61 |
62 | func (db BaseDb) BulkIndex(records []Record) error {
63 | clientIds := make([]string, len(records))
64 | values := make([]map[string]float32, len(records))
65 | for idx, rec := range records {
66 | values[idx] = rec.Values
67 | clientIds[idx] = rec.Id
68 | }
69 | scoreIds, err := db.StreamingDb.BulkIndex(values)
70 | if err != nil {
71 | return err
72 | }
73 | return db.IdDb.Put(scoreIds, clientIds)
74 | }
75 |
76 | func (db BaseDb) Index(id string, values map[string]float32) error {
77 | return db.BulkIndex([]Record{Record{Id: id, Values: values}})
78 | }
79 |
80 | func CandidateIsLess(r1, r2 DocScore) bool {
81 | s1, s2 := r1.Score, r2.Score
82 | if s1 < s2 {
83 | return true
84 | } else if s1 > s2 {
85 | return false
86 | } else {
87 | return r1.DocId < r2.DocId
88 | }
89 | }
90 |
91 | type BaseDbResultSet []DocScore
92 |
93 | func (h BaseDbResultSet) Len() int { return len(h) }
94 | func (h BaseDbResultSet) Less(i, j int) bool { return CandidateIsLess(h[i], h[j]) }
95 | func (h BaseDbResultSet) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
96 | func (h *BaseDbResultSet) Push(x interface{}) {
97 | *h = append(*h, x.(DocScore))
98 | }
99 | func (h *BaseDbResultSet) Pop() interface{} {
100 | old := *h
101 | n := len(old)
102 | x := old[n-1]
103 | *h = old[0 : n-1]
104 | return x
105 | }
106 |
107 | func (db BaseDb) Query(query Query) (QueryResult, error) {
108 | itr, err := db.StreamingDb.QueryItr(query.Scorer)
109 | if err != nil {
110 | return QueryResult{}, err
111 | }
112 | minScore, offset, limit := query.MinScore, query.Offset, query.Limit
113 | if limit == 0 { // we short circuit this case because the code below assumes at least one result
114 | return QueryResult{Ids: []string{}}, nil
115 | }
116 | //fmt.Printf("> %+v\n", query);
117 | numResults := offset + limit
118 | resultData := make(BaseDbResultSet, 0, numResults+1)
119 | results := &resultData
120 | heap.Init(results)
121 | minCandidate := DocScore{Score: float32(math.Inf(-1))}
122 | maxScore := float32(math.Inf(1))
123 | docId := int64(-1)
124 | var score float32
125 | for itr.Next(docId + 1) {
126 | docId, score = itr.Cur()
127 | if score < minScore {
128 | continue
129 | }
130 | candidate := DocScore{DocId: docId, Score: score}
131 | if CandidateIsLess(minCandidate, candidate) {
132 | heap.Push(results, candidate)
133 | if results.Len() > numResults {
134 | heap.Pop(results)
135 | minCandidate = resultData[0]
136 | itr.SetBounds(minCandidate.Score, maxScore)
137 | }
138 | }
139 | }
140 | itr.Close()
141 |
142 | for offset > 0 && len(resultData) > 0 {
143 | heap.Pop(results)
144 | offset -= 1
145 | }
146 |
147 | numResults = results.Len()
148 | var resultIds = make([]int64, numResults)
149 | var resultScores = make([]float32, numResults)
150 | for idx := range resultIds {
151 | rec := heap.Pop(results).(DocScore)
152 | i := numResults - (idx + 1)
153 | resultIds[i] = rec.DocId
154 | resultScores[i] = rec.Score
155 | }
156 | //fmt.Printf("< %+v\n", resultIds);
157 | //fmt.Printf("< %+v\n", resultScores);
158 |
159 | clientIds, err := db.IdDb.Get(resultIds)
160 | if err != nil {
161 | return QueryResult{}, err
162 | }
163 | return QueryResult{Ids: clientIds, Scores: resultScores}, nil
164 | }
165 |
166 | func ToFloat32(val interface{}) (float32, error) {
167 | switch typed := val.(type) {
168 | case float32:
169 | return typed, nil
170 | case float64:
171 | return float32(typed), nil
172 | default:
173 | return 0.0, fmt.Errorf("Invalid value ('%v') given, must be a floating point number", val)
174 | }
175 | }
176 |
177 | func ToXyPoints(input interface{}) ([]CustomPoint, error) {
178 | switch inputPoints := input.(type) {
179 | case []interface{}:
180 | points := make([]CustomPoint, len(inputPoints))
181 | for idx, inputPoint := range inputPoints {
182 | pair := inputPoint.([]interface{})
183 | if len(pair) != 2 {
184 | return nil, fmt.Errorf("Invalid (x,y) point; found: '%v' instead", pair)
185 | }
186 | xPoint, err := ToFloat32(pair[0])
187 | if err != nil {
188 | return nil, err
189 | }
190 | yPoint, err := ToFloat32(pair[1])
191 | if err != nil {
192 | return nil, err
193 | }
194 | points[idx] = CustomPoint{xPoint, yPoint}
195 | }
196 | return points, nil
197 | default:
198 | return nil, fmt.Errorf("Expected array of (x,y) points; found: '%v' instead", input)
199 | }
200 | }
201 |
202 | // BaseStreamingDb : The usual way to bridge a StreamingDb to a DbBackend
203 |
204 | type BaseStreamingDb struct {
205 | Backend DbBackend
206 | }
207 |
208 | func (db BaseStreamingDb) BulkIndex(records []map[string]float32) ([]int64, error) {
209 | return db.Backend.BulkIndex(records)
210 | }
211 |
212 | func (db BaseStreamingDb) QueryItr(scorer []interface{}) (DocItr, error) {
213 | args := scorer[1:]
214 | switch scorer[0].(string) {
215 | case "sum":
216 | fieldItrs := make([]DocItr, len(args))
217 | for idx, v := range args {
218 | itr, err := db.QueryItr(v.([]interface{}))
219 | if err != nil {
220 | return nil, err
221 | }
222 | fieldItrs[idx] = itr
223 | }
224 | return NewSumDocItr(fieldItrs), nil
225 | case "product":
226 | fieldItrs := make([]DocItr, len(args))
227 | for idx, v := range args {
228 | itr, err := db.QueryItr(v.([]interface{}))
229 | if err != nil {
230 | return nil, err
231 | }
232 | fieldItrs[idx] = itr
233 | }
234 | return NewProductDocItr(fieldItrs), nil
235 | case "min":
236 | fieldItrs := make([]DocItr, len(args))
237 | for idx, v := range args {
238 | itr, err := db.QueryItr(v.([]interface{}))
239 | if err != nil {
240 | return nil, err
241 | }
242 | fieldItrs[idx] = itr
243 | }
244 | return NewMinDocItr(fieldItrs), nil
245 | case "scale":
246 | if len(args) != 2 {
247 | return nil, errors.New("Wrong number of arguments to scale function")
248 | }
249 | itr, err := db.QueryItr(args[1].([]interface{}))
250 | if err != nil {
251 | return nil, err
252 | }
253 | weight := args[0]
254 | switch typed := weight.(type) {
255 | case float32:
256 | return &ScaleDocItr{typed, itr}, nil
257 | case float64:
258 | return &ScaleDocItr{float32(typed), itr}, nil
259 | default:
260 | return nil, fmt.Errorf("Invalid weight ('%v') given to scale function, must be a floating point number", weight)
261 | }
262 | case "diff":
263 | if len(args) != 2 {
264 | return nil, errors.New("Wrong number of arguments to diff function")
265 | }
266 | target, err := ToFloat32(args[0])
267 | if err != nil {
268 | return nil, err
269 | }
270 | itr, err := db.QueryItr(args[1].([]interface{}))
271 | if err != nil {
272 | return nil, err
273 | }
274 | return &DiffDocItr{
275 | target: target,
276 | itr: itr,
277 | }, nil
278 | case "pow":
279 | if len(args) != 2 {
280 | return nil, errors.New("Wrong number of arguments to pow function")
281 | }
282 | exp, err := ToFloat32(args[1])
283 | if err != nil {
284 | return nil, err
285 | }
286 | itr, err := db.QueryItr(args[0].([]interface{}))
287 | if err != nil {
288 | return nil, err
289 | }
290 | return &PowDocItr{
291 | itr: itr,
292 | exp: exp,
293 | }, nil
294 |
295 | case "custom_map":
296 | if len(args) != 3 {
297 | return nil, errors.New("Wrong number of arguments to custom_map function")
298 | }
299 |
300 | points, err := ToXyPoints(args[0])
301 | if err != nil {
302 | return nil, err
303 | }
304 |
305 | deflt, err := ToFloat32(args[1])
306 | if err != nil {
307 | return nil, err
308 | }
309 |
310 | itr, err := db.QueryItr(args[2].([]interface{}))
311 | if err != nil {
312 | return nil, err
313 | }
314 |
315 | scoremap := make(map[float32]float32)
316 | for _, pt := range points {
317 | scoremap[pt.X] = pt.Y
318 | }
319 | return &CustomMapDocItr{
320 | points: scoremap,
321 | deflt: deflt,
322 | docItr: itr,
323 | }, nil
324 |
325 | case "custom_linear":
326 | if len(args) != 2 {
327 | return nil, errors.New("Wrong number of arguments to custom_linear function")
328 | }
329 |
330 | inputPoints := args[0].([]interface{})
331 | points := make([]CustomPoint, len(inputPoints))
332 | for idx, inputPoint := range inputPoints {
333 | pair := inputPoint.([]interface{})
334 | if len(pair) != 2 {
335 | return nil, fmt.Errorf("Invalid (x,y) point in custom_linear; found: '%v' instead", pair)
336 | }
337 | xPoint, err := ToFloat32(pair[0])
338 | if err != nil {
339 | return nil, err
340 | }
341 | yPoint, err := ToFloat32(pair[1])
342 | if err != nil {
343 | return nil, err
344 | }
345 | points[idx] = CustomPoint{xPoint, yPoint}
346 | }
347 |
348 | itr, err := db.QueryItr(args[1].([]interface{}))
349 | if err != nil {
350 | return nil, err
351 | }
352 |
353 | return &CustomLinearDocItr{
354 | points: points,
355 | docItr: itr,
356 | }, nil
357 |
358 | case "geo_distance":
359 | if len(args) != 4 {
360 | return nil, errors.New("Wrong number of arguments to geo_distance function")
361 | }
362 | lat, err := ToFloat32(args[0])
363 | if err != nil {
364 | return nil, err
365 | }
366 | lng, err := ToFloat32(args[1])
367 | if err != nil {
368 | return nil, err
369 | }
370 | latFieldName := args[2].(string)
371 | lngFieldName := args[3].(string)
372 | latItr := &DiffDocItr{target: lat, itr: db.Backend.FieldDocItr(latFieldName)}
373 | lngItr := &DiffDocItr{target: lng, itr: db.Backend.FieldDocItr(lngFieldName)}
374 | // bias longitude distances by approximate latitude (matters less at poles)
375 | multiplier := float32(math.Cos(float64(lat) * math.Pi / 180.0))
376 | biasedLngItr := &ScaleDocItr{multiplier, lngItr}
377 | // square each component
378 | latSquaredItr := NewPowDocItr(latItr, 2.0)
379 | lngSquaredItr := NewPowDocItr(biasedLngItr, 2.0)
380 | // sum and square root
381 | distanceItr := NewPowDocItr(NewSumDocItr([]DocItr{latSquaredItr, lngSquaredItr}), 0.5)
382 | // convert degrees distance to radians and multiply by radius of the earth (in km)
383 | earthRadius := float32(6371.0 * math.Pi / 180.0)
384 | return &ScaleDocItr{earthRadius, distanceItr}, nil
385 | case "field":
386 | if len(args) != 1 {
387 | return nil, errors.New("Wrong number of arguments to field function")
388 | }
389 | key := args[0].(string)
390 | return db.Backend.FieldDocItr(key), nil
391 | default:
392 | return nil, fmt.Errorf("Scoring function '%v' is not recognized", scorer[0])
393 | }
394 | }
395 |
--------------------------------------------------------------------------------
/db_test.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "fmt"
5 | "math"
6 | "os"
7 | "path"
8 | "strings"
9 | "testing"
10 | )
11 |
12 | func CallAndCheck(db Db, t *testing.T, r1 []string, limit int, scorer []interface{}) {
13 | r2, err := db.Query(Query{Limit: limit, Scorer: scorer, MinScore: float32(math.Inf(-1))})
14 | if err != nil {
15 | t.Fatal(err)
16 | }
17 | if len(r1) != len(r2.Ids) {
18 | t.Fatalf("expected: %v found: %v", r1, r2)
19 | }
20 | for idx, v1 := range r1 {
21 | if v1 != r2.Ids[idx] {
22 | t.Fatalf("expected: %v found: %v", r1, r2)
23 | }
24 | }
25 | }
26 |
27 | func DbBasicsTest(db Db, t *testing.T) {
28 | err := db.Index("r1", map[string]float32{"age": 32, "height": 2.0, "lat": 45.0, "lon": -70.0})
29 | if err != nil {
30 | t.Error(fmt.Sprintf("%v", err))
31 | }
32 | err = db.Index("r2", map[string]float32{"age": 25, "height": 1.5, "lat": 43.0, "lon": -69.0})
33 | if err != nil {
34 | t.Error(fmt.Sprintf("%v", err))
35 | }
36 | err = db.Index("r3", map[string]float32{"age": 16, "height": 2.5, "lat": 45.0, "lon": -95.0})
37 | if err != nil {
38 | t.Error(fmt.Sprintf("%v", err))
39 | }
40 | CallAndCheck(db, t, []string{"r3", "r1"}, 2, []interface{}{"field", "height"})
41 | CallAndCheck(db, t, []string{"r1", "r2"}, 2, []interface{}{"sum",
42 | []interface{}{"field", "age"},
43 | []interface{}{"field", "height"}})
44 | CallAndCheck(db, t, []string{"r1"}, 1, []interface{}{"sum",
45 | []interface{}{"field", "age"},
46 | []interface{}{"field", "height"}})
47 | CallAndCheck(db, t, []string{"r3", "r1"}, 2, []interface{}{"sum",
48 | []interface{}{"scale", 0.1, []interface{}{"field", "age"}},
49 | []interface{}{"scale", 10.0, []interface{}{"field", "height"}}})
50 | CallAndCheck(db, t, []string{"r3", "r2"}, 2, []interface{}{"sum",
51 | []interface{}{"scale", -1.0, []interface{}{"field", "age"}},
52 | []interface{}{"scale", -1.0, []interface{}{"field", "height"}}})
53 | CallAndCheck(db, t, []string{"r2", "r1", "r3"}, 3, []interface{}{"sum",
54 | []interface{}{"scale", 1.0, []interface{}{"field", "age"}},
55 | []interface{}{"scale", -100.0, []interface{}{"field", "height"}}})
56 | CallAndCheck(db, t, []string{}, 0, []interface{}{"sum",
57 | []interface{}{"field", "age"},
58 | []interface{}{"field", "height"}})
59 | CallAndCheck(db, t, []string{"r1", "r2", "r3"}, 3, []interface{}{"sum",
60 | []interface{}{"field", "age"},
61 | []interface{}{"pow", []interface{}{"field", "height"}, 2.0}})
62 | CallAndCheck(db, t, []string{"r3", "r1", "r2"}, 3, []interface{}{"sum",
63 | []interface{}{"field", "age"},
64 | []interface{}{"pow", []interface{}{"field", "height"}, 10.0}})
65 | CallAndCheck(db, t, []string{"r1", "r3", "r2"}, 3, []interface{}{"product",
66 | []interface{}{"field", "age"},
67 | []interface{}{"field", "height"}})
68 | CallAndCheck(db, t, []string{"r3", "r1", "r2"}, 3, []interface{}{"min",
69 | []interface{}{"field", "age"},
70 | []interface{}{"field", "height"}})
71 | CallAndCheck(db, t, []string{"r1", "r2", "r3"}, 3, []interface{}{"custom_linear",
72 | []interface{}{ // scores by closeness to age 30:
73 | []interface{}{float32(0), float32(0.0)},
74 | []interface{}{float32(30), float32(1.0)},
75 | []interface{}{float32(100), float32(0.0)}},
76 | []interface{}{"field", "age"}})
77 | CallAndCheck(db, t, []string{"r3", "r2", "r1"}, 3, []interface{}{"geo_distance", 45.0, -69.9, "lat", "lon"})
78 | CallAndCheck(db, t, []string{"r3", "r1", "r2"}, 3, []interface{}{"geo_distance", 20.0, 70.0, "lat", "lon"})
79 | }
80 |
81 | func RmAllTestData() func(name string) string {
82 | tmpDir := os.TempDir()
83 | dirfd, err := os.Open(tmpDir)
84 | if err == nil {
85 | names, err := dirfd.Readdirnames(0)
86 | if err == nil {
87 | for _, name := range names {
88 | if strings.HasPrefix(name, "scoredbtest.") {
89 | os.RemoveAll(path.Join(tmpDir, name))
90 | }
91 | }
92 | }
93 | }
94 | return func(name string) string {
95 | fullname := path.Join(tmpDir, "scoredbtest."+name)
96 | return fullname
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/diffdocitr.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import ()
4 |
5 | // (Absolute) difference between a value and a constant
6 | type DiffDocItr struct {
7 | target float32
8 | itr DocItr
9 | }
10 |
11 | func Abs(val float32) float32 {
12 | if val < 0 {
13 | return -val
14 | } else {
15 | return val
16 | }
17 | }
18 |
19 | func Max(v1, v2 float32) float32 {
20 | if v1 < v2 {
21 | return v2
22 | } else {
23 | return v1
24 | }
25 | }
26 |
27 | func Min(v1, v2 float32) float32 {
28 | if v1 > v2 {
29 | return v2
30 | } else {
31 | return v1
32 | }
33 | }
34 |
35 | func (op *DiffDocItr) Name() string { return "DiffDocItr" }
36 | func (op *DiffDocItr) Cur() (int64, float32) {
37 | docId, score := op.itr.Cur()
38 | return docId, Abs(score - op.target)
39 | }
40 | func (op *DiffDocItr) GetBounds() (min, max float32) {
41 | target := op.target
42 | min, max = op.itr.GetBounds()
43 | d1 := Abs(min - target)
44 | d2 := Abs(max - target)
45 | maxDist := Max(d1, d2)
46 | if min <= target && target <= max {
47 | return 0.0, maxDist
48 | } else {
49 | return Min(d1, d2), maxDist
50 | }
51 | }
52 | func (op *DiffDocItr) Close() {
53 | op.itr.Close()
54 | }
55 | func (op *DiffDocItr) Next(minId int64) bool {
56 | return op.itr.Next(minId)
57 | }
58 |
59 | func (op *DiffDocItr) SetBounds(min, max float32) bool {
60 | // min is not useful to us right now
61 | target := op.target
62 | return op.itr.SetBounds(target-max, target+max)
63 | }
64 |
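65 | // Worked example (a sketch): with target = 5 and an underlying iterator
66 | // bounded to [1, 3], the endpoint distances are |1-5| = 4 and |3-5| = 2, and
67 | // since the target lies outside [1, 3], GetBounds reports [2, 4]. Had the
68 | // bounds been [4, 8], the target would fall inside them and GetBounds would
69 | // report [0, 3].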
--------------------------------------------------------------------------------
/docitr.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "math"
5 | )
6 |
7 | var PositiveInfinity = float32(math.Inf(1))
8 | var NegativeInfinity = float32(math.Inf(-1))
9 |
10 | type DocItr interface {
11 | // An iterator over (document id, score) values.
12 |
13 | Name() string
14 |
15 | // return false if the iterator is now known to not produce any more values
16 | SetBounds(min, max float32) bool
17 |
18 | GetBounds() (min, max float32)
19 |
20 | // Next() skips the iterator ahead to at least as far as the given id.
21 | // It always advances the iterator at least one position.
22 | // It returns false if there are no remaining values.
23 | // Iterators need a call to Next(0) to initialize them to a real value; they all initially have a docId of -1.
24 | Next(minId int64) bool
25 |
26 | Close() // release resources held by this iterator (if any)
27 |
28 | Cur() (int64, float32) // doc id and score of current result, or (-1, 0.0) if the iterator has not been initialized
29 |
30 | }
31 |
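32 | // A minimal usage sketch of the contract above (illustrative only; any DocItr
33 | // implementation in this package works the same way):
34 | //
35 | //	itr := NewMemoryDocItr([]float32{1.0, 0.5}, []int64{3, 7})
36 | //	for id := int64(0); itr.Next(id); {
37 | //		docId, score := itr.Cur()
38 | //		fmt.Println(docId, score)
39 | //		id = docId + 1 // ask for a strictly later document next time
40 | //	}
41 | //	itr.Close()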
--------------------------------------------------------------------------------
/elastic.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "bytes"
5 | "encoding/json"
6 | "fmt"
7 | "io/ioutil"
8 | "log"
9 | "net/http"
10 | "strconv"
11 | "strings"
12 | )
13 |
14 | type EsScoreDb struct {
15 | BaseURL, Index string
16 | }
17 |
18 | func (db *EsScoreDb) BulkIndex(records []Record) error {
19 | var jsonbuf bytes.Buffer
20 | for _, rec := range records {
21 | jsonbuf.WriteString(fmt.Sprintf("{\"index\":{\"_id\":\"%s\"}}\n", rec.Id))
22 | buf, err := json.Marshal(rec.Values)
23 | if err != nil {
24 | return err
25 | }
26 | jsonbuf.Write(buf)
27 | jsonbuf.WriteString("\n")
28 | }
29 | payload := jsonbuf.String()
30 | url := db.BaseURL + db.Index + "/external/_bulk"
31 | //fmt.Printf("Bulk: %v @ %v\n", payload, url)
32 | resp, err := http.Post(url, "application/json", strings.NewReader(payload))
33 | if err != nil {
34 | panic(err)
35 | }
36 | body, _ := ioutil.ReadAll(resp.Body)
37 | resp.Body.Close()
38 | //fmt.Printf("Bulk resp: %+v\n", string(body))
39 | var parsedResponse struct{ Errors bool }
40 | err = json.Unmarshal(body, &parsedResponse)
41 | if err != nil {
42 | panic(err)
43 | }
44 | if parsedResponse.Errors {
45 | panic(string(body))
46 | }
47 |
48 | db.RefreshIndex()
49 |
50 | return nil
51 | }
52 |
53 | type EsQueryResponse struct {
54 | Hits struct {
55 | Hits []struct {
56 | Id string `json:"_id"`
57 | } `json:"hits"`
58 | } `json:"hits"`
59 | }
60 |
61 | func (db *EsScoreDb) LinearQuery(numResults int, weights map[string]float32) []string {
62 | var scorefactors bytes.Buffer
63 | first := true
64 | for key, val := range weights {
65 | if !first {
66 | scorefactors.WriteString(",")
67 | } else {
68 | first = false
69 | }
70 | scorefactors.WriteString(fmt.Sprintf(`{"field_value_factor":{"field":"%s","factor":%f}}`, key, val))
71 | }
72 | data := fmt.Sprintf(`{
73 | "size":%d,
74 | "fields":[],
75 | "query":{
76 | "function_score":{
77 | "functions":[%s],
78 | "score_mode": "sum"
79 | }
80 | }
81 | }`, numResults, scorefactors.String())
82 | resp, err := http.Post(db.BaseURL+db.Index+"/external/_search?pretty", "application/json", strings.NewReader(data))
83 | if err != nil {
84 | panic(err)
85 | }
86 | body, _ := ioutil.ReadAll(resp.Body)
87 | resp.Body.Close()
88 | //fmt.Println(string(body))
89 | queryResp := EsQueryResponse{}
90 | err = json.Unmarshal(body, &queryResp)
91 | if err != nil {
92 | panic(err)
93 | }
94 | hits := queryResp.Hits.Hits
95 | resultIds := make([]string, len(hits))
96 | for idx, rec := range hits {
97 | resultIds[idx] = rec.Id
98 | }
99 | return resultIds
100 | }
101 |
102 | func (db *EsScoreDb) DeleteIndex() {
103 | req, _ := http.NewRequest("DELETE", db.BaseURL+db.Index, nil)
104 | resp, _ := http.DefaultClient.Do(req)
105 | body, _ := ioutil.ReadAll(resp.Body)
106 | resp.Body.Close()
107 | fmt.Println("Delete Index: " + string(body))
108 | }
109 |
110 | func (db *EsScoreDb) CreateIndex() {
111 | payload := "{\"settings\": {\"index\": {\"number_of_shards\" : 1}}}"
112 | req, _ := http.NewRequest("PUT", db.BaseURL+db.Index, strings.NewReader(payload))
113 | resp, _ := http.DefaultClient.Do(req)
114 | body, _ := ioutil.ReadAll(resp.Body)
115 | resp.Body.Close()
116 | fmt.Println("Create Index: " + string(body))
117 | }
118 |
119 | func (db *EsScoreDb) RefreshIndex() {
120 | req, _ := http.NewRequest("POST", db.BaseURL+db.Index+"/_refresh", nil)
121 | resp, _ := http.DefaultClient.Do(req)
122 | ioutil.ReadAll(resp.Body)
123 | resp.Body.Close()
124 | //fmt.Println("Refresh Index: " + string(body))
125 | }
126 |
127 | func (db *EsScoreDb) ParseQuery(query string) map[string]float32 {
128 | fields := strings.Split(query, ",")
129 | coefs := make(map[string]float32)
130 | for _, f := range fields {
131 | fieldparts := strings.Split(f, "=")
132 | if len(fieldparts) != 2 {
133 | log.Fatalf("ERROR: malformed query\n")
134 | }
135 | val, _ := strconv.ParseFloat(fieldparts[1], 32)
136 | coefs[fieldparts[0]] = float32(val)
137 | }
138 | return coefs
139 | }
140 |
141 | /*
142 | var (
143 | deleteflag = flag.Bool("delete", false, "delete data from elasticsearch")
144 | queryflag = flag.String("query", "", "column_name=weighting_factor,...")
145 | urlflag = flag.String("esurl", "http://localhost:9200/", "URL to elasticsearch instance with trailing slash")
146 | indexflag = flag.String("index", "scoredb", "Elasticsearch index name")
147 | )
148 |
149 | func main() {
150 | flag.Parse()
151 | db := &EsScoreDb{BaseURL: *urlflag, Index: *indexflag}
152 | if *deleteflag {
153 | db.DeleteIndex()
154 | } else if len(*queryflag) > 0 {
155 | db.LinearQuery(10, db.ParseQuery(*queryflag))
156 | } else {
157 | fmt.Println("need to use --query querystring, or --delete")
158 | }
159 | }
160 | */
161 |
--------------------------------------------------------------------------------
/fielddocitr.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "container/heap"
5 | //"fmt"
6 | "math"
7 | //"time"
8 | )
9 |
10 | type FieldDocItr struct {
11 | field string
12 | score float32
13 | docId int64
14 | min, max float32
15 | lists FieldDocItrs
16 | }
17 |
18 | func NewFieldDocItr(field string, lists FieldDocItrs) *FieldDocItr {
19 | itr := &FieldDocItr{
20 | field: field,
21 | score: 0.0,
22 | docId: -1,
23 | lists: lists,
24 | }
25 | min, max := float32(math.Inf(1)), float32(math.Inf(-1))
26 | for _, docItr := range lists {
27 | curMin, curMax := docItr.GetBounds()
28 | if curMin < min {
29 | min = curMin
30 | }
31 | if curMax > max {
32 | max = curMax
33 | }
34 | }
35 | itr.min, itr.max = min, max
36 | return itr
37 | }
38 |
39 | type FieldDocItrs []DocItr // FieldDocItrs implements heap.Interface
40 | func (so FieldDocItrs) Len() int { return len(so) }
41 | func (so FieldDocItrs) Less(i, j int) bool {
42 | d1, _ := so[i].Cur()
43 | d2, _ := so[j].Cur()
44 | return d1 < d2
45 | }
46 | func (so *FieldDocItrs) Pop() interface{} {
47 | old := *so
48 | n := len(old)
49 | item := old[n-1]
50 | *so = old[0 : n-1]
51 | return item
52 | }
53 | func (so *FieldDocItrs) Push(x interface{}) {
54 | *so = append(*so, x.(DocItr))
55 | }
56 | func (so FieldDocItrs) Swap(i, j int) {
57 | so[i], so[j] = so[j], so[i]
58 | }
59 |
60 | func (op *FieldDocItr) Name() string { return "FieldDocItr" }
61 | func (op *FieldDocItr) Cur() (int64, float32) {
62 | return op.docId, op.score
63 | }
64 | func (op *FieldDocItr) GetBounds() (min, max float32) {
65 | return op.min, op.max
66 | }
67 | func (op *FieldDocItr) SetBounds(min, max float32) bool {
68 | op.min = min
69 | op.max = max
70 | for {
71 | keepGoing := false
72 | anyMore := false
73 | for idx, subOp := range op.lists {
74 | if subOp.SetBounds(min, max) {
75 | anyMore = true
76 | } else {
77 | subOp.Close()
78 | lists := op.lists
79 | lists[idx] = lists[len(lists)-1]
80 | op.lists = lists[:len(lists)-1]
81 | keepGoing = true
82 | //fmt.Printf("%08d Field elim @doc %08d, %05d remain (%s)\n", time.Now().UnixNano() % 100000000, op.docId, len(op.lists), op.field)
83 | break
84 | }
85 | }
86 | if !keepGoing {
87 | return anyMore
88 | }
89 | heap.Init(&op.lists)
90 | }
91 | }
92 |
93 | func (op *FieldDocItr) Close() {
94 | for _, list := range op.lists {
95 | list.Close()
96 | }
97 | }
98 |
99 | func (op *FieldDocItr) Next(minId int64) bool {
100 | if len(op.lists) == 0 {
101 | return false
102 | }
103 | var docId int64
104 | var score float32
105 | for {
106 | docId, score = op.lists[0].Cur()
107 | if docId >= minId {
108 | break
109 | }
110 | if !op.lists[0].Next(minId) {
111 | op.lists[0].Close()
112 | heap.Remove(&op.lists, 0)
113 | if len(op.lists) == 0 {
114 | //fmt.Printf("FieldDocItr Next(%v) %v END\n", minId, op.field)
115 | return false
116 | }
117 | } else {
118 | heap.Fix(&op.lists, 0)
119 | }
120 | }
121 | op.docId = docId
122 | op.score = score
123 | //fmt.Printf("FieldDocItr Next(%v) %v %v %v\n", minId, op.field, op.docId, op.score)
124 | return true
125 | }
126 |
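127 | // SetBounds above is where entire posting-list buckets get pruned. As a
128 | // sketch: if a field's values are split into one bucket holding scores in
129 | // [0, 10) and another holding [10, 20), and the search narrows the useful
130 | // range to [12, 20), SetBounds on the first bucket's iterator reports false,
131 | // so it is closed and dropped from the heap before its documents are read.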
--------------------------------------------------------------------------------
/fielddocitr_test.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestFieldOp(t *testing.T) {
8 | l1 := NewMemoryDocItr(
9 | []float32{1.0, 1.0, 0.5, 1.0, 0.5},
10 | []int64{1, 5, 7, 8, 9},
11 | )
12 | l2 := NewMemoryDocItr(
13 | []float32{1.0, 1.0},
14 | []int64{2, 5},
15 | )
16 | fieldop := FieldDocItr{lists: FieldDocItrs{l1, l2}}
17 | if !fieldop.Next(0) {
18 | t.FailNow()
19 | }
20 | docId, _ := fieldop.Cur()
21 | if docId != 1 {
22 | t.FailNow()
23 | }
24 | if !fieldop.Next(2) {
25 | t.FailNow()
26 | }
27 | docId, _ = fieldop.Cur()
28 | if docId != 2 {
29 | t.FailNow()
30 | }
31 | if !fieldop.Next(3) {
32 | t.FailNow()
33 | }
34 | docId, _ = fieldop.Cur()
35 | if docId != 5 {
36 | t.FailNow()
37 | }
38 | if !fieldop.SetBounds(0.75, 1.0) {
39 | t.FailNow()
40 | }
41 | if !fieldop.Next(6) {
42 | t.FailNow()
43 | }
44 | docId, _ = fieldop.Cur()
45 | if docId != 8 {
46 | t.FailNow()
47 | }
48 | if fieldop.Next(9) {
49 | t.FailNow()
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/fsscoredb.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "encoding/binary"
5 | "errors"
6 | "fmt"
7 | "io/ioutil"
8 | "math"
9 | "os"
10 | "path"
11 | "strconv"
12 | //"time"
13 | )
14 |
15 | func NewFsScoreDb(dataDir string) *FsScoreDb {
16 | err := EnsureDirectory(dataDir)
17 | if err != nil {
18 | panic(err)
19 | }
20 | fields := make(map[string]OrderedFileInfos)
21 |
22 | // Load pre-existing file headers
23 | highestId := int64(0)
24 | fieldNames, err := ioutil.ReadDir(dataDir)
25 | if err != nil {
26 | panic(err)
27 | }
28 | for _, fieldName := range fieldNames {
29 | fieldPath := path.Join(dataDir, fieldName.Name())
30 | fields[fieldName.Name()] = make(OrderedFileInfos, 0) // keyed by field name, matching the appends below
31 | dataFiles, err := ioutil.ReadDir(fieldPath)
32 | if err != nil {
33 | panic(err)
34 | }
35 | for _, dataFile := range dataFiles {
36 | numVarBits := 32 - len(dataFile.Name())
37 | prefixVal, err := strconv.ParseInt(dataFile.Name(), 2, 32)
38 | if err != nil {
39 | continue
40 | }
41 | dataFilePath := path.Join(fieldPath, dataFile.Name())
42 | fd, err := os.OpenFile(dataFilePath, os.O_RDONLY, 0)
43 | if err != nil {
44 | panic(err)
45 | }
46 | var header PostingListHeader
47 | err = binary.Read(fd, binary.LittleEndian, &header)
48 | if err != nil {
49 | panic(err)
50 | }
51 | fd.Close()
52 | if header.LastDocId > highestId {
53 | highestId = header.LastDocId
54 | }
55 | fileInfo := &FileInfo{
56 | header: &header,
57 | path: dataFilePath,
58 | numVariableBits: uint(numVarBits),
59 | minVal: math.Float32frombits(uint32(prefixVal << uint(numVarBits))),
60 | }
61 | fields[fieldName.Name()] = append(fields[fieldName.Name()], fileInfo)
62 | }
63 |
64 | }
65 |
66 | //fmt.Printf("INIT fs score db %v (highest id %d)\n", dataDir, highestId)
67 | return &FsScoreDb{
68 | dataDir: dataDir,
69 | fields: fields,
70 | nextId: highestId + 1,
71 | }
72 | }
73 |
74 | type FsScoreDb struct {
75 | dataDir string
76 | fields map[string]OrderedFileInfos
77 | nextId int64
78 | }
79 |
80 | type PostingListHeader struct {
81 | FirstDocId int64
82 | LastDocId int64
83 | NumDocs int64
84 | MinVal float32
85 | MaxVal float32
86 | FirstDocScore float32
87 | Version uint8
88 | // padding to make struct 8-byte aligned when using encoding/binary operations:
89 | _ uint8
90 | _ uint16
91 | }
92 |
93 | type FileInfo struct {
94 | header *PostingListHeader
95 | writer *BitWriter
96 | path string
97 | numVariableBits uint // number of bits at the bottom of the float that are variable (smaller means it is a more specific bucket)
98 | minVal float32 // the minimum value allowed in the bucket (minVal and maxVal in the PostingListHeader are for the actual values stored in the list)
99 | }
100 |
101 | type OrderedFileInfos []*FileInfo
102 |
103 | func (a OrderedFileInfos) Len() int { return len(a) }
104 | func (a OrderedFileInfos) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
105 | func (a OrderedFileInfos) Less(i, j int) bool {
106 | if a[i].minVal < a[j].minVal {
107 | return true
108 | } else if a[i].minVal > a[j].minVal {
109 | return false
110 | } else {
111 | return a[i].numVariableBits > a[j].numVariableBits
112 | }
113 | }
114 |
115 | func MaxDocsForFile(fileInfo *FileInfo) int64 {
116 | header := fileInfo.header
117 | if header.MinVal == header.MaxVal { // do not split single-valued lists
118 | return math.MaxInt64
119 | }
120 | if fileInfo.numVariableBits <= 0 { // do not split lists at full precision
121 | return math.MaxInt64
122 | }
123 | fixedFractionBits := 23 - fileInfo.numVariableBits // 23 bits is size of the fraction part
124 | return 20*1568 + (1 << (fixedFractionBits))
125 | }
126 |
127 | func Exists(path string) bool {
128 | _, err := os.Stat(path)
129 | return !os.IsNotExist(err)
130 | }
131 |
132 | func EnsureDirectory(dir string) error {
133 | if Exists(dir) {
134 | return nil
135 | } else {
136 | parent := path.Dir(dir)
137 | EnsureDirectory(parent)
138 | return os.Mkdir(dir, 0755)
139 | }
140 | }
141 |
142 | var INITIAL_VAR_BITS = uint(23 - 0)
143 | var HEADER_SIZE = int64(binary.Size(PostingListHeader{}))
144 | var numOpenFiles = 0
145 |
146 | func FindPostingListFileForWrite(db *FsScoreDb, docId int64, key string, value float32) (*FileInfo, error) {
147 | var err error
148 | fieldDir := path.Join(db.dataDir, key)
149 | files, ok := db.fields[key]
150 | if !ok {
151 | db.fields[key] = make(OrderedFileInfos, 0)
152 | files = db.fields[key]
153 | EnsureDirectory(fieldDir)
154 | }
155 | var fileInfo *FileInfo = nil
156 | bestVarBits := uint(32)
157 | // TODO idea here is that we should be able to use the ordering of OrderedFileInfos to
158 | // binary search for the right one; right now this is just a simplistic linear scan
159 | for _, curFileInfo := range files {
160 | numVar := curFileInfo.numVariableBits
161 | if math.Float32bits(curFileInfo.minVal)>>numVar == math.Float32bits(value)>>numVar {
162 | if numVar < bestVarBits {
163 | fileInfo = curFileInfo
164 | bestVarBits = numVar
165 | }
166 | }
167 | }
168 | if fileInfo == nil { // no matching posting list found
169 | fileInfo, err = MakeFileInfo(fieldDir, value, INITIAL_VAR_BITS, docId)
170 | if err != nil {
171 | return nil, err
172 | }
173 | files = append(files, fileInfo)
174 | db.fields[key] = files
175 | if err != nil {
176 | return nil, err
177 | }
178 | } else {
179 | if fileInfo.header.NumDocs >= MaxDocsForFile(fileInfo) {
180 | newBits := uint(0)
181 | if fileInfo.numVariableBits > 3 { // guard against unsigned underflow
182 | newBits = fileInfo.numVariableBits - 3
183 | }
184 | fileInfo, err = MakeFileInfo(fieldDir, value, newBits, docId)
185 | if err != nil {
186 | return nil, err
187 | }
188 | files = append(files, fileInfo)
189 | db.fields[key] = files
190 | }
191 | }
192 |
193 | if fileInfo.writer == nil {
194 | numOpenFiles += 1
195 | fd, err := os.OpenFile(fileInfo.path, os.O_RDWR, 0666)
196 | if err != nil {
197 | return nil, err
198 | }
199 | var header PostingListHeader
200 | err = binary.Read(fd, binary.LittleEndian, &header)
201 | if err != nil {
202 | return nil, err
203 | }
204 | fileInfo.header = &header
205 | writer, err := NewBitWriter(fd)
206 | if err != nil {
207 | return nil, err
208 | }
209 | fileInfo.writer = writer
210 | }
211 | return fileInfo, nil
212 | }
213 |
214 | func MakeFileInfo(fieldDir string, value float32, numVarBits uint, docId int64) (*FileInfo, error) {
215 | var fd *os.File
216 | var err error
217 | var header PostingListHeader
218 |
219 | scoreBits := math.Float32bits(value)
220 | minVal := math.Float32frombits((scoreBits >> numVarBits) << numVarBits)
221 | numFixedBits := 32 - numVarBits
222 | scoreBitString := fmt.Sprintf("%032b", int64(scoreBits))
223 | fixedBits := scoreBitString[:numFixedBits]
224 | filename := path.Join(fieldDir, fixedBits)
225 |
226 | if Exists(filename) {
227 | numOpenFiles += 1
228 | fd, err = os.OpenFile(filename, os.O_RDWR, 0666)
229 | if err != nil {
230 | return nil, err
231 | }
232 | err = binary.Read(fd, binary.LittleEndian, &header)
233 | if err != nil {
234 | return nil, err
235 | }
236 | fd.Seek(0, 2) // Goto EOF (whence=2 means "relative to end")
237 | } else {
238 | numOpenFiles += 1
239 | fd, err = os.Create(filename)
240 | if err != nil {
241 | return nil, err
242 | }
243 | header = PostingListHeader{
244 | Version: 1,
245 | MinVal: value,
246 | MaxVal: value,
247 | FirstDocId: docId,
248 | FirstDocScore: value,
249 | LastDocId: docId,
250 | NumDocs: 1,
251 | }
252 | err = binary.Write(fd, binary.LittleEndian, header)
253 | if err != nil {
254 | return nil, err
255 | }
256 | }
257 | if header.Version != 1 {
258 | return nil, errors.New("Incorrect file version")
259 | }
260 | writer, err := NewBitWriter(fd)
261 | if err != nil {
262 | return nil, err
263 | }
264 | return &FileInfo{
265 | header: &header,
266 | writer: writer,
267 | path: filename,
268 | numVariableBits: numVarBits,
269 | minVal: minVal,
270 | }, nil
271 | }
272 |
273 | func WritePostingListEntry(fileInfo *FileInfo, docId int64, score float32) {
274 | header := fileInfo.header
275 | docIncr := docId - header.LastDocId
276 |
277 | if docIncr == 0 {
278 | // special case for first entry (it exists in the header, so do not write here)
279 | return
280 | }
281 |
282 | // header maintenance
283 | header.LastDocId = docId
284 | header.NumDocs += 1
285 | if score < header.MinVal {
286 | header.MinVal = score
287 | }
288 | if score > header.MaxVal {
289 | header.MaxVal = score
290 | }
291 | scoreBits := math.Float32bits(score)
292 | scoreMask := uint32(0xffffffff) >> (32 - fileInfo.numVariableBits)
293 | scoreRemainder := uint64(scoreBits & scoreMask)
294 |
295 | if scoreRemainder == 0 {
296 | fileInfo.writer.WriteVarUInt32(uint32(docIncr << 1))
297 | } else {
298 | fileInfo.writer.WriteVarUInt32(uint32((docIncr << 1) | 1))
299 | fileInfo.writer.WriteBits(scoreRemainder, fileInfo.numVariableBits)
300 | }
301 |
302 | }
303 |
304 | func (op *PostingListDocItr) Close() {
305 | if op.reader != nil {
306 | numOpenFiles -= 1
307 | err := op.reader.Close()
308 | if err != nil {
309 | panic(fmt.Sprintf("%v", err))
310 | }
311 | }
312 | }
313 |
314 | func (op *PostingListDocItr) Next(minId int64) bool {
315 | reader := op.reader
316 | if reader == nil {
317 | if op.docId == -1 && minId <= op.header.FirstDocId {
318 | op.docId = op.header.FirstDocId
319 | op.score = op.header.FirstDocScore
320 | return true
321 | } else {
322 | //fmt.Printf("%08d Open @doc %08d %s\n", time.Now().UnixNano() % 100000000, minId, op.path)
323 | fd, err := os.OpenFile(op.path, os.O_RDONLY, 0)
324 | numOpenFiles += 1
325 | if err != nil {
326 | panic(fmt.Sprintf("%v", err))
327 | }
328 | _, err = fd.Seek(HEADER_SIZE, 0)
329 | if err != nil {
330 | panic(fmt.Sprintf("%v", err))
331 | }
332 | reader, err = NewBitReader(fd)
333 | if err != nil {
334 | panic(fmt.Sprintf("%v", err))
335 | }
336 | op.reader = reader
337 | }
338 | }
339 | docId := op.docId
340 | for {
341 | if docId == op.maxDocId {
342 | return false
343 | }
344 | pair, err := reader.ReadVarUInt32()
345 | if err != nil {
346 | panic(fmt.Sprintf("%v", err))
347 | }
348 | docIncr := pair >> 1
349 | var valueBits uint64
350 | if pair&1 == 1 {
351 | valueBits, err = reader.ReadBits(op.numVarBits)
352 | if err != nil {
353 | panic(fmt.Sprintf("%v", err))
354 | }
355 | }
356 | if docIncr == 0 {
357 | panic(fmt.Sprintf("Inconsistent file data @ %v %v", reader.MmapPtr*8, op.path))
358 | }
359 | docId += int64(docIncr)
360 | if docId < minId {
361 | continue
362 | }
363 | score := math.Float32frombits(op.rangePrefix | uint32(valueBits))
364 | op.docId = docId
365 | op.score = score
366 | return true
367 | }
368 | }
369 |
370 | func (db *FsScoreDb) BulkIndex(records []map[string]float32) ([]int64, error) {
371 | ids := make([]int64, len(records))
372 | for idx, record := range records {
373 | docid := db.nextId
374 | db.nextId += 1
375 | for key, value := range record {
376 | fileInfo, err := FindPostingListFileForWrite(db, docid, key, value)
377 | if err != nil {
378 | return nil, err
379 | }
380 | WritePostingListEntry(fileInfo, docid, value)
381 | ids[idx] = docid
382 | }
383 | }
384 | CloseWriters(db)
385 | return ids, nil
386 | }
387 |
388 | func CloseWriters(db *FsScoreDb) error {
389 | for _, fieldIndex := range db.fields {
390 | for idx, fileInfo := range fieldIndex {
391 | writer := fileInfo.writer
392 | if writer == nil {
393 | continue
394 | }
395 | origPos, err := writer.File.Seek(0, 1) // save position to restore later
396 | if err != nil {
397 | return err
398 | }
399 | _, err = writer.File.Seek(0, 0)
400 | if err != nil {
401 | return err
402 | }
403 | err = binary.Write(writer.File, binary.LittleEndian, fileInfo.header)
404 | if err != nil {
405 | return err
406 | }
407 | _, err = writer.File.Seek(origPos, 0)
408 | if err != nil {
409 | return err
410 | }
411 | err = writer.Close()
412 | if err != nil {
413 | return err
414 | }
415 | numOpenFiles -= 1
416 | fieldIndex[idx].writer = nil
417 | }
418 | }
419 | return nil
420 | }
421 |
422 | func (db *FsScoreDb) Index(record map[string]float32) (int64, error) {
423 | docid := db.nextId
424 | db.nextId += 1
425 | for key, value := range record {
426 | fileInfo, err := FindPostingListFileForWrite(db, docid, key, value)
427 | if err != nil {
428 | return -1, err
429 | }
430 | WritePostingListEntry(fileInfo, docid, value)
431 | }
432 | CloseWriters(db)
433 | return docid, nil
434 | }
435 |
436 | func (db *FsScoreDb) FieldDocItr(fieldName string) DocItr {
437 | files, ok := db.fields[fieldName]
438 | if !ok {
439 | return NewMemoryScoreDocItr([]float32{})
440 | }
441 | itrs := make([]DocItr, len(files))
442 | for fileIdx, fileInfo := range files {
443 | itrs[fileIdx] = NewPostingListDocItr(math.Float32bits(fileInfo.minVal), fileInfo.path, fileInfo.header, fileInfo.numVariableBits)
444 | }
445 | return NewFieldDocItr(fieldName, itrs)
446 | }
447 |
448 | type PostingListDocItr struct {
449 | score float32
450 | docId int64
451 | maxDocId int64
452 | min, max float32
453 | numVarBits uint
454 | rangePrefix uint32
455 | path string
456 | reader *BitReader
457 | header *PostingListHeader
458 | }
459 |
460 | func NewPostingListDocItr(rangePrefix uint32, path string, header *PostingListHeader, numVarBits uint) DocItr {
461 | itr := &PostingListDocItr{
462 | score: 0.0,
463 | docId: -1,
464 | maxDocId: header.LastDocId,
465 | min: header.MinVal,
466 | max: header.MaxVal,
467 | numVarBits: numVarBits,
468 | rangePrefix: rangePrefix,
469 | path: path,
470 | header: header,
471 | }
472 | return itr
473 | }
474 |
475 | func (op *PostingListDocItr) Name() string { return "PostingListDocItr" }
476 | func (op *PostingListDocItr) Cur() (int64, float32) {
477 | return op.docId, op.score
478 | }
479 | func (op *PostingListDocItr) GetBounds() (min, max float32) {
480 | return op.min, op.max
481 | }
482 | func (op *PostingListDocItr) SetBounds(min, max float32) bool {
483 | if min > op.min {
484 | op.min = min
485 | }
486 | if max < op.max {
487 | op.max = max
488 | }
489 | if op.min > op.max {
490 | return false
491 | }
492 | return true
493 | }
494 |
--------------------------------------------------------------------------------
/fsscoredb_test.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 | )
7 |
8 | func TestFsScore(t *testing.T) {
9 | testdir := RmAllTestData()("fsscoredb.1")
10 | defer RmAllTestData()
11 | db := BaseDb{StreamingDb: BaseStreamingDb{NewFsScoreDb(testdir)}, IdDb: NewMemoryIdDb()}
12 | DbBasicsTest(db, t)
13 | }
14 |
15 | func TestFsScoreLarge(t *testing.T) {
16 | testdir := RmAllTestData()("fsscoredb.2")
17 | defer RmAllTestData()
18 | db := BaseDb{StreamingDb: BaseStreamingDb{NewFsScoreDb(testdir)}, IdDb: NewMemoryIdDb()}
19 |
20 | for i := 0; i < 100; i++ {
21 | db.Index(fmt.Sprintf("r%d", i), map[string]float32{"age": float32(1000 + 100 - i), "height": 100 + 1.0 + float32(i%10)/10.0})
22 | }
23 |
24 | CallAndCheck(db, t, []string{"r0", "r1"}, 2, []interface{}{"sum",
25 | []interface{}{"field", "age"},
26 | []interface{}{"field", "height"}})
27 | }
28 |
--------------------------------------------------------------------------------
/http.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "io/ioutil"
7 | "math"
8 | "net/http"
9 | "net/url"
10 | "strconv"
11 | )
12 |
13 | type ScoreDbServer struct {
14 | Db Db
15 | ReadOnly, AutoMigrate bool
16 | }
17 |
18 | func serializeIds(ids []int64) (string, error) {
19 | b, err := json.Marshal(ids)
20 | if err != nil {
21 | return "", err
22 | }
23 | s := string(b)
24 | return s, nil
25 | }
26 |
27 | func QueryIntVal(queryParams url.Values, key string, defaultValue int) (int, error) {
28 | vals, ok := queryParams[key]
29 | if !ok || len(vals) == 0 {
30 | return defaultValue, nil
31 | }
32 | return strconv.Atoi(vals[0])
33 | }
34 |
35 | func QueryFloatVal(queryParams url.Values, key string, defaultValue float32) (float32, error) {
36 | vals, ok := queryParams[key]
37 | if !ok || len(vals) == 0 {
38 | return defaultValue, nil
39 | }
40 | f64, err := strconv.ParseFloat(vals[0], 32)
41 | if err != nil {
42 | return 0.0, err
43 | } else {
44 | return float32(f64), nil
45 | }
46 | }
47 |
48 | func (sds *ScoreDbServer) ServeHTTP(w http.ResponseWriter, req *http.Request) {
49 | p := req.URL.Path
50 | if p[0] == '/' {
51 | p = p[1:]
52 | }
53 |
54 | if req.Method == "PUT" && !sds.ReadOnly {
55 |
56 | b, err := ioutil.ReadAll(req.Body)
57 | if err != nil {
58 | http.Error(w, "Could not read request body", 400)
59 | return
60 | }
61 | var records []Record
62 | if len(p) > 0 {
63 | var values map[string]float32
64 | err = json.Unmarshal(b, &values)
65 | if err == nil {
66 | records = append(records, Record{Id: p, Values: values})
67 | }
68 | } else {
69 | err = json.Unmarshal(b, &records)
70 | }
71 | if err != nil {
72 | http.Error(w, fmt.Sprintf("Could not parse json: %v", err), 400)
73 | return
74 | }
75 | err = sds.Db.BulkIndex(records)
76 | if err != nil {
77 | http.Error(w, "Could not index data", 500)
78 | return
79 | }
80 |
81 | } else if req.Method == "GET" && len(p) == 0 {
82 |
83 | queryParams := req.URL.Query()
84 |
85 | offset, err := QueryIntVal(queryParams, "offset", 0)
86 | if err != nil {
87 | http.Error(w, "Invalid value for offset", 400)
88 | return
89 | }
90 |
91 | limit, err := QueryIntVal(queryParams, "limit", 10)
92 | if err != nil {
93 | http.Error(w, "Invalid value for limit", 400)
94 | return
95 | }
96 |
97 | minScore, err := QueryFloatVal(queryParams, "minScore", float32(math.Inf(-1)))
98 | if err != nil {
99 | http.Error(w, "Invalid value for minscore", 400)
100 | return
101 | }
102 |
103 | scorerStrings, ok := queryParams["score"]
104 | if !ok || len(scorerStrings) == 0 {
105 | http.Error(w, "No score function was specified", 400)
106 | return
107 | }
108 | scorer := new([]interface{})
109 | err = json.Unmarshal([]byte(scorerStrings[0]), scorer)
110 | if err != nil {
111 | http.Error(w, "Score parameter is not a valid JSON array", 400)
112 | return
113 | }
114 |
115 | query := Query{
116 | Offset: offset,
117 | Limit: limit,
118 | MinScore: minScore,
119 | Scorer: *scorer,
120 | }
121 |
122 | results, err := sds.Db.Query(query)
123 | if err != nil {
124 | fmt.Printf("Internal error. %+v: %v\n", query, err)
125 | http.Error(w, "Internal Error in ScoreDB; please report", 500)
126 | return
127 | }
128 | response, err := json.Marshal(results)
129 | if err != nil {
130 | fmt.Printf("Internal error. %+v: %v\n", query, err)
131 | http.Error(w, "Internal Error in ScoreDB; please report", 500)
132 | return
133 | }
134 | fmt.Fprintf(w, "%s\n", response)
135 |
136 | } else {
137 |
138 | http.NotFound(w, req)
139 | return
140 |
141 | }
142 | }
143 |
144 | func ServeHttp(addr string, db Db, readOnly bool) error {
145 | scoreDbServer := ScoreDbServer{Db: db, ReadOnly: readOnly}
146 | return http.ListenAndServe(addr, &scoreDbServer)
147 | }
148 |
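149 | // Example requests against this handler (a sketch; the port is the default
150 | // from scoredb/main.go, and the score parameter must be URL-encoded in
151 | // practice):
152 | //
153 | //	PUT http://localhost:11625/user42 with body {"age": 32, "height": 1.8}
154 | //	GET http://localhost:11625/?score=["field","age"]&limit=10
155 | //
156 | // The score parameter is the same JSON scorer array that Db.Query accepts.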
--------------------------------------------------------------------------------
/memorydb.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "fmt"
5 | "math"
6 | )
7 |
8 | func NewMemoryIdDb() MemoryIdDb {
9 | return MemoryIdDb{make(map[int64]string)}
10 | }
11 |
12 | type MemoryIdDb struct {
13 | bindings map[int64]string
14 | }
15 |
16 | func (db MemoryIdDb) Put(scoreIds []int64, clientIds []string) error {
17 | for idx, scoreId := range scoreIds {
18 | db.bindings[scoreId] = clientIds[idx]
19 | }
20 | return nil
21 | }
22 |
23 | func (db MemoryIdDb) Get(scoreIds []int64) ([]string, error) {
24 | result := make([]string, len(scoreIds))
25 | for idx, scoreId := range scoreIds {
26 | clientId, ok := db.bindings[scoreId]
27 | if !ok {
28 | return nil, fmt.Errorf("Unable to find client id for internal id %d", scoreId)
29 |
30 | }
31 | result[idx] = clientId
32 | }
33 | return result, nil
34 | }
35 |
36 | type MemoryScoreDb struct {
37 | Fields map[string][]float32
38 | nextId int64
39 | }
40 |
41 | func NewMemoryScoreDb() *MemoryScoreDb {
42 | return &MemoryScoreDb{
43 | Fields: make(map[string][]float32),
44 | nextId: 1,
45 | }
46 | }
47 |
48 | func (db *MemoryScoreDb) BulkIndex(records []map[string]float32) ([]int64, error) {
49 | fields := db.Fields
50 | ids := make([]int64, len(records))
51 | for idx, record := range records {
52 | ids[idx] = db.nextId
53 | db.nextId += 1
54 | for key, value := range record {
55 | _, ok := fields[key]
56 | if !ok {
57 | fields[key] = make([]float32, 0, 64)
58 | }
59 | fields[key] = append(fields[key], value)
60 | }
61 | }
62 | return ids, nil
63 | }
64 |
65 | func (db *MemoryScoreDb) FieldDocItr(fieldName string) DocItr {
66 | scores := db.Fields[fieldName]
67 | return NewMemoryScoreDocItr(scores)
68 | }
69 |
70 | func NewMemoryScoreDocItr(scores []float32) *MemoryScoreDocItr {
71 | min, max := float32(math.Inf(1)), float32(math.Inf(-1))
72 | for _, score := range scores {
73 | if score < min {
74 | min = score
75 | }
76 | if score > max {
77 | max = score
78 | }
79 | }
80 | return &MemoryScoreDocItr{
81 | scores: scores,
82 | idx: -1,
83 | min: min,
84 | max: max,
85 | }
86 | }
87 |
88 | type MemoryScoreDocItr struct {
89 | scores []float32
90 | idx int
91 | min, max float32
92 | }
93 |
94 | func (op *MemoryScoreDocItr) Name() string { return "MemoryScoreDocItr" }
95 | func (op *MemoryScoreDocItr) Cur() (int64, float32) {
96 | idx := op.idx
97 | if idx < 0 || idx >= len(op.scores) {
98 | return -1, 0.0
99 | }
100 | return int64(idx + 1), op.scores[idx]
101 |
102 | }
103 | func (op *MemoryScoreDocItr) GetBounds() (min, max float32) {
104 | return op.min, op.max
105 | }
106 | func (op *MemoryScoreDocItr) SetBounds(min, max float32) bool {
107 | op.min = Max(op.min, min)
108 | op.max = Min(op.max, max)
109 | return true
110 | }
111 |
112 | func (op *MemoryScoreDocItr) Close() {
113 | }
114 |
115 | func (op *MemoryScoreDocItr) Next(minId int64) bool {
116 | if minId == 0 {
117 | minId = 1
118 | }
119 | op.idx = int(minId - 1)
120 | return op.idx < len(op.scores)
121 | }
122 |
--------------------------------------------------------------------------------
/memorydb_test.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestMemoryScoreDb(t *testing.T) {
8 | db := BaseDb{StreamingDb: BaseStreamingDb{NewMemoryScoreDb()}, IdDb: NewMemoryIdDb()}
9 | DbBasicsTest(db, t)
10 | }
11 |
--------------------------------------------------------------------------------
/memorydocitr.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "math"
5 | )
6 |
7 | type MemoryDocItr struct {
8 | score float32
9 | docId int64
10 | min, max float32
11 |
12 | scores []float32
13 | docs []int64
14 | index int
15 | }
16 |
17 | func NewMemoryDocItr(scores []float32, docs []int64) *MemoryDocItr {
18 | return &MemoryDocItr{
19 | score: 0.0,
20 | docId: -1,
21 | min: float32(math.Inf(-1)),
22 | max: float32(math.Inf(1)),
23 | scores: scores,
24 | docs: docs,
25 | index: -1,
26 | }
27 | }
28 | func (op *MemoryDocItr) Cur() (int64, float32) {
29 | return op.docId, op.score
30 | }
31 | func (op *MemoryDocItr) GetBounds() (min, max float32) { return op.min, op.max }
32 | func (op *MemoryDocItr) SetBounds(min, max float32) bool {
33 | if min > op.max || max < op.min {
34 | return false
35 | }
36 | if min > op.min {
37 | op.min = min
38 | }
39 | if max < op.max {
40 | op.max = max
41 | }
42 | return true
43 | }
44 | func (op *MemoryDocItr) Name() string { return "MemoryDocItr" }
45 | func (op *MemoryDocItr) Close() {}
46 | func (op *MemoryDocItr) Next(minId int64) bool {
47 | for {
48 | op.index += 1
49 | index := op.index
50 | if index >= len(op.docs) {
51 | return false
52 | }
53 | docId := op.docs[index]
54 | if docId >= minId {
55 | score := op.scores[index]
56 | if score >= op.min && score <= op.max {
57 | op.score = score
58 | op.docId = op.docs[index]
59 | return true
60 | }
61 | }
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/migratabledb.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "fmt"
5 | "time"
6 | )
7 |
8 | type MigratableDb struct {
9 | Current Db
10 | }
11 |
12 | func (db *MigratableDb) BulkIndex(records []Record) error {
13 | return db.Current.BulkIndex(records)
14 | }
15 |
16 | func (db *MigratableDb) Index(id string, values map[string]float32) error {
17 | return db.Current.Index(id, values)
18 | }
19 |
20 | func (db *MigratableDb) Query(query Query) (QueryResult, error) {
21 | fmt.Printf("Query versus %v at %v", db.Current, time.Now().Unix())
22 | return db.Current.Query(query)
23 | }
24 |
--------------------------------------------------------------------------------
/mindocitr.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "fmt"
5 | "sort"
6 | )
7 |
8 | type MinComponents []DocItr
9 |
10 | func (a MinComponents) Len() int { return len(a) }
11 | func (a MinComponents) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
12 | func (a MinComponents) Less(i, j int) bool {
13 | min1, max1 := a[i].GetBounds()
14 | min2, max2 := a[j].GetBounds()
15 | return max1-min1 > max2-min2
16 | }
17 |
18 | type MinDocItr struct {
19 | score float32
20 | docId int64
21 | min, max float32
22 | parts MinComponents
23 | }
24 |
25 | func NewMinDocItr(itrs []DocItr) *MinDocItr {
26 | min, max := float32(0.0), float32(0.0)
27 | components := make(MinComponents, len(itrs))
28 | for idx, part := range itrs {
29 | curMin, curMax := part.GetBounds()
30 | components[idx] = part
31 | if idx == 0 {
32 | min, max = curMin, curMax
33 | } else {
34 | min = Min(min, curMin)
35 | max = Min(max, curMax)
36 | }
37 | }
38 | sort.Sort(components)
39 | return &MinDocItr{
40 | score: 0.0,
41 | docId: -1,
42 | min: min,
43 | max: max,
44 | parts: components,
45 | }
46 | }
47 |
48 | func (op *MinDocItr) Name() string { return "MinDocItr" }
49 | func (op *MinDocItr) Cur() (int64, float32) {
50 | return op.docId, op.score
51 | }
52 | func (op *MinDocItr) GetBounds() (min, max float32) { return op.min, op.max }
53 | func (op *MinDocItr) Close() {
54 | for _, part := range op.parts {
55 | part.Close()
56 | }
57 | }
58 |
59 | func (op *MinDocItr) Next(minId int64) bool {
60 | min, max := op.min, op.max
61 | keepGoing := true
62 | var score float32
63 | for keepGoing {
64 | //fmt.Printf("MinDocItr Next itr (%v) [%v:%v] = %v score:%v\n", minId, op.min, op.max, op.docId, score)
65 | keepGoing = false
66 | score = PositiveInfinity
67 | for _, part := range op.parts {
68 | var curDocId int64
69 | var curScore float32
70 | for {
71 | curDocId, curScore = part.Cur()
72 | if curDocId >= minId {
73 | break
74 | }
75 | if !part.Next(minId) {
76 | return false
77 | }
78 | }
79 | if curDocId > minId {
80 | minId = curDocId
81 | keepGoing = true
82 | break
83 | }
84 | score = Min(score, curScore)
85 | }
86 | if !keepGoing {
87 | if score < min || score > max {
88 | minId += 1
89 | keepGoing = true
90 | }
91 | }
92 | }
93 | op.docId = minId
94 | op.score = score
95 | //fmt.Printf("MinDocItr Next(%v) [%v:%v] = %v score:%v\n", minId, op.min, op.max, op.docId, score)
96 | return true
97 | }
98 |
99 | func (op *MinDocItr) SetBounds(min, max float32) bool {
100 | fmt.Printf("MinDocItr SetBounds %v %v\n", min, max)
101 | op.min = min
102 | for _, component := range op.parts {
103 | curMin, curMax := component.GetBounds()
104 | if curMin < min {
105 | //fmt.Printf("MinDocItr SetBounds for component %v %v\n", min, curMax)
106 | if !component.SetBounds(min, curMax) {
107 | return false
108 | }
109 | }
110 | }
111 | return true
112 | }
113 |
--------------------------------------------------------------------------------
/powdocitr.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "math"
5 | )
6 |
7 | // Takes a constant power of a value.
8 | // Important: for bounds calculation reasons, assumes only positive values are provided as inputs!
9 | type PowDocItr struct {
10 | exp, oneOverExp float32
11 | itr DocItr
12 | }
13 |
14 | func NewPowDocItr(itr DocItr, exp float32) *PowDocItr {
15 | return &PowDocItr{exp: exp, oneOverExp: 1.0 / exp, itr: itr}
16 | }
17 |
18 | func Pow(val, exp float32) float32 {
19 | return float32(math.Pow(float64(val), float64(exp)))
20 | }
21 |
22 | func (op *PowDocItr) Name() string { return "PowDocItr" }
23 | func (op *PowDocItr) Cur() (int64, float32) {
24 | docId, score := op.itr.Cur()
25 | return docId, Pow(score, op.exp)
26 | }
27 | func (op *PowDocItr) Close() {
28 | op.itr.Close()
29 | }
30 | func (op *PowDocItr) Next(minId int64) bool {
31 | ret := op.itr.Next(minId)
32 | return ret
33 | }
34 | func (op *PowDocItr) GetBounds() (min, max float32) {
35 | exp := op.exp
36 | min, max = op.itr.GetBounds()
37 | v1 := Pow(min, exp)
38 | v2 := Pow(max, exp)
39 | if v1 < v2 {
40 | return v1, v2
41 | } else {
42 | return v2, v1
43 | }
44 | }
45 | func (op *PowDocItr) SetBounds(min, max float32) bool {
46 | min = Max(0, min)
47 | max = Max(0, max)
48 | oneOverExp := op.oneOverExp
49 | v1 := Pow(min, oneOverExp)
50 | v2 := Pow(max, oneOverExp)
51 | if v1 < v2 {
52 | return op.itr.SetBounds(v1, v2)
53 | } else {
54 | return op.itr.SetBounds(v2, v1)
55 | }
56 | }
57 |
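58 | // Worked example (a sketch): with exp = 2, SetBounds(4, 25) maps back through
59 | // oneOverExp = 0.5 to input bounds [2, 5]; with exp = 0.5 (the square root
60 | // used by geo_distance), SetBounds(2, 5) maps back to [4, 25]. Requested
61 | // bounds are clamped at zero first, which is why this iterator assumes
62 | // non-negative inputs.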
--------------------------------------------------------------------------------
/productdocitr.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "sort"
5 | )
6 |
7 | type ProductComponents []DocItr
8 |
9 | func (a ProductComponents) Len() int { return len(a) }
10 | func (a ProductComponents) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
11 | func (a ProductComponents) Less(i, j int) bool {
12 | min1, max1 := a[i].GetBounds()
13 | min2, max2 := a[j].GetBounds()
14 | return max1-min1 > max2-min2
15 | }
16 |
17 | type ProductDocItr struct {
18 | score float32
19 | docId int64
20 | min, max float32
21 | parts ProductComponents
22 | }
23 |
24 | func NewProductDocItr(itrs []DocItr) *ProductDocItr {
25 | min, max := float32(0.0), float32(0.0)
26 | components := make(ProductComponents, len(itrs))
27 | for idx, part := range itrs {
28 | curMin, curMax := part.GetBounds()
29 | //fmt.Printf("Init %v %v %v\n", idx, curMin, curMax)
30 | components[idx] = part
31 | if idx == 0 {
32 | min, max = curMin, curMax
33 | } else {
34 | // assumes positive inputs:
35 | min *= curMin
36 | max *= curMax
37 | }
38 | }
39 | sort.Sort(components)
40 | return &ProductDocItr{
41 | score: 0.0,
42 | docId: -1,
43 | min: min,
44 | max: max,
45 | parts: components,
46 | }
47 | }
48 |
49 | func (op *ProductDocItr) Name() string { return "ProductDocItr" }
50 | func (op *ProductDocItr) Cur() (int64, float32) {
51 | return op.docId, op.score
52 | }
53 | func (op *ProductDocItr) GetBounds() (min, max float32) { return op.min, op.max }
54 | func (op *ProductDocItr) Close() {
55 | for _, part := range op.parts {
56 | part.Close()
57 | }
58 | }
59 | func (op *ProductDocItr) Next(minId int64) bool {
60 | min, max := op.min, op.max
61 | keepGoing := true
62 | var score float32
63 | for keepGoing {
64 | //fmt.Printf("ProductDocItr Next itr (%v) [%v:%v] = %v score:%v\n", minId, op.min, op.max, op.docId, score)
65 | keepGoing = false
66 | score = float32(1.0)
67 | for _, part := range op.parts {
68 | var curDocId int64
69 | var curScore float32
70 | for {
71 | curDocId, curScore = part.Cur()
72 | if curDocId >= minId {
73 | break
74 | }
75 | if !part.Next(minId) {
76 | return false
77 | }
78 | }
79 | if curDocId > minId {
80 | minId = curDocId
81 | keepGoing = true
82 | break
83 | }
84 | score *= curScore
85 | }
86 | if !keepGoing {
87 | if score < min || score > max {
88 | minId += 1
89 | keepGoing = true
90 | }
91 | }
92 | }
93 | op.docId = minId
94 | op.score = score
95 | //fmt.Printf("ProductDocItr Next(%v) [%v:%v] = %v score:%v\n", minId, op.min, op.max, op.docId, score)
96 | return true
97 | }
98 |
99 | func (op *ProductDocItr) SetBounds(min, max float32) bool {
100 | //fmt.Printf("ProductDocItr SetBounds %v %v\n", min, max)
101 | op.min = min
102 | op.max = max
103 |
104 | for curfield, component := range op.parts {
105 | newMin, newMax := min, max
106 | for otherfactor, otherComponent := range op.parts {
107 | // Then divide by the other maxes or mins
108 | if curfield != otherfactor {
109 | otherMin, otherMax := otherComponent.GetBounds()
110 | if otherMax == 0.0 {
111 | newMin = 0.0
112 | } else {
113 | newMin /= otherMax
114 | }
115 | if otherMin == 0.0 {
116 | newMax = PositiveInfinity
117 | } else {
118 | newMax /= otherMin
119 | }
120 | }
121 | }
122 | curMin, curMax := component.GetBounds()
123 | if newMin < curMin {
124 | newMin = curMin
125 | }
126 | if newMax > curMax {
127 | newMax = curMax
128 | }
129 | if newMin != curMin || newMax != curMax {
130 | //fmt.Printf("ProductDocItr SetBounds for component %v %v\n", newMin, newMax)
131 | component.SetBounds(newMin, newMax)
132 | }
133 | }
134 | return true
135 | }
136 |
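137 | // Worked example (a sketch mirroring the test below): with components bounded
138 | // by [0.2, 0.8] and [0.0, 1.0], SetBounds(0.5, 1.0) gives the first a newMin
139 | // of 0.5 / 1.0 = 0.5 and the second a newMin of 0.5 / 0.8 = 0.625 (each
140 | // divides by its partner's max). The [0.2, 0.8] component's newMax divides by
141 | // a partner min of 0.0, so the guard saturates it to PositiveInfinity and it
142 | // is clamped back down to 0.8.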
--------------------------------------------------------------------------------
/productdocitr_test.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestProductDocItr(t *testing.T) {
8 | i1 := NewMemoryScoreDocItr([]float32{0.2, 0.8, 0.5})
9 | i2 := NewMemoryScoreDocItr([]float32{1.0, 0.0, 0.7})
10 | itr := NewProductDocItr([]DocItr{i1, i2})
11 |
12 | if itr.min != 0.0 {
13 | t.Fatalf("%v", itr.min)
14 | }
15 | if itr.max != 0.8 {
16 | t.Fatalf("%v", itr.max)
17 | }
18 |
19 | itr.SetBounds(0.5, 1.0)
20 |
21 | min1, _ := i1.GetBounds()
22 | min2, _ := i2.GetBounds()
23 |
24 | if min1 != 0.5 {
25 | t.Fatalf("%v", min1)
26 | }
27 | if Abs(min2*0.8-0.5) > 0.001 { // expect min2 ~= 0.5/0.8 = 0.625
28 | t.Fatalf("%v", min2)
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/scale_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pschanely/scoredb/57beea075b4b5a53ee0a27b9752a0ca544c4510d/scale_performance.png
--------------------------------------------------------------------------------
/scaledocitr.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import ()
4 |
5 | // Multiplies a value by a constant
6 | type ScaleDocItr struct {
7 | factor float32
8 | docItr DocItr
9 | }
10 |
11 | func (op *ScaleDocItr) Name() string { return "ScaleDocItr" }
12 | func (op *ScaleDocItr) Cur() (int64, float32) {
13 | docId, score := op.docItr.Cur()
14 | return docId, score * op.factor
15 | }
16 | func (op *ScaleDocItr) GetBounds() (min, max float32) {
17 | min, max = op.docItr.GetBounds()
18 | factor := op.factor
19 | if factor >= 0 {
20 | return min * op.factor, max * op.factor
21 | } else {
22 | return max * op.factor, min * op.factor
23 | }
24 | }
25 | func (op *ScaleDocItr) Close() {
26 | op.docItr.Close()
27 | }
28 | func (op *ScaleDocItr) Next(minId int64) bool {
29 | return op.docItr.Next(minId)
30 | }
31 |
32 | func (op *ScaleDocItr) SetBounds(min, max float32) bool {
33 | factor := op.factor
34 | if factor >= 0 {
35 | return op.docItr.SetBounds(min/op.factor, max/op.factor)
36 | } else {
37 | return op.docItr.SetBounds(max/op.factor, min/op.factor)
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/scoredb/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bufio"
5 | "encoding/json"
6 | "flag"
7 | "fmt"
8 | "github.com/pschanely/scoredb"
9 | "log"
10 | "os"
11 | "path"
12 | "runtime"
13 | "strings"
14 | "time"
15 | )
16 |
17 | func MakeStandardDb(dataDir string, numShards int) (*scoredb.BaseDb, error) {
18 | var shards []scoredb.StreamingDb
19 |
20 | if scoredb.Exists(dataDir) && scoredb.Exists(path.Join(dataDir, "shard.0")) {
21 | i := 0
22 | shards = make([]scoredb.StreamingDb, 0, numShards)
23 | for {
24 | shardDir := path.Join(dataDir, fmt.Sprintf("shard.%d", i))
25 | if scoredb.Exists(shardDir) {
26 | shards = append(shards, scoredb.BaseStreamingDb{scoredb.NewFsScoreDb(shardDir)})
27 | } else {
28 | break
29 | }
30 | i += 1
31 | }
32 | } else {
33 | shards = make([]scoredb.StreamingDb, numShards)
34 | for i := range shards {
35 | shardDir := path.Join(dataDir, fmt.Sprintf("shard.%d", i))
36 | shards[i] = scoredb.BaseStreamingDb{scoredb.NewFsScoreDb(shardDir)}
37 | }
38 | }
39 | idDb, err := scoredb.NewBoltIdDb(path.Join(dataDir, "iddb"))
40 | if err != nil {
41 | return nil, err
42 | }
43 | return &scoredb.BaseDb{
44 | StreamingDb: scoredb.ShardedDb{
45 | Shards: shards,
46 | },
47 | IdDb: idDb,
48 | }, nil
49 | }
50 |
51 | func watchDir(db *scoredb.MigratableDb, baseDir string, namePrefix string) {
52 | log.Printf("Watching for databases at %s%s*\n", baseDir, namePrefix)
53 | var lastName = ""
54 | for {
55 | dir, err := os.Open(baseDir)
56 | var fileInfos []os.FileInfo
57 | if err == nil {
58 | fileInfos, err = dir.Readdir(0)
59 | dir.Close()
60 | }
61 | if err != nil {
62 | log.Printf("Unable to read %v: %v\n", dir, err)
63 | time.Sleep(55 * time.Second)
64 | } else {
65 | var newDbName = ""
66 | for _, fileInfo := range fileInfos {
67 | name := fileInfo.Name()
68 | if strings.HasPrefix(name, namePrefix) {
69 | if name > newDbName {
70 | newDbName = name
71 | }
72 | }
73 | }
74 | if newDbName > lastName {
75 | fmt.Printf("Detected database at %s%s\n", baseDir, newDbName)
76 | fullDbName := path.Join(baseDir, newDbName)
77 | newDb, err := MakeStandardDb(fullDbName, 1)
78 | if err != nil {
79 | log.Printf("Unable to load database at %s%s (%v); ignoring\n", dir, fullDbName, err)
80 | } else {
81 | fmt.Printf("The database at %s%s is live at %v\n", baseDir, fullDbName, time.Now().Unix())
82 | db.Current = newDb
83 | lastName = newDbName
84 | }
85 | }
86 | }
87 | time.Sleep(10 * time.Second)
88 | }
89 | }
90 |
91 | func SetupDirLoading(databaseDir string) *scoredb.MigratableDb {
92 | migratable := scoredb.MigratableDb{Current: nil}
93 | baseDir, namePrefix := path.Split(databaseDir)
94 | fmt.Printf("Watching for new databases named %s* in %s\n", namePrefix, baseDir)
95 | go watchDir(&migratable, baseDir, namePrefix)
96 | return &migratable
97 | }
98 |
99 | func main() {
100 |
101 | serveCommand := flag.NewFlagSet("serve", flag.ExitOnError)
102 | servePort := serveCommand.Int("port", 11625, "listening port in http mode, defaults to 11625")
103 | serveIntf := serveCommand.String("interface", "", "network interface to listen on in http mode, defaults to empty string (any interface)")
104 | serveDataDir := serveCommand.String("datadir", "./data", "Storage directory for database")
105 | serveNumShards := serveCommand.Int("numshards", 4, "Number of shards")
106 | serveReadOnly := serveCommand.Bool("readonly", false, "Only allow GET requests")
107 | serveAutoMigrate := serveCommand.Bool("automigrate", false, "When new directories appear matching <datadir>*, atomically swap in the database at that directory (lexicographically last)")
108 |
109 | loadCommand := flag.NewFlagSet("load", flag.ExitOnError)
110 | loadDataDir := loadCommand.String("datadir", "./data", "Storage directory for database")
111 | loadNumShards := loadCommand.Int("numshards", 4, "Number of shards (ignored if db already exists)")
112 |
113 | benchCommand := flag.NewFlagSet("benchmark", flag.ExitOnError)
114 | benchCsvFilename := benchCommand.String("csv", "", "csv filename of census data")
115 | benchMaxRecords := benchCommand.Int64("maxrecords", 1000*1000, "Maximum size of database to benchmark (in # of records)")
116 | benchCsvOutput := benchCommand.String("out", "output.csv", "csv of performance data to output")
117 | benchEsUrl := benchCommand.String("esurl", "http://localhost:9200/", "URL of elasticsearch instance")
118 | benchEsIndex := benchCommand.String("esindex", "benchmark_scoredb", "Index name to use for elasticsearch")
119 | benchFsDataDir := benchCommand.String("fsdatadir", "./benchmark_data", "Storage directory for native scoredb database")
120 |
121 | /*
122 | for cmd := range([]*flag.FlagSet{serveCommand, benchCommand}) {
123 | // common args here
124 | }
125 | */
126 |
127 | if len(os.Args) <= 1 {
128 | fmt.Println("usage: scoredb []")
129 | fmt.Println("Commands:")
130 | fmt.Println(" serve Run a scoredb server")
131 | fmt.Println(" load Load json lines from stdin")
132 | fmt.Println(" benchmark Run performance benchmarks")
133 | fmt.Println("For more help, run scoredb -h")
134 | os.Exit(1)
135 | }
136 | var db scoredb.Db
137 | var err error
138 | switch os.Args[1] {
139 | case "serve":
140 | serveCommand.Parse(os.Args[2:])
141 | if *serveAutoMigrate {
142 | db = SetupDirLoading(*serveDataDir)
143 | } else {
144 | db, err = MakeStandardDb(*serveDataDir, *serveNumShards)
145 | if err != nil {
146 | log.Fatalf("Failed to initialize database at %v: %v\n", *serveDataDir, err)
147 | }
148 | }
149 | addr := fmt.Sprintf("%s:%d", *serveIntf, *servePort)
150 | fmt.Printf("Serving on %s\n", addr)
151 | log.Fatal(scoredb.ServeHttp(addr, db, *serveReadOnly))
152 | case "load":
153 | loadCommand.Parse(os.Args[2:])
154 | db, err := MakeStandardDb(*loadDataDir, *loadNumShards)
155 | if err != nil {
156 | log.Fatal(fmt.Sprintf("Failed to initialize database at %v: %v\n", *loadDataDir, err))
157 | }
158 | scanner := bufio.NewScanner(os.Stdin)
159 | batchSize := 200
160 | batchIndex := 0
161 | var batch = make([]scoredb.Record, batchSize)
162 | for scanner.Scan() {
163 | record := scoredb.Record{}
164 | line := scanner.Bytes()
165 | json.Unmarshal(line, &record)
166 | batch[batchIndex] = record
167 | batchIndex += 1
168 | if batchIndex >= batchSize {
169 | db.BulkIndex(batch)
170 | batchIndex = 0
171 | batch = make([]scoredb.Record, batchSize)
172 | }
173 | }
174 | if batchIndex > 0 {
175 | db.BulkIndex(batch[:batchIndex])
176 | }
177 | case "benchmark":
178 | benchCommand.Parse(os.Args[2:]) // parse flags before the -out path is used
179 | runtime.GOMAXPROCS(runtime.NumCPU())
180 | outputFd, err := os.Create(*benchCsvOutput)
181 | if err != nil {
182 | log.Fatal(fmt.Sprintf("Failed to create output csv file at %v: %v\n", *benchCsvOutput, err))
183 | }
184 |
185 | esDb := &scoredb.EsScoreDb{BaseURL: *benchEsUrl, Index: *benchEsIndex}
186 | fsDb, err := MakeStandardDb(*benchFsDataDir, 4)
187 | if err != nil {
188 | 			log.Fatalf("Failed to initialize database at %v: %v\n", *benchFsDataDir, err)
189 | }
190 | if !scoredb.Exists(*benchCsvFilename) {
191 | 			log.Fatalf("Cannot find source csv data file at '%s'", *benchCsvFilename)
192 | }
193 |
194 | fmt.Printf("Running es benchmarks\n")
195 | esDb.DeleteIndex()
196 | esDb.CreateIndex()
197 | counts, esIndexTimes, esQueryTimes, err := scoredb.RunBenchmark(esDb, *benchCsvFilename, *benchMaxRecords)
198 | //esDb.DeleteIndex()
199 | if err != nil {
200 | 			log.Fatalf("Failed to run es benchmark: %v\n", err)
201 | }
202 |
203 | fmt.Printf("Running native benchmarks\n")
204 | _, fsIndexTimes, fsQueryTimes, err := scoredb.RunBenchmark(fsDb, *benchCsvFilename, *benchMaxRecords)
205 | if err != nil {
206 | 			log.Fatalf("Failed to run native benchmark: %v\n", err)
207 | }
208 |
209 | fmt.Fprintf(outputFd, "records,es_index,native_index,es_query_1,native_query_1,es_query_2,native_query_2\n")
210 | for idx := 0; idx < len(esIndexTimes); idx++ {
211 | fmt.Fprintf(outputFd, "%v,%v,%v", counts[idx], esIndexTimes[idx], fsIndexTimes[idx])
212 | for idx2 := 0; idx2 < len(esQueryTimes[idx]); idx2++ {
213 | fmt.Fprintf(outputFd, ",%v,%v", esQueryTimes[idx][idx2], fsQueryTimes[idx][idx2])
214 | }
215 | fmt.Fprintf(outputFd, "\n")
216 | }
217 | outputFd.Close()
218 | default:
219 | fmt.Printf("%q is not valid command.\n", os.Args[1])
220 | os.Exit(2)
221 | }
222 | }
223 |
--------------------------------------------------------------------------------
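A note on the `load` path above: it reads one JSON object per line from stdin, unmarshals each into a `scoredb.Record`, and flushes batches of 200 through `BulkIndex`. An invocation would look like the following (`records.jsonl` is a placeholder; the JSON field names depend on `scoredb.Record`'s tags, which are defined in db.go):

    scoredb load -datadir ./data -numshards 4 < records.jsonl
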
/shardeddb.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "fmt"
5 | "math"
6 | "math/rand"
7 | )
8 |
9 | type ShardedDb struct {
10 | Shards []StreamingDb
11 | }
12 |
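// reservedShardBits bounds the shard count: maxShards below is
// (1 << reservedShardBits) - 1. Presumably these bits of each external doc
// id carry the shard number (see the hedged sketch after this file).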
13 | var reservedShardBits = uint(14)
14 |
15 | func NewShardedDb(shards []StreamingDb) (*ShardedDb, error) {
16 | maxShards := (1 << reservedShardBits) - 1
17 | 	if len(shards) >= 1<<reservedShardBits {
81 | 			if score <= bounds.min || score >= bounds.max {
82 | continue
83 | }
84 | resultChannel <- CandidateResult{DocId: docId, Score: score, WorkerNum: myWorkerNum}
85 | /*
86 | select {
87 | case newBounds, ok := <- boundsChannel:
88 | if ok {
89 | if bounds != newBounds {
90 | bounds = newBounds
91 | itr.SetBounds(bounds.min, bounds.max)
92 | }
93 | }
94 | }
95 | */
96 |
97 | newBounds := <-boundsChannel
98 |
99 | if bounds != newBounds {
100 | bounds = newBounds
101 | itr.SetBounds(bounds.min, bounds.max)
102 | }
103 |
104 | }
105 | itr.Close()
106 | resultChannel <- CandidateResult{DocId: -1}
107 | }
108 |
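// NewParallelDocItr fans the per-shard iterators out to one goroutine each
// (RunItr, above). Every worker publishes a candidate on ResultChannel and
// then blocks on its bounds channel until Next replies with the freshest
// Bounds, so any tightening via SetBounds reaches each shard iterator in
// lockstep with its next candidate.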
109 | func NewParallelDocItr(parts []DocItr) *ParallelDocItr {
110 | op := ParallelDocItr{
111 | score: 0.0,
112 | docId: -1,
113 | NumAlive: len(parts),
114 | Bounds: Bounds{min: float32(math.Inf(-1)), max: float32(math.Inf(1))},
115 | ResultChannel: make(chan CandidateResult),
116 | Comms: make([](chan Bounds), len(parts)),
117 | }
118 | for idx, part := range parts {
119 | part := part
120 | curMin, curMax := part.GetBounds()
121 | op.Bounds.min = Min(op.Bounds.min, curMin)
122 | op.Bounds.max = Max(op.Bounds.max, curMax)
123 | boundsChannel := make(chan Bounds)
124 | op.Comms[idx] = boundsChannel
125 | go RunItr(part, idx, op.ResultChannel, boundsChannel)
126 | }
127 | return &op
128 | }
129 |
130 | func (op *ParallelDocItr) Name() string {
131 | return "ParallelDocItr"
132 | }
133 |
134 | func (op *ParallelDocItr) SetBounds(min, max float32) bool {
135 | op.Bounds.min, op.Bounds.max = min, max
136 | return true
137 | }
138 |
139 | func (op *ParallelDocItr) GetBounds() (min, max float32) {
140 | return op.Bounds.min, op.Bounds.max
141 | }
142 |
143 | func (op *ParallelDocItr) Next(minId int64) bool {
144 | for {
145 | result := <-op.ResultChannel
146 | if result.DocId == -1 {
147 | op.NumAlive -= 1
148 | if op.NumAlive <= 0 {
149 | return false
150 | }
151 | } else {
152 | workerNum := result.WorkerNum
153 | if result.Score > op.Bounds.min && result.Score < op.Bounds.max {
154 | op.docId = ShardIdToExt(result.DocId, workerNum)
155 | op.score = result.Score
156 | op.Comms[workerNum] <- op.Bounds
157 | return true
158 | } else {
159 | op.Comms[workerNum] <- op.Bounds
160 | }
161 | }
162 | }
163 | }
164 |
165 | func (op *ParallelDocItr) Close() {} // unsure... (note: worker goroutines are not stopped; abandoned workers stay parked sending on ResultChannel)
166 |
167 | func (op *ParallelDocItr) Cur() (int64, float32) {
168 | return op.docId, op.score
169 | }
170 |
--------------------------------------------------------------------------------
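ShardIdToExt (used in Next above) and its inverse fall in the part of shardeddb.go that did not survive extraction, so here is a minimal, hypothetical sketch of the packing implied by reservedShardBits: the shard number stored in the low 14 bits of the external doc id. The names shardIdToExt/extToShardId and the exact layout are assumptions, not the repository's definitions:

    package main

    import "fmt"

    // Mirrors reservedShardBits in shardeddb.go.
    const reservedShardBits = 14

    // shardIdToExt packs a shard-local doc id and a shard number into one
    // external id, assuming the shard number occupies the low bits.
    func shardIdToExt(docId int64, shardNum int) int64 {
    	return docId<<reservedShardBits | int64(shardNum)
    }

    // extToShardId is the assumed inverse.
    func extToShardId(extId int64) (docId int64, shardNum int) {
    	return extId >> reservedShardBits, int(extId & (1<<reservedShardBits - 1))
    }

    func main() {
    	ext := shardIdToExt(97, 3)
    	docId, shard := extToShardId(ext)
    	fmt.Println(ext, docId, shard) // 1589251 97 3
    }

Under a layout like this, a ShardedDb tops out at 2^14 - 1 shards, which matches the maxShards check in NewShardedDb.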
/shardeddb_test.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestShardedDb(t *testing.T) {
8 | pathmaker := RmAllTestData()
9 | defer RmAllTestData()
10 | idDb, err := NewBoltIdDb(pathmaker("shard_ids"))
11 | if err != nil {
12 | t.Fatal(err)
13 | }
14 | db := BaseDb{
15 | StreamingDb: ShardedDb{
16 | Shards: []StreamingDb{
17 | BaseStreamingDb{NewFsScoreDb(pathmaker("shard_1"))},
18 | BaseStreamingDb{NewFsScoreDb(pathmaker("shard_2"))},
19 | },
20 | },
21 | IdDb: idDb,
22 | }
23 | DbBasicsTest(db, t)
24 | }
25 |
--------------------------------------------------------------------------------
/stub.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
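// StubDb looks to be a do-nothing Db implementation: sequential ids and
// canned query results, presumably for exercising the HTTP plumbing in
// tests and benchmarks.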
3 | type StubDb struct {
4 | idx int64
5 | }
6 |
7 | func (sdb *StubDb) Index(record map[string]float32) (int64, error) {
8 | sdb.idx += 1
9 | return sdb.idx, nil
10 | }
11 |
12 | func (sdb *StubDb) BulkIndex(records []map[string]float32) ([]int64, error) {
13 | ids := make([]int64, len(records))
14 | 	for i := range records {
15 | sdb.idx++
16 | ids[i] = sdb.idx
17 | }
18 | return ids, nil
19 | }
20 |
21 | func (db *StubDb) Query(query Query) (QueryResult, error) {
22 | return QueryResult{Ids: []string{"7", "42"}}, nil
23 | }
24 |
25 | func (db *StubDb) LinearQuery(numResults int, coefs map[string]float32) []string {
26 | return []string{"7", "42"}
27 | }
28 |
--------------------------------------------------------------------------------
/sumdocitr.go:
--------------------------------------------------------------------------------
1 | package scoredb
2 |
3 | import (
4 | "math"
5 | "sort"
6 | )
7 |
8 | type SumComponent struct {
9 | docItr DocItr
10 | scoreRange float32
11 | }
12 |
13 | type SumComponents []SumComponent
14 |
15 | func (a SumComponents) Len() int { return len(a) }
16 | func (a SumComponents) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
17 | func (a SumComponents) Less(i, j int) bool { return a[i].scoreRange > a[j].scoreRange } // sort widest score range first
18 |
19 | type SumDocItr struct {
20 | score float32
21 | docId int64
22 | min, max float32
23 | parts []SumComponent
24 | }
25 |
26 | func NewSumDocItr(itrs []DocItr) *SumDocItr {
27 | min, max := float32(0.0), float32(0.0)
28 | components := make(SumComponents, len(itrs))
29 | for idx, part := range itrs {
30 | curMin, curMax := part.GetBounds()
31 | components[idx].docItr = part
32 | components[idx].scoreRange = float32(math.Abs(float64(curMax - curMin)))
33 | min += curMin
34 | max += curMax
35 | }
36 | sort.Sort(components)
37 | return &SumDocItr{
38 | score: 0.0,
39 | docId: -1,
40 | min: min,
41 | max: max,
42 | parts: components,
43 | }
44 | }
45 |
46 | func (op *SumDocItr) Name() string { return "SumDocItr" }
47 | func (op *SumDocItr) Cur() (int64, float32) {
48 | return op.docId, op.score
49 | }
50 | func (op *SumDocItr) GetBounds() (min, max float32) { return op.min, op.max }
51 | func (op *SumDocItr) Close() {
52 | for _, part := range op.parts {
53 | part.docItr.Close()
54 | }
55 | }
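// Next advances all component iterators to a common docId >= minId,
// leapfrogging minId up to the furthest iterator whenever they disagree,
// then sums the per-component scores and skips any candidate whose sum
// falls outside [op.min, op.max].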
56 | func (op *SumDocItr) Next(minId int64) bool {
57 | min, max := op.min, op.max
58 | keepGoing := true
59 | var score float32
60 | for keepGoing {
61 | keepGoing = false
62 | score = float32(0.0)
63 | for _, part := range op.parts {
64 | var curDocId int64
65 | var curScore float32
66 | for {
67 | curDocId, curScore = part.docItr.Cur()
68 | if curDocId >= minId {
69 | break
70 | }
71 | if !part.docItr.Next(minId) {
72 | return false
73 | }
74 | }
75 | if curDocId > minId {
76 | minId = curDocId
77 | keepGoing = true
78 | break
79 | }
80 | score += curScore
81 | }
82 | if !keepGoing {
83 | if score < min || score > max {
84 | minId += 1
85 | keepGoing = true
86 | }
87 | }
88 | }
89 | op.docId = minId
90 | op.score = score
91 | //fmt.Printf("SumDocItr Next(%v) [%v:%v] = %v score:%v\n", minId, op.min, op.max, op.docId, score)
92 | return true
93 | }
94 |
95 | func (op *SumDocItr) SetBounds(min, max float32) bool {
96 | //fmt.Printf("SumDocItr SetBounds %v %v\n", min, max)
97 | op.min = min
98 | op.max = max
99 |
100 | for curfield, component := range op.parts {
101 | newMin, newMax := min, max
102 | // subtract out the ranges of all the other components (the remaining range will be mine)
103 | for otherfactor, otherComponent := range op.parts {
104 | //Then subtract the other maxes or mins
105 | if curfield != otherfactor {
106 | otherMin, otherMax := otherComponent.docItr.GetBounds()
107 | newMin -= otherMax
108 | newMax -= otherMin
109 | }
110 | }
111 | curMin, curMax := component.docItr.GetBounds()
112 | if newMin < curMin {
113 | newMin = curMin
114 | }
115 | if newMax > curMax {
116 | newMax = curMax
117 | }
118 | if newMin != curMin || newMax != curMax {
119 | //fmt.Printf("SumDocItr SetBounds for component %v %v\n", newMin, newMax)
120 | component.docItr.SetBounds(newMin, newMax)
121 | }
122 | }
123 | return true
124 | }
125 |
--------------------------------------------------------------------------------
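To make the interval arithmetic in SetBounds concrete, here is a small self-contained walk-through (the component bounds [0, 1] and [0, 10] and the target range [8, 11] are made up for illustration): subtracting the other components' maxes from the requested min, and their mins from the requested max, leaves the range this component alone must cover, which is then clamped to its own bounds.

    package main

    import "fmt"

    // Toy walk-through of the interval arithmetic in SumDocItr.SetBounds,
    // with two hypothetical components bounded by [0,1] and [0,10].
    func main() {
    	type bounds struct{ min, max float32 }
    	parts := []bounds{{0, 1}, {0, 10}}
    	// Suppose the caller only wants summed scores in [8, 11]:
    	min, max := float32(8), float32(11)
    	for i := range parts {
    		newMin, newMax := min, max
    		for j, other := range parts {
    			if i != j {
    				// Subtract the other components' extremes; what remains
    				// is the range this component alone must reach.
    				newMin -= other.max
    				newMax -= other.min
    			}
    		}
    		if newMin < parts[i].min { // clamp to the component's own bounds
    			newMin = parts[i].min
    		}
    		if newMax > parts[i].max {
    			newMax = parts[i].max
    		}
    		fmt.Printf("component %d narrows to [%v, %v]\n", i, newMin, newMax)
    	}
    	// Output:
    	// component 0 narrows to [0, 1]
    	// component 1 narrows to [7, 10]
    }

Narrowing the wide component to [7, 10] is what lets its underlying field iterator skip low-scoring regions entirely rather than visiting every document.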