├── .gitignore ├── LICENSE ├── README.md ├── bench.go ├── bitreader.go ├── bitreader_test.go ├── boltiddb.go ├── bucket_execution.png ├── customlineardocitr.go ├── customlineardocitr_test.go ├── custommapdocitr.go ├── custommapdocitr_test.go ├── dataset_tools ├── census_p_rec_gen.sh └── sample.csv ├── db.go ├── db_test.go ├── diffdocitr.go ├── docitr.go ├── elastic.go ├── fielddocitr.go ├── fielddocitr_test.go ├── fsscoredb.go ├── fsscoredb_test.go ├── http.go ├── memorydb.go ├── memorydb_test.go ├── memorydocitr.go ├── migratabledb.go ├── mindocitr.go ├── powdocitr.go ├── productdocitr.go ├── productdocitr_test.go ├── scale_performance.png ├── scaledocitr.go ├── scoredb └── main.go ├── shardeddb.go ├── shardeddb_test.go ├── stub.go └── sumdocitr.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | data 10 | 11 | # Architecture specific extensions/prefixes 12 | *.[568vq] 13 | [568vq].out 14 | 15 | *.cgo1.go 16 | *.cgo2.c 17 | _cgo_defun.c 18 | _cgo_gotypes.go 19 | _cgo_export.* 20 | 21 | _testmain.go 22 | 23 | *.exe 24 | *.test 25 | *.prof 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Phillip Schanely 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this 
permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scoredb 2 | 3 | A simple database index optimized for returning results by custom scoring functions. 4 | 5 | To my knowledge, it is the only open source system with an algorithm designed for this purpose; in some cases, it is faster than elasticsearch's implementation by an order of magnitude. (see below) 6 | 7 | # Why? 8 | 9 | Scoredb is optimized for systems that want to find the top scoring results, where the scoring function is specified by the client, 10 | and may depend on more than one field. 11 | It may be a good choice for any system that needs to incorporate multiple factors when returning results. 12 | For instance, it might power a used car website to produce results based on factors like mileage, year, and distance. 13 | 14 | 15 | # Run It 16 | 17 | Though Scoredb has a straightforward programatic interface, you can run a simple standalone HTTP server like so: 18 | 19 | ``` 20 | $ go get github.com/pschanely/scoredb 21 | $ go install github.com/pschanely/scoredb/... 22 | $ ${GOPATH}/bin/scoredb serve -datadir my_data_directory -port 11625 23 | ``` 24 | ... 
Though Scoredb has a straightforward programmatic interface, you can run a simple standalone HTTP server like so:
48 | 49 | The following graph shows bucket elimination over the course of an example query combining two fields, "age" and "wages": 50 | 51 | 52 | 53 | 54 | # Performance 55 | 56 | Few database systems support custom scoring functions, and fewer (possibly none?) use algorithms designed for that purpose. 57 | In practice, I've found elasticsearch's 58 | [custom scoring functions](https://www.elastic.co/guide/en/elasticsearch/reference/0.90/query-dsl-function-score-query.html#query-dsl-function-score-query) 59 | to be quite fast, so I've benchmarked against it here. Please let me know about other systems I might benchmark against! 60 | 61 | This is a graph of how 5 different queries perform with varying database sizes (yellow is elasticsearch and blue is scoredb): 62 | 63 | 64 | 65 | The elasticsearch query times (yellow) look like they're rising exponentially, but it's actually linear because the X-axis has a logarithmic scale. 66 | 67 | The dataset is anonymized US census data, each object representing an individual. These are the 5 scoring functions used for benchmarking, in order from fastest to slowest (for scoredb): 68 | 69 | ``` 70 | 10 * number_of_children + age 71 | 10000 * age + yearly_wages 72 | 100 * age + yearly_wages 73 | 40 * gender + weekly_work_hours 74 | 100.0 * gender + 9 * num_children + age + weekly_work_hours 75 | 5 * num_children + age + weekly_work_hours 76 | ``` 77 | 78 | This is an unscientific test! Just my personal laptop, [this datafile](http://millstonecw.com/censusdata.csv.bz2) repeated a few times over for the biggest datasets, and `scoredb benchmark -maxrecords 10000000 -csv censusdata.csv`. There's no substitute for testing with your own data, queries, and hardware. 79 | 80 | It's clear from the graph that scoredb's performance can vary significantly based on the scoring function. 
81 | Some guidance on scoring: 82 | 83 | * Prefer to combine fields with addition, multiplication, and, in particular, minimum, because they allow the computation of useful lower bounds. Combining fields with a max() function does not, because a bad value in one field can be completely overcome by a good value in another. 84 | * Combining many fields instead of a few will make the query take longer, because it takes longer to determine useful lower bounds on each field. 85 | * Prefer to engineer weights so that the contributions from each of your fields is similar in scale. Scoredb may never be able to find useful bounds on fields that tweak the final score very slightly. 86 | 87 | 88 | # Limitations 89 | 90 | Scoredb is minimalistic and highly specialized; it is intended to just act as one piece of a larger system: 91 | * Scoredb **has no delete or update operation**. To remove or change an object, you must build a new index. See below for how to swap a new index in under a running instance without downtime. 92 | * It stores objects as a flat set of key-value pairs with string keys and numeric values only. (internally, all values are 32 bit floating point values) 93 | * Scoredb can only respond to queries with lists of identifiers; scoredb's indexes do not provide efficient access to the original field data. 94 | * Scoredb has no built-in clustering, redundancy, or backup functions. 95 | * Adding objects to scoredb is slow if you add them one at a time. Bulk insertion should be used whenever possible. 96 | * Scoredb requires many open files; sometimes thousands of them. You will need to increase default filehandle limits on your system (see "ulimit" on linux). 97 | * Scoredb expects you to provide every field for every object; objects that are missing a field cannot be returned from queries that use the missing fields. 98 | * Scoredb data files are endian specific; most modern CPUs are little endian, so you won't normally have to worry about this. 
If you need deletes or updates, you'll have to periodically rebuild your database and swap in updated versions.
If you specify the -automigrate option to the server, it will look for new database directories that begin with the given data directory
and keep the (lexicographically largest) one live.
141 | * Example: `["scale", 2.0, ["field", "age"]]` (age, doubled) 142 | 143 | #### `["sum", , , ...]` 144 | Sums the results of each ``. 145 | * Example: `["sum", ["field", "age"], ["field", "height"]]` (add age and height together) 146 | 147 | #### `["product", , , ...]` 148 | Multiplies the result of each `` together. For bounding reasons, negative inputs are not allowed. 149 | * Example: `["product", ["field", "age"], ["field", "height"]]` (multiply age by height) 150 | 151 | #### `["min", , , ...]` 152 | Takes the least score resulting from all ``s. 153 | * Example: `["min", ["field", "age"], ["field", "height"]]` (Take age or height, whichever is smaller) 154 | 155 | ####`["diff", , ]` 156 | Returns the absolute difference between the values produced by both subexpressions. 157 | * Example: `["diff", ["field", "age"], ["field", "height"]]` (the difference between each age and height) 158 | 159 | #### `["pow", , ]` 160 | Raises the result from the given subexpression to the `` power. 161 | `` may be fractional (for Nth roots) or negative. 162 | However, for bounding reasons, the subexpression may not produce negative values. 163 | * Example: `["pow", ["field", "age"], 2.0]` (age, squared) 164 | 165 | #### `["custom_linear", [[, ], [, ], ..], ]` 166 | Establishes a user-defined function using a set of linearly interpolated [x, y] points. 167 | Inputs smaller than the smallest X value or larger than the largest X value get the closest specified Y value. 168 | * Example: `["custom_linear", [[0, 0.0], [30, 1.0], [80, 0.0]], ["field", "age"]]` Maping ages to scores: 30 year-olds get a score of one, gradually declining to a score of zero for infants and the elderly. 169 | 170 | #### `["geo_distance", , , , ]` 171 | Returns the distance to a fixed point in kilometers as a score. 172 | This is experimental: may be inaccurate for large distances, and fails across the prime meridian. 
173 | Since you typically want smaller distances to have higher scores, you'll probably want to wrap the "scale" or "custom_linear" functions around this one to invert it. 174 | * Example: `["geo_distance", 40.7, -74.0, "home_lat", "home_lng"]` Scores each result by how far its home_lat and home_lng fields put it from New York City. 175 | 176 | 177 | # Status 178 | 179 | Though it has reasonable test coverage and a small, straightforward codebase, scoredb is certainly alpha-quality software. 180 | 181 | Your bug reports are greatly appreciated. 182 | 183 | 184 | # Thanks 185 | 186 | Thanks are due to the [Samsung Accelerator](http://samsungaccelerator.com) which let us start this project as a hackathon proof of concept. Scoredb was built with this awesome team (in github lexicographic order!): 187 | 188 | * https://github.com/davidgljay 189 | * https://github.com/ploxiln 190 | * https://github.com/pschanely 191 | * https://github.com/rmarianski 192 | * https://github.com/sleepylemur 193 | 194 | 195 | # Plugs 196 | 197 | Check out of some of our other side projects too: 198 | 199 | * [wildflower-touch](https://github.com/pschanely/wildflower-touch) is proof-of-concept programming IDE and language for touch devices. 200 | * [music-tonight](http://musictonightapp.com) makes playlists of bands playing near you, tonight. 
201 | -------------------------------------------------------------------------------- /bench.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "bufio" 5 | "encoding/csv" 6 | "fmt" 7 | "io" 8 | "os" 9 | "strconv" 10 | "time" 11 | ) 12 | 13 | type LinearCombinationBackend interface { 14 | BulkIndex(records []Record) error 15 | LinearQuery(numResults int, coefs map[string]float32) []string 16 | } 17 | 18 | func (db BaseDb) LinearQuery(numResults int, weights map[string]float32) []string { 19 | scorer := make([]interface{}, len(weights)+1) 20 | scorer[0] = "sum" 21 | idx := 1 22 | for key, weight := range weights { 23 | scorer[idx] = []interface{}{"scale", weight, []interface{}{"field", key}} 24 | idx += 1 25 | } 26 | result, _ := db.Query(Query{ 27 | Limit: numResults, 28 | Scorer: scorer, 29 | }) 30 | return result.Ids 31 | } 32 | 33 | func RunBenchmark(db LinearCombinationBackend, csvFilename string, maxRecords int64) ([]int64, []int64, [][]int64, error) { 34 | fp, err := os.Open(csvFilename) 35 | if err != nil { 36 | return nil, nil, nil, err 37 | } 38 | defer fp.Close() 39 | 40 | bufReader := bufio.NewReader(fp) 41 | csvReader := csv.NewReader(bufReader) 42 | 43 | header, err := csvReader.Read() 44 | if err == io.EOF { 45 | return nil, nil, nil, fmt.Errorf("Missing csv header") 46 | } else if err != nil { 47 | return nil, nil, nil, fmt.Errorf("Error reading csv header") 48 | } 49 | 50 | // TODO ensure we have at least one value? 
51 | 52 | colMap := make(map[int]string, len(header)) 53 | for colIdx, colName := range header { 54 | colMap[colIdx] = colName 55 | } 56 | 57 | totalRecs := []int64{} 58 | indexTimes := []int64{} 59 | queryTimes := [][]int64{} 60 | nResults := 10 61 | weights := []map[string]float32{ 62 | map[string]float32{ 63 | "age": 100.0, 64 | "wages": 1.0, 65 | }, 66 | map[string]float32{ 67 | "age": 10000.0, 68 | "wages": 1.0, 69 | }, 70 | map[string]float32{ 71 | "sex": 40.0, 72 | "weekly_work_hours": 1.0, 73 | }, 74 | map[string]float32{ 75 | "fertility": 10.0, 76 | "age": 1.0, 77 | }, 78 | map[string]float32{ 79 | "fertility": 5.0, 80 | "age": 1.0, 81 | "weekly_work_hours": 1.0, 82 | }, 83 | map[string]float32{ 84 | "sex": 100.0, 85 | "fertility": 9.0, 86 | "age": 1.0, 87 | "weekly_work_hours": 1.0, 88 | }, 89 | } 90 | 91 | bucketSize := 1000 92 | recordGroup := make([]Record, bucketSize) 93 | totalCount := int64(0) 94 | curGroupSize := 0 95 | 96 | for { 97 | row, err := csvReader.Read() 98 | if err == io.EOF { 99 | break 100 | } else if err != nil { 101 | return nil, nil, nil, fmt.Errorf("Error reading csv contents") 102 | } 103 | record := make(map[string]float32, len(row)) 104 | for fieldIdx, fieldValue := range row { 105 | recordKey, ok := colMap[fieldIdx] 106 | if !ok { 107 | // if we don't have header mappings, skip 108 | break 109 | } 110 | val64, err := strconv.ParseFloat(fieldValue, 32) 111 | if err != nil { 112 | continue 113 | } 114 | val32 := float32(val64) 115 | record[recordKey] = val32 116 | } 117 | if len(record) > 0 { 118 | // indexing one at a time 119 | // id := db.Index(record) 120 | // recordIndexIds = append(recordIndexIds, id) 121 | 122 | totalCount++ 123 | recordGroup[curGroupSize] = Record{Id: fmt.Sprintf("%d", totalCount), Values: record} 124 | curGroupSize++ 125 | if curGroupSize == bucketSize { 126 | t0 := time.Now().UnixNano() 127 | db.BulkIndex(recordGroup) 128 | totalRecs = append(totalRecs, totalCount) 129 | indexTimes = append(indexTimes, 
time.Now().UnixNano()-t0) 130 | queryRoundTimes := make([]int64, len(weights)) 131 | 132 | for idx, query := range weights { 133 | //fmt.Printf("%08d Q start\n", time.Now().UnixNano() % 100000000) 134 | t0 := time.Now().UnixNano() 135 | results := db.LinearQuery(nResults, query) 136 | queryTime := time.Now().UnixNano() - t0 137 | fmt.Printf("%08d Q results: %v\n", time.Now().UnixNano()%100000000, results) 138 | queryRoundTimes[idx] = queryTime 139 | } 140 | curGroupSize = 0 141 | queryTimes = append(queryTimes, queryRoundTimes) 142 | bucketSize += bucketSize * 2 143 | if totalCount >= maxRecords { 144 | break 145 | } 146 | if bucketSize > 100000 { 147 | bucketSize = 100000 148 | } 149 | recordGroup = make([]Record, bucketSize) 150 | } 151 | } 152 | } 153 | if curGroupSize > 0 { 154 | finalRecords := make([]Record, curGroupSize) 155 | copy(finalRecords, recordGroup) 156 | db.BulkIndex(finalRecords) 157 | } 158 | 159 | return totalRecs, indexTimes, queryTimes, nil 160 | } 161 | -------------------------------------------------------------------------------- /bitreader.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "github.com/edsrzf/mmap-go" 7 | "io" 8 | "os" 9 | "unsafe" 10 | ) 11 | 12 | type BitWriter struct { 13 | BufferedWriter *bufio.Writer 14 | File *os.File 15 | Cur uint64 16 | CurBitsUsed uint 17 | } 18 | 19 | func FileIsAtEnd(file *os.File) bool { 20 | stat, _ := file.Stat() 21 | pos, _ := file.Seek(0, 1) 22 | return pos == stat.Size() 23 | } 24 | 25 | func WriteNativeLong(val uint64, writer io.Writer) error { 26 | byteSlice := (*((*[8]byte)(unsafe.Pointer(&val))))[:] 27 | _, err := writer.Write(byteSlice) 28 | return err 29 | } 30 | 31 | func ReadNativeLong(buf []byte) uint64 { 32 | return *((*uint64)(unsafe.Pointer(&buf[0]))) 33 | } 34 | 35 | func NewBitWriter(file *os.File) (*BitWriter, error) { 36 | writer := BitWriter{File: file} 37 | if 
!FileIsAtEnd(file) { 38 | buf := make([]byte, 16) 39 | 40 | file.Seek(-16, 2) // Goto EOF (whence=2 means "relative to end") 41 | nRead, err := file.Read(buf) 42 | if nRead != 16 { 43 | return nil, err 44 | } 45 | writer.CurBitsUsed = uint(ReadNativeLong(buf[8:])) 46 | writer.Cur = ReadNativeLong(buf) >> (64 - writer.CurBitsUsed) 47 | 48 | file.Seek(-16, 2) // Goto EOF (whence=2 means "relative to end") 49 | } 50 | writer.BufferedWriter = bufio.NewWriter(file) 51 | return &writer, nil 52 | } 53 | 54 | func (writer *BitWriter) Close() error { 55 | bitsUsed := writer.CurBitsUsed 56 | WriteNativeLong(writer.Cur<<(64-bitsUsed), writer.BufferedWriter) 57 | WriteNativeLong(uint64(bitsUsed), writer.BufferedWriter) 58 | err := writer.BufferedWriter.Flush() 59 | if err != nil { 60 | return err 61 | } 62 | return writer.File.Close() 63 | } 64 | 65 | func (writer *BitWriter) WriteBits(val uint64, numBits uint) error { // assumes val is all zeros above numBits 66 | cur, bitsUsed := writer.Cur, writer.CurBitsUsed 67 | overflow := int(bitsUsed+numBits) - 64 68 | if overflow >= 0 { // split the write 69 | initialBits := numBits - uint(overflow) 70 | cur = (cur << initialBits) | (val >> uint(overflow)) 71 | err := WriteNativeLong(cur, writer.BufferedWriter) 72 | if err != nil { 73 | return err 74 | } 75 | writer.Cur = val 76 | writer.CurBitsUsed = uint(overflow) 77 | } else { 78 | writer.Cur = (cur << numBits) | val 79 | writer.CurBitsUsed += numBits 80 | } 81 | return nil 82 | } 83 | 84 | func (writer *BitWriter) WriteVarUInt32(val uint32) error { 85 | var sizeFactor uint64 86 | if val&0xfffffff0 == 0 { 87 | sizeFactor = 0 88 | } else if val&0xffffff00 == 0 { 89 | sizeFactor = 1 90 | } else if val&0xffff0000 == 0 { 91 | sizeFactor = 2 92 | } else { 93 | sizeFactor = 3 94 | } 95 | writer.WriteBits(sizeFactor, 2) 96 | numBits := uint(4 << sizeFactor) 97 | writer.WriteBits(uint64(val), numBits) 98 | return nil 99 | } 100 | 101 | type BitReader struct { 102 | OrigMmap *mmap.MMap 103 
| Mmap []uint64 104 | MmapPtr uint 105 | MmapPtrBitsLeft uint 106 | File *os.File 107 | Cur uint64 108 | CurBitsLeft uint 109 | } 110 | 111 | func NewBitReader(file *os.File) (*BitReader, error) { 112 | mapSlice, err := mmap.Map(file, mmap.RDONLY, 0) 113 | if err != nil { 114 | panic(err) 115 | } 116 | curPos, err := file.Seek(0, 1) 117 | if curPos%8 != 0 { 118 | panic(fmt.Sprintf("BitReader started at byte %v; must be 8 byte aligned", curPos)) 119 | } 120 | return &BitReader{ 121 | File: file, 122 | OrigMmap: &mapSlice, 123 | Mmap: (*((*[10000000]uint64)(unsafe.Pointer(&mapSlice[0]))))[:], 124 | MmapPtr: uint(curPos / 8), 125 | MmapPtrBitsLeft: 64, 126 | }, nil 127 | } 128 | 129 | func (reader *BitReader) Close() error { 130 | reader.Mmap = []uint64{} 131 | err := reader.OrigMmap.Unmap() 132 | if err != nil { 133 | return err 134 | } 135 | return reader.File.Close() 136 | } 137 | 138 | func (reader *BitReader) Refill(cur uint64, bitsLeft uint, numNeeded uint) (uint64, uint, error) { 139 | wanted := 64 - bitsLeft 140 | if wanted >= reader.MmapPtrBitsLeft { 141 | bits := reader.Mmap[reader.MmapPtr] << (64 - reader.MmapPtrBitsLeft) 142 | cur = cur | (bits >> bitsLeft) 143 | bitsLeft += reader.MmapPtrBitsLeft 144 | wanted -= reader.MmapPtrBitsLeft 145 | reader.MmapPtrBitsLeft = 64 146 | reader.MmapPtr += 1 147 | if wanted == 0 { 148 | return cur, bitsLeft, nil 149 | } 150 | } 151 | bits := reader.Mmap[reader.MmapPtr] << (64 - reader.MmapPtrBitsLeft) 152 | cur = cur | (bits >> bitsLeft) 153 | reader.MmapPtrBitsLeft -= wanted 154 | bitsLeft = 64 155 | return cur, bitsLeft, nil 156 | } 157 | 158 | func (reader *BitReader) ReadBits(numBits uint) (uint64, error) { 159 | cur, bitsLeft := reader.Cur, reader.CurBitsLeft 160 | var err error 161 | if bitsLeft < numBits { 162 | cur, bitsLeft, err = reader.Refill(cur, bitsLeft, numBits) 163 | if err != nil { 164 | return 0, err 165 | } 166 | } 167 | val := cur >> (64 - numBits) 168 | cur = cur << numBits 169 | bitsLeft -= numBits 
170 | reader.Cur, reader.CurBitsLeft = cur, bitsLeft 171 | return val, nil 172 | } 173 | 174 | func (reader *BitReader) ReadVarUInt32() (uint32, error) { 175 | sizeFactor, err := reader.ReadBits(2) 176 | if err != nil { 177 | return 0, err 178 | } 179 | numNeeded := uint(4 << sizeFactor) 180 | val, err := reader.ReadBits(numNeeded) 181 | if err != nil { 182 | return 0, err 183 | } 184 | return uint32(val), nil 185 | } 186 | -------------------------------------------------------------------------------- /bitreader_test.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | ) 7 | 8 | func TestBitReader(t *testing.T) { 9 | filename := RmAllTestData()("bitreader") 10 | defer RmAllTestData() 11 | 12 | file, err := os.Create(filename) 13 | if err != nil { 14 | t.Fatalf("%v", err) 15 | } 16 | 17 | wtr, err := NewBitWriter(file) 18 | if err != nil { 19 | t.Fatalf("%v", err) 20 | } 21 | wtr.WriteVarUInt32(7) 22 | wtr.WriteBits(42, 21) 23 | wtr.WriteVarUInt32(0) 24 | wtr.WriteVarUInt32(1) 25 | wtr.WriteVarUInt32(2) 26 | wtr.WriteVarUInt32(123) 27 | wtr.WriteVarUInt32(12345) 28 | wtr.WriteVarUInt32(1234567) 29 | wtr.WriteVarUInt32(123456789) 30 | err = wtr.Close() 31 | if err != nil { 32 | t.Fatalf("%v", err) 33 | } 34 | 35 | // try adding mroe stuff at the end 36 | file, err = os.OpenFile(filename, os.O_RDWR, 0666) 37 | if err != nil { 38 | t.Fatalf("%v", err) 39 | } 40 | wtr, err = NewBitWriter(file) 41 | if err != nil { 42 | t.Fatalf("%v", err) 43 | } 44 | wtr.WriteVarUInt32(7654321) 45 | err = wtr.Close() 46 | if err != nil { 47 | t.Fatalf("%v", err) 48 | } 49 | 50 | fd, err := os.OpenFile(filename, os.O_RDWR, 0666) 51 | if err != nil { 52 | t.Fatalf("%v", err) 53 | } 54 | rdr, err := NewBitReader(fd) 55 | if err != nil { 56 | t.Fatalf("%v", err) 57 | } 58 | val, err := rdr.ReadVarUInt32() 59 | if err != nil || val != 7 { 60 | t.Fatalf("val:%v, err:%v", val, err) 61 | } 62 | 
fixedval, err := rdr.ReadBits(21) 63 | if err != nil || fixedval != 42 { 64 | t.Fatalf("val:%v, err:%v", fixedval, err) 65 | } 66 | val, err = rdr.ReadVarUInt32() 67 | if err != nil || val != 0 { 68 | t.Fatalf("val:%v, err:%v", val, err) 69 | } 70 | val, err = rdr.ReadVarUInt32() 71 | if err != nil || val != 1 { 72 | t.Fatalf("val:%v, err:%v", val, err) 73 | } 74 | val, err = rdr.ReadVarUInt32() 75 | if err != nil || val != 2 { 76 | t.Fatalf("val:%v, err:%v", val, err) 77 | } 78 | val, err = rdr.ReadVarUInt32() 79 | if err != nil || val != 123 { 80 | t.Fatalf("val:%v, err:%v", val, err) 81 | } 82 | val, err = rdr.ReadVarUInt32() 83 | if err != nil || val != 12345 { 84 | t.Fatalf("val:%v, err:%v", val, err) 85 | } 86 | val, err = rdr.ReadVarUInt32() 87 | if err != nil || val != 1234567 { 88 | t.Fatalf("val:%v, err:%v", val, err) 89 | } 90 | val, err = rdr.ReadVarUInt32() 91 | if err != nil || val != 123456789 { 92 | t.Fatalf("val:%v, err:%v", val, err) 93 | } 94 | val, err = rdr.ReadVarUInt32() 95 | if err != nil || val != 7654321 { 96 | t.Fatalf("val:%v, err:%v", val, err) 97 | } 98 | err = rdr.Close() 99 | if err != nil { 100 | t.Fatalf("%v", err) 101 | } 102 | 103 | } 104 | 105 | func TestBitReaderVolume(t *testing.T) { 106 | filename := RmAllTestData()("bitreader.volume") 107 | defer RmAllTestData() 108 | 109 | file, err := os.Create(filename) 110 | if err != nil { 111 | t.Fatalf("%v", err) 112 | } 113 | 114 | wtr, err := NewBitWriter(file) 115 | if err != nil { 116 | t.Fatalf("%v", err) 117 | } 118 | 119 | for i := 0; i < 200; i++ { 120 | wtr.WriteVarUInt32(uint32(i * i)) 121 | wtr.WriteBits(uint64(i), uint(i%23)+10) 122 | } 123 | err = wtr.Close() 124 | if err != nil { 125 | t.Fatalf("%v", err) 126 | } 127 | 128 | fd, err := os.OpenFile(filename, os.O_RDWR, 0666) 129 | if err != nil { 130 | t.Fatalf("%v", err) 131 | } 132 | rdr, err := NewBitReader(fd) 133 | if err != nil { 134 | t.Fatalf("%v", err) 135 | } 136 | for i := 0; i < 200; i++ { 137 | val, err := 
rdr.ReadVarUInt32() 138 | if err != nil || int(val) != i*i { 139 | t.Fatalf("val:%v, err:%v", val, err) 140 | } 141 | fixedval, err := rdr.ReadBits(uint(i%23) + 10) 142 | if err != nil || int(fixedval) != i { 143 | t.Fatalf("val:%v, err:%v", fixedval, err) 144 | } 145 | } 146 | err = rdr.Close() 147 | if err != nil { 148 | t.Fatalf("%v", err) 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /boltiddb.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "github.com/boltdb/bolt" 7 | ) 8 | 9 | func NewBoltIdDb(file string) (*BoltIdDb, error) { 10 | db, err := bolt.Open(file, 0600, nil) 11 | if err != nil { 12 | return nil, err 13 | } 14 | return &BoltIdDb{Db: db}, nil 15 | } 16 | 17 | type BoltIdDb struct { 18 | Db *bolt.DB 19 | } 20 | 21 | func encodeScoreId(id int64) []byte { 22 | var buf [9]byte 23 | slice := buf[:] 24 | sz := binary.PutVarint(slice, id) 25 | return slice[:sz] 26 | } 27 | 28 | var boltBucketName []byte = []byte("ScoreDbIds") 29 | 30 | func (db *BoltIdDb) Put(scoreIds []int64, clientIds []string) error { 31 | return db.Db.Update(func(tx *bolt.Tx) error { 32 | b, err := tx.CreateBucketIfNotExists([]byte(boltBucketName)) 33 | if err != nil { 34 | return err 35 | } 36 | for idx, scoreId := range scoreIds { 37 | err = b.Put(encodeScoreId(scoreId), []byte(clientIds[idx])) 38 | if err != nil { 39 | return err 40 | } 41 | } 42 | return nil 43 | }) 44 | } 45 | 46 | func (db *BoltIdDb) Get(scoreIds []int64) ([]string, error) { 47 | result := make([]string, len(scoreIds)) 48 | 49 | err := db.Db.View(func(tx *bolt.Tx) error { 50 | b := tx.Bucket([]byte(boltBucketName)) 51 | for idx, scoreId := range scoreIds { 52 | clientIdBytes := b.Get(encodeScoreId(scoreId)) 53 | if clientIdBytes == nil { 54 | return fmt.Errorf("Unable to find client id for internal id %d", scoreId) 55 | } 56 | result[idx] = 
// CustomPoint is one (X, Y) control point of a user-defined
// piecewise-linear function.
type CustomPoint struct {
	X, Y float32
}

// ComputeCustomFunc evaluates the piecewise-linear function described by
// points (sorted by ascending X) at x. Inputs left of the first point or
// right of the last clamp to that endpoint's Y value; anything in between
// is linearly interpolated along its segment.
func ComputeCustomFunc(x float32, points []CustomPoint) float32 {
	n := len(points)
	// Index of the first control point at or beyond x.
	idx := sort.Search(n, func(i int) bool { return points[i].X >= x })
	switch idx {
	case 0:
		return points[0].Y // clamp below the range
	case n:
		return points[n-1].Y // clamp above the range
	}
	left, right := points[idx-1], points[idx]
	frac := (x - left.X) / (right.X - left.X)
	return right.Y*frac + left.Y*(1.0-frac)
}
required 45 | outsideMin, outsideMax = outsideMax, outsideMin 46 | } 47 | // functions need not be monotonic, check for peaks inside the X range 48 | for _, point := range op.points { 49 | if point.X <= insideMin { 50 | continue 51 | } else if point.X >= insideMax { 52 | break 53 | } else { 54 | y := point.Y 55 | outsideMax = Max(outsideMax, y) 56 | outsideMin = Min(outsideMin, y) 57 | } 58 | } 59 | return outsideMin, outsideMax 60 | } 61 | func (op *CustomLinearDocItr) Close() { 62 | op.docItr.Close() 63 | } 64 | func (op *CustomLinearDocItr) Next(minId int64) bool { 65 | return op.docItr.Next(minId) 66 | } 67 | 68 | func CheckIntersection(yValue float32, p1, p2 CustomPoint, insideMin, insideMax *float32) { 69 | var xIntersect float32 70 | // intersect descending: y 3 at {3 3}-{6 1}: 0 71 | if p1.Y <= yValue && yValue <= p2.Y { // intersect while function is ascending 72 | earliness := (p2.Y - yValue) / (p2.Y - p1.Y) 73 | xIntersect = p1.X*earliness + p2.X*(1.0-earliness) 74 | } else if p1.Y >= yValue && yValue >= p2.Y { // intersect while function is descending 75 | lateness := (p1.Y - yValue) / (p1.Y - p2.Y) 76 | xIntersect = p2.X*lateness + p1.X*(1.0-lateness) 77 | } else { 78 | return 79 | } 80 | *insideMin = Min(xIntersect, *insideMin) 81 | *insideMax = Max(xIntersect, *insideMax) 82 | } 83 | 84 | func (op *CustomLinearDocItr) SetBounds(outsideMin, outsideMax float32) bool { 85 | insideMin, insideMax := PositiveInfinity, NegativeInfinity // start with impossible (inverted) range 86 | for idx := len(op.points) - 1; idx > 0; idx-- { 87 | p1 := op.points[idx-1] 88 | p2 := op.points[idx] 89 | CheckIntersection(outsideMin, p1, p2, &insideMin, &insideMax) 90 | CheckIntersection(outsideMax, p1, p2, &insideMin, &insideMax) 91 | if outsideMin <= p2.Y && p2.Y <= outsideMax { 92 | insideMin = Min(insideMin, p2.X) 93 | insideMax = Max(insideMax, p2.X) 94 | } 95 | } 96 | firstPoint := op.points[0] 97 | if outsideMin <= firstPoint.Y && firstPoint.Y <= outsideMax { 98 | 
insideMin = NegativeInfinity 99 | } 100 | lastPoint := op.points[len(op.points)-1] 101 | if outsideMin <= lastPoint.Y && lastPoint.Y <= outsideMax { 102 | insideMax = PositiveInfinity 103 | } 104 | return op.docItr.SetBounds(insideMin, insideMax) 105 | } 106 | -------------------------------------------------------------------------------- /customlineardocitr_test.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | ) 7 | 8 | func BoundsEqualish(actualMin, actualMax, expectedMin, expectedMax float32) bool { 9 | tolerance := 0.0000001 10 | if math.Abs(float64(actualMin-expectedMin)) > tolerance { 11 | return false 12 | } 13 | if math.Abs(float64(actualMax-expectedMax)) > tolerance { 14 | return false 15 | } 16 | return true 17 | } 18 | 19 | func TestComputeCustomFunc(t *testing.T) { 20 | v := ComputeCustomFunc(1.0, []CustomPoint{CustomPoint{0.0, 0.0}, CustomPoint{3.0, 3.0}}) 21 | if v != 1.0 { 22 | t.Fatalf("%v", v) 23 | } 24 | v = ComputeCustomFunc(-1, []CustomPoint{CustomPoint{0, 0}, CustomPoint{3, 3}}) 25 | if v != 0.0 { 26 | t.Fatalf("%v", v) 27 | } 28 | v = ComputeCustomFunc(3, []CustomPoint{CustomPoint{0, 0}, CustomPoint{3, 3}}) 29 | if v != 3.0 { 30 | t.Fatalf("%v", v) 31 | } 32 | } 33 | 34 | func TestCustomLinearDocItr(t *testing.T) { 35 | inside := NewMemoryScoreDocItr([]float32{-1, 0, 2, 8, 5, 9, 12}) 36 | outside := CustomLinearDocItr{ 37 | docItr: inside, 38 | points: []CustomPoint{ 39 | CustomPoint{0, 0}, // kind of a zig-zag function... 
40 | CustomPoint{3, 3}, 41 | CustomPoint{6, 1}, 42 | CustomPoint{9, 2}, 43 | }, 44 | } 45 | 46 | min, max := inside.GetBounds() 47 | if !BoundsEqualish(min, max, -1, 12) { 48 | t.Fatalf("%v:%v", min, max) 49 | } 50 | min, max = outside.GetBounds() 51 | if !BoundsEqualish(min, max, 0.0, 3.0) { 52 | t.Fatalf("%v:%v", min, max) 53 | } 54 | 55 | // should leave unchanged 56 | outside.SetBounds(0, 4) 57 | min, max = inside.GetBounds() 58 | if !BoundsEqualish(min, max, -1, 12) { 59 | t.Fatalf("%v:%v", min, max) 60 | } 61 | min, max = outside.GetBounds() 62 | if !BoundsEqualish(min, max, 0.0, 3.0) { 63 | t.Fatalf("%v:%v", min, max) 64 | } 65 | 66 | // nudge the start up some 67 | outside.SetBounds(0.5, 3) 68 | min, max = inside.GetBounds() 69 | if !BoundsEqualish(min, max, 0.5, 12) { 70 | t.Fatalf("%v:%v", min, max) 71 | } 72 | min, max = outside.GetBounds() 73 | if !BoundsEqualish(min, max, 0.5, 3.0) { 74 | t.Fatalf("%v:%v", min, max) 75 | } 76 | 77 | // chop off the end (leaves a hole in the middle of the function) 78 | outside.SetBounds(0.5, 1.5) 79 | min, max = inside.GetBounds() 80 | if !BoundsEqualish(min, max, 0.5, 7.5) { 81 | t.Fatalf("%v:%v", min, max) 82 | } 83 | min, max = outside.GetBounds() 84 | if !BoundsEqualish(min, max, 0.5, 3.0) { 85 | t.Fatalf("%v:%v", min, max) 86 | } 87 | 88 | // chop off most of the end 89 | outside.SetBounds(0.5, 0.9) 90 | min, max = inside.GetBounds() 91 | if !BoundsEqualish(min, max, 0.5, 0.9) { 92 | t.Fatalf("%v:%v", min, max) 93 | } 94 | min, max = outside.GetBounds() 95 | if !BoundsEqualish(min, max, 0.5, 0.9) { 96 | t.Fatalf("%v:%v", min, max) 97 | } 98 | 99 | } 100 | -------------------------------------------------------------------------------- /custommapdocitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import () 4 | 5 | // Remaps a value according to a user-specified mapping of values to scores 6 | type CustomMapDocItr struct { 7 | points 
map[float32]float32 8 | deflt float32 9 | docItr DocItr 10 | } 11 | 12 | func (op *CustomMapDocItr) ComputeCustomFunc(val float32) float32 { 13 | score, ok := op.points[val] 14 | if ok { 15 | return score 16 | } else { 17 | return op.deflt 18 | } 19 | } 20 | 21 | func (op *CustomMapDocItr) Name() string { return "CustomMapDocItr" } 22 | func (op *CustomMapDocItr) Cur() (int64, float32) { 23 | docId, score := op.docItr.Cur() 24 | return docId, op.ComputeCustomFunc(score) 25 | } 26 | func (op *CustomMapDocItr) GetBounds() (min, max float32) { 27 | insideMin, insideMax := op.docItr.GetBounds() 28 | outsideMin := op.deflt 29 | outsideMax := op.deflt 30 | for input, output := range op.points { 31 | if insideMin <= input && input <= insideMax { 32 | outsideMin = Min(outsideMin, output) 33 | outsideMax = Max(outsideMax, output) 34 | } 35 | } 36 | return outsideMin, outsideMax 37 | } 38 | func (op *CustomMapDocItr) Close() { 39 | op.docItr.Close() 40 | } 41 | func (op *CustomMapDocItr) Next(minId int64) bool { 42 | return op.docItr.Next(minId) 43 | } 44 | 45 | func (op *CustomMapDocItr) SetBounds(outsideMin, outsideMax float32) bool { 46 | if outsideMin <= op.deflt && op.deflt <= outsideMax { 47 | return true 48 | } 49 | 50 | insideMin, insideMax := PositiveInfinity, NegativeInfinity // start with impossible (inverted) range 51 | for input, output := range op.points { 52 | if outsideMin <= output && output <= outsideMax { 53 | insideMin = Min(insideMin, input) 54 | insideMax = Max(insideMax, input) 55 | } 56 | } 57 | return op.docItr.SetBounds(insideMin, insideMax) 58 | } 59 | -------------------------------------------------------------------------------- /custommapdocitr_test.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestCustomMapDocItr(t *testing.T) { 8 | inside := NewMemoryScoreDocItr([]float32{-1, 0, 2, 8, 5, 9, 12}) 9 | outside := CustomMapDocItr{ 10 | 
docItr: inside, 11 | deflt: 0.0, 12 | points: map[float32]float32{ // kind of a zig-zag function... 13 | -2: -2, 14 | 2: 2, 15 | 5: 3, 16 | 6: 1, 17 | }, 18 | } 19 | 20 | min, max := inside.GetBounds() 21 | if !BoundsEqualish(min, max, -1, 12) { 22 | t.Fatalf("%v:%v", min, max) 23 | } 24 | min, max = outside.GetBounds() 25 | if !BoundsEqualish(min, max, 0.0, 3.0) { 26 | t.Fatalf("%v:%v", min, max) 27 | } 28 | 29 | // should leave unchanged 30 | outside.SetBounds(-2, 4) 31 | min, max = inside.GetBounds() 32 | if !BoundsEqualish(min, max, -1, 12) { 33 | t.Fatalf("%v:%v", min, max) 34 | } 35 | min, max = outside.GetBounds() 36 | if !BoundsEqualish(min, max, 0.0, 3.0) { 37 | t.Fatalf("%v:%v", min, max) 38 | } 39 | 40 | // nudge the start up some 41 | outside.SetBounds(0.25, 3) 42 | min, max = inside.GetBounds() 43 | if !BoundsEqualish(min, max, 2, 6) { 44 | t.Fatalf("%v:%v", min, max) 45 | } 46 | min, max = outside.GetBounds() 47 | if !BoundsEqualish(min, max, 0.0, 3.0) { 48 | t.Fatalf("%v:%v", min, max) 49 | } 50 | 51 | outside.SetBounds(0.5, 1.5) 52 | min, max = inside.GetBounds() 53 | if !BoundsEqualish(min, max, 6, 6) { 54 | t.Fatalf("%v:%v", min, max) 55 | } 56 | min, max = outside.GetBounds() 57 | if !BoundsEqualish(min, max, 0, 1.0) { 58 | t.Fatalf("%v:%v", min, max) 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /dataset_tools/census_p_rec_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #for ZIPFILE in /mnt/census1990/census_1990/1990_PUMS_A/*.zip ; do 4 | # unzip -c $ZIPFILE 5 | #done | grep '^P' >census1990_people.dat 6 | 7 | INPUT=census1990_people.dat 8 | OUTPUT=census1990_people.csv 9 | 10 | # for this dataset, gawk output is different than mawk or nawk, for 7 records (out of millions) 11 | AWK=${AWK:-awk} 12 | 13 | COLUMNS=" 14 | age 15 | children 16 | depart_for_work 17 | traveltime_to_work 18 | weekly_work_hours 19 | 
last_week_work_hours 20 | carpool_riders 21 | income 22 | wages 23 | poverty_percentage 24 | sex 25 | military_service_years 26 | " 27 | 28 | (printf 'id' 29 | for COL in $COLUMNS ; do 30 | printf ',%s' "$COL" 31 | done 32 | printf '\n' ) >$OUTPUT 33 | 34 | $AWK '{ 35 | children = substr($0,89,2); 36 | children = (children == "00") ? 0 : int(children) - 1; 37 | 38 | printf("r%d,", NR); 39 | printf("%s,", substr($0, 15,2)); # age 40 | printf("%s,", children ); # children 41 | printf("%s,", substr($0,105,4)); # depart_for_work 42 | printf("%s,", substr($0,109,2)); # traveltime_to_work 43 | printf("%s,", substr($0,125,2)); # weekly_work_hours 44 | printf("%s,", substr($0, 93,2)); # last_week_work_hours 45 | printf("%s,", substr($0,104,1)); # carpool_riders 46 | printf("%s,", substr($0,133,6)); # income 47 | printf("%s,", substr($0,139,6)); # wages 48 | printf("%s,", substr($0, 41,3)); # poverty_percentage 49 | printf("%s,", substr($0, 11,1)); # sex 50 | printf("%s\n", substr($0, 83,2)); # military_service_years 51 | } ' <$INPUT >>$OUTPUT 52 | -------------------------------------------------------------------------------- /dataset_tools/sample.csv: -------------------------------------------------------------------------------- 1 | id,first,second,third 2 | r1,1,2,3 3 | r2,0.1,11.234,01.23 4 | r3,000,03,001 5 | -------------------------------------------------------------------------------- /db.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "container/heap" 5 | "errors" 6 | "fmt" 7 | "math" 8 | ) 9 | 10 | type Query struct { 11 | Offset int 12 | Limit int 13 | MinScore float32 14 | 15 | // mixed, nested arrays of strings and numbers describing a function; for example: ["sum", ["field", "age"], ["field", "height"]] 16 | Scorer []interface{} 17 | } 18 | 19 | type DocScore struct { 20 | DocId int64 21 | Score float32 22 | } 23 | 24 | type Record struct { 25 | Id string 26 | Values 
map[string]float32 27 | } 28 | 29 | type QueryResult struct { 30 | Ids []string 31 | Scores []float32 32 | } 33 | 34 | // Three layers of database interfaces, each one wrapping the next: 35 | 36 | type Db interface { // Outermost interface; clients use this 37 | BulkIndex(records []Record) error 38 | Index(id string, values map[string]float32) error 39 | Query(query Query) (QueryResult, error) 40 | } 41 | 42 | type StreamingDb interface { // Uses a DocItr based query, useful for middleware that alters or combines result streams 43 | BulkIndex(records []map[string]float32) ([]int64, error) 44 | QueryItr(Scorer []interface{}) (DocItr, error) 45 | } 46 | 47 | type DbBackend interface { // the minimal interface to implement storage (filesystem, memory, etc) 48 | BulkIndex(records []map[string]float32) ([]int64, error) 49 | FieldDocItr(field string) DocItr 50 | } 51 | 52 | type IdBackend interface { // stores a mapping from scoredb's identifiers to the clients' 53 | Put(scoreIds []int64, clientIds []string) error 54 | Get(scoreIds []int64) ([]string, error) 55 | } 56 | 57 | type BaseDb struct { 58 | StreamingDb StreamingDb 59 | IdDb IdBackend 60 | } 61 | 62 | func (db BaseDb) BulkIndex(records []Record) error { 63 | clientIds := make([]string, len(records)) 64 | values := make([]map[string]float32, len(records)) 65 | for idx, rec := range records { 66 | values[idx] = rec.Values 67 | clientIds[idx] = rec.Id 68 | } 69 | scoreIds, err := db.StreamingDb.BulkIndex(values) 70 | if err != nil { 71 | return err 72 | } 73 | return db.IdDb.Put(scoreIds, clientIds) 74 | } 75 | 76 | func (db BaseDb) Index(id string, values map[string]float32) error { 77 | return db.BulkIndex([]Record{Record{Id: id, Values: values}}) 78 | } 79 | 80 | func CandidateIsLess(r1, r2 DocScore) bool { 81 | s1, s2 := r1.Score, r2.Score 82 | if s1 < s2 { 83 | return true 84 | } else if s1 > s2 { 85 | return false 86 | } else { 87 | return r1.DocId < r2.DocId 88 | } 89 | } 90 | 91 | type BaseDbResultSet 
[]DocScore 92 | 93 | func (h BaseDbResultSet) Len() int { return len(h) } 94 | func (h BaseDbResultSet) Less(i, j int) bool { return CandidateIsLess(h[i], h[j]) } 95 | func (h BaseDbResultSet) Swap(i, j int) { h[i], h[j] = h[j], h[i] } 96 | func (h *BaseDbResultSet) Push(x interface{}) { 97 | *h = append(*h, x.(DocScore)) 98 | } 99 | func (h *BaseDbResultSet) Pop() interface{} { 100 | old := *h 101 | n := len(old) 102 | x := old[n-1] 103 | *h = old[0 : n-1] 104 | return x 105 | } 106 | 107 | func (db BaseDb) Query(query Query) (QueryResult, error) { 108 | itr, err := db.StreamingDb.QueryItr(query.Scorer) 109 | if err != nil { 110 | return QueryResult{}, err 111 | } 112 | minScore, offset, limit := query.MinScore, query.Offset, query.Limit 113 | if limit == 0 { // we short circuit this case because the code below assumes at least one result 114 | return QueryResult{Ids: []string{}}, nil 115 | } 116 | //fmt.Printf("> %+v\n", query); 117 | numResults := offset + limit 118 | resultData := make(BaseDbResultSet, 0, numResults+1) 119 | results := &resultData 120 | heap.Init(results) 121 | minCandidate := DocScore{Score: float32(math.Inf(-1))} 122 | maxScore := float32(math.Inf(1)) 123 | docId := int64(-1) 124 | var score float32 125 | for itr.Next(docId + 1) { 126 | docId, score = itr.Cur() 127 | if score < minScore { 128 | continue 129 | } 130 | candidate := DocScore{DocId: docId, Score: score} 131 | if CandidateIsLess(minCandidate, candidate) { 132 | heap.Push(results, candidate) 133 | if results.Len() > numResults { 134 | heap.Pop(results) 135 | minCandidate = resultData[0] 136 | itr.SetBounds(minCandidate.Score, maxScore) 137 | } 138 | } 139 | } 140 | itr.Close() 141 | 142 | for offset > 0 && len(resultData) > 0 { 143 | heap.Pop(results) 144 | offset -= 1 145 | } 146 | 147 | numResults = results.Len() 148 | var resultIds = make([]int64, numResults) 149 | var resultScores = make([]float32, numResults) 150 | for idx, _ := range resultIds { 151 | rec := 
heap.Pop(results).(DocScore) 152 | i := numResults - (idx + 1) 153 | resultIds[i] = rec.DocId 154 | resultScores[i] = rec.Score 155 | } 156 | //fmt.Printf("< %+v\n", resultIds); 157 | //fmt.Printf("< %+v\n", resultScores); 158 | 159 | clientIds, err := db.IdDb.Get(resultIds) 160 | if err != nil { 161 | return QueryResult{}, err 162 | } 163 | return QueryResult{Ids: clientIds, Scores: resultScores}, nil 164 | } 165 | 166 | func ToFloat32(val interface{}) (float32, error) { 167 | switch typed := val.(type) { 168 | case float32: 169 | return typed, nil 170 | case float64: 171 | return float32(typed), nil 172 | default: 173 | return 0.0, errors.New(fmt.Sprintf("Invalid value ('%s') given, must be floating point number", val)) 174 | } 175 | } 176 | 177 | func ToXyPoints(input interface{}) ([]CustomPoint, error) { 178 | switch inputPoints := input.(type) { 179 | case []interface{}: 180 | points := make([]CustomPoint, len(inputPoints)) 181 | for idx, inputPoint := range inputPoints { 182 | pair := inputPoint.([]interface{}) 183 | if len(pair) != 2 { 184 | return nil, fmt.Errorf("Invalid (x,y) point; found: '%v' instead", pair) 185 | } 186 | xPoint, err := ToFloat32(pair[0]) 187 | if err != nil { 188 | return nil, err 189 | } 190 | yPoint, err := ToFloat32(pair[1]) 191 | if err != nil { 192 | return nil, err 193 | } 194 | points[idx] = CustomPoint{xPoint, yPoint} 195 | } 196 | return points, nil 197 | default: 198 | return nil, fmt.Errorf("Expected array of (x,y) points; found: '%v' instead", input) 199 | } 200 | } 201 | 202 | // BaseStreamingDb : The usual way to bridge a StreamingDb to a DbBackend 203 | 204 | type BaseStreamingDb struct { 205 | Backend DbBackend 206 | } 207 | 208 | func (db BaseStreamingDb) BulkIndex(records []map[string]float32) ([]int64, error) { 209 | return db.Backend.BulkIndex(records) 210 | } 211 | 212 | func (db BaseStreamingDb) QueryItr(scorer []interface{}) (DocItr, error) { 213 | args := scorer[1:] 214 | switch scorer[0].(string) { 215 | case 
"sum": 216 | fieldItrs := make([]DocItr, len(args)) 217 | for idx, v := range args { 218 | itr, err := db.QueryItr(v.([]interface{})) 219 | if err != nil { 220 | return nil, err 221 | } 222 | fieldItrs[idx] = itr 223 | } 224 | return NewSumDocItr(fieldItrs), nil 225 | case "product": 226 | fieldItrs := make([]DocItr, len(args)) 227 | for idx, v := range args { 228 | itr, err := db.QueryItr(v.([]interface{})) 229 | if err != nil { 230 | return nil, err 231 | } 232 | fieldItrs[idx] = itr 233 | } 234 | return NewProductDocItr(fieldItrs), nil 235 | case "min": 236 | fieldItrs := make([]DocItr, len(args)) 237 | for idx, v := range args { 238 | itr, err := db.QueryItr(v.([]interface{})) 239 | if err != nil { 240 | return nil, err 241 | } 242 | fieldItrs[idx] = itr 243 | } 244 | return NewMinDocItr(fieldItrs), nil 245 | case "scale": 246 | if len(args) != 2 { 247 | return nil, errors.New("Wrong number of arguments to scale function") 248 | } 249 | itr, err := db.QueryItr(args[1].([]interface{})) 250 | if err != nil { 251 | return nil, err 252 | } 253 | weight := args[0] 254 | switch typed := weight.(type) { 255 | case float32: 256 | return &ScaleDocItr{typed, itr}, nil 257 | case float64: 258 | return &ScaleDocItr{float32(typed), itr}, nil 259 | default: 260 | return nil, errors.New(fmt.Sprintf("Invalid weight ('%s') given to scale function, must be floating point number", weight)) 261 | } 262 | case "diff": 263 | if len(args) != 2 { 264 | return nil, errors.New("Wrong number of arguments to diff function") 265 | } 266 | target, err := ToFloat32(args[0]) 267 | if err != nil { 268 | return nil, err 269 | } 270 | itr, err := db.QueryItr(args[1].([]interface{})) 271 | if err != nil { 272 | return nil, err 273 | } 274 | return &DiffDocItr{ 275 | target: target, 276 | itr: itr, 277 | }, nil 278 | case "pow": 279 | if len(args) != 2 { 280 | return nil, errors.New("Wrong number of arguments to pow function") 281 | } 282 | exp, err := ToFloat32(args[1]) 283 | if err != nil { 284 
| return nil, err 285 | } 286 | itr, err := db.QueryItr(args[0].([]interface{})) 287 | if err != nil { 288 | return nil, err 289 | } 290 | return &PowDocItr{ 291 | itr: itr, 292 | exp: exp, 293 | }, nil 294 | 295 | case "custom_map": 296 | if len(args) != 3 { 297 | return nil, errors.New("Wrong number of arguments to custom_map function") 298 | } 299 | 300 | points, err := ToXyPoints(args[0]) 301 | if err != nil { 302 | return nil, err 303 | } 304 | 305 | deflt, err := ToFloat32(args[1]) 306 | if err != nil { 307 | return nil, err 308 | } 309 | 310 | itr, err := db.QueryItr(args[2].([]interface{})) 311 | if err != nil { 312 | return nil, err 313 | } 314 | 315 | scoremap := make(map[float32]float32) 316 | for _, pt := range points { 317 | scoremap[pt.X] = pt.Y 318 | } 319 | return &CustomMapDocItr{ 320 | points: scoremap, 321 | deflt: deflt, 322 | docItr: itr, 323 | }, nil 324 | 325 | case "custom_linear": 326 | if len(args) != 2 { 327 | return nil, errors.New("Wrong number of arguments to custom_linear function") 328 | } 329 | 330 | inputPoints := args[0].([]interface{}) 331 | points := make([]CustomPoint, len(inputPoints)) 332 | for idx, inputPoint := range inputPoints { 333 | pair := inputPoint.([]interface{}) 334 | if len(pair) != 2 { 335 | return nil, fmt.Errorf("Invalid (x,y) point in custom_linear; found: '%v' instead", pair) 336 | } 337 | xPoint, err := ToFloat32(pair[0]) 338 | if err != nil { 339 | return nil, err 340 | } 341 | yPoint, err := ToFloat32(pair[1]) 342 | if err != nil { 343 | return nil, err 344 | } 345 | points[idx] = CustomPoint{xPoint, yPoint} 346 | } 347 | 348 | itr, err := db.QueryItr(args[1].([]interface{})) 349 | if err != nil { 350 | return nil, err 351 | } 352 | 353 | return &CustomLinearDocItr{ 354 | points: points, 355 | docItr: itr, 356 | }, nil 357 | 358 | case "geo_distance": 359 | if len(args) != 4 { 360 | return nil, errors.New("Wrong number of arguments to geo_distance function") 361 | } 362 | lat, err := ToFloat32(args[0]) 363 
| if err != nil { 364 | return nil, err 365 | } 366 | lng, err := ToFloat32(args[1]) 367 | if err != nil { 368 | return nil, err 369 | } 370 | latFieldName := args[2].(string) 371 | lngFieldName := args[3].(string) 372 | latItr := &DiffDocItr{target: lat, itr: db.Backend.FieldDocItr(latFieldName)} 373 | lngItr := &DiffDocItr{target: lng, itr: db.Backend.FieldDocItr(lngFieldName)} 374 | // bias longitude distances by approximate latitude (matters less at poles) 375 | multiplier := float32(math.Cos(float64(lat) * math.Pi / 180.0)) 376 | biasedLngItr := &ScaleDocItr{multiplier, lngItr} 377 | // square each component 378 | latSquaredItr := NewPowDocItr(latItr, 2.0) 379 | lngSquaredItr := NewPowDocItr(biasedLngItr, 2.0) 380 | // sum and square root 381 | distanceItr := NewPowDocItr(NewSumDocItr([]DocItr{latSquaredItr, lngSquaredItr}), 0.5) 382 | // convert degrees distance to radians and multiply by radius of the earth (in km) 383 | earthRadius := float32(6371.0 * math.Pi / 180.0) 384 | return &ScaleDocItr{earthRadius, distanceItr}, nil 385 | case "field": 386 | if len(args) != 1 { 387 | return nil, errors.New("Wrong number of arguments to field function") 388 | } 389 | key := args[0].(string) 390 | return db.Backend.FieldDocItr(key), nil 391 | default: 392 | return nil, errors.New(fmt.Sprintf("Scoring function '%s' is not recognized", scorer[0])) 393 | } 394 | } 395 | -------------------------------------------------------------------------------- /db_test.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "os" 7 | "path" 8 | "strings" 9 | "testing" 10 | ) 11 | 12 | func CallAndCheck(db Db, t *testing.T, r1 []string, limit int, scorer []interface{}) { 13 | r2, err := db.Query(Query{Limit: limit, Scorer: scorer, MinScore: float32(math.Inf(-1))}) 14 | if err != nil { 15 | t.Fatal(err) 16 | } 17 | if len(r1) != len(r2.Ids) { 18 | t.Fatalf("expected: %v found: %v", r1, r2) 19 | 
} 20 | for idx, v1 := range r1 { 21 | if v1 != r2.Ids[idx] { 22 | t.Fatalf("expected: %v found: %v", r1, r2) 23 | } 24 | } 25 | } 26 | 27 | func DbBasicsTest(db Db, t *testing.T) { 28 | err := db.Index("r1", map[string]float32{"age": 32, "height": 2.0, "lat": 45.0, "lon": -70.0}) 29 | if err != nil { 30 | t.Error(fmt.Sprintf("%v", err)) 31 | } 32 | err = db.Index("r2", map[string]float32{"age": 25, "height": 1.5, "lat": 43.0, "lon": -69.0}) 33 | if err != nil { 34 | t.Error(fmt.Sprintf("%v", err)) 35 | } 36 | err = db.Index("r3", map[string]float32{"age": 16, "height": 2.5, "lat": 45.0, "lon": -95.0}) 37 | if err != nil { 38 | t.Error(fmt.Sprintf("%v", err)) 39 | } 40 | CallAndCheck(db, t, []string{"r3", "r1"}, 2, []interface{}{"field", "height"}) 41 | CallAndCheck(db, t, []string{"r1", "r2"}, 2, []interface{}{"sum", 42 | []interface{}{"field", "age"}, 43 | []interface{}{"field", "height"}}) 44 | CallAndCheck(db, t, []string{"r1"}, 1, []interface{}{"sum", 45 | []interface{}{"field", "age"}, 46 | []interface{}{"field", "height"}}) 47 | CallAndCheck(db, t, []string{"r3", "r1"}, 2, []interface{}{"sum", 48 | []interface{}{"scale", 0.1, []interface{}{"field", "age"}}, 49 | []interface{}{"scale", 10.0, []interface{}{"field", "height"}}}) 50 | CallAndCheck(db, t, []string{"r3", "r2"}, 2, []interface{}{"sum", 51 | []interface{}{"scale", -1.0, []interface{}{"field", "age"}}, 52 | []interface{}{"scale", -1.0, []interface{}{"field", "height"}}}) 53 | CallAndCheck(db, t, []string{"r2", "r1", "r3"}, 3, []interface{}{"sum", 54 | []interface{}{"scale", 1.0, []interface{}{"field", "age"}}, 55 | []interface{}{"scale", -100.0, []interface{}{"field", "height"}}}) 56 | CallAndCheck(db, t, []string{}, 0, []interface{}{"sum", 57 | []interface{}{"field", "age"}, 58 | []interface{}{"field", "height"}}) 59 | CallAndCheck(db, t, []string{"r1", "r2", "r3"}, 3, []interface{}{"sum", 60 | []interface{}{"field", "age"}, 61 | []interface{}{"pow", []interface{}{"field", "height"}, 2.0}}) 62 | 
CallAndCheck(db, t, []string{"r3", "r1", "r2"}, 3, []interface{}{"sum", 63 | []interface{}{"field", "age"}, 64 | []interface{}{"pow", []interface{}{"field", "height"}, 10.0}}) 65 | CallAndCheck(db, t, []string{"r1", "r3", "r2"}, 3, []interface{}{"product", 66 | []interface{}{"field", "age"}, 67 | []interface{}{"field", "height"}}) 68 | CallAndCheck(db, t, []string{"r3", "r1", "r2"}, 3, []interface{}{"min", 69 | []interface{}{"field", "age"}, 70 | []interface{}{"field", "height"}}) 71 | CallAndCheck(db, t, []string{"r1", "r2", "r3"}, 3, []interface{}{"custom_linear", 72 | []interface{}{ // scores by closeness to age 30: 73 | []interface{}{float32(0), float32(0.0)}, 74 | []interface{}{float32(30), float32(1.0)}, 75 | []interface{}{float32(100), float32(0.0)}}, 76 | []interface{}{"field", "age"}}) 77 | CallAndCheck(db, t, []string{"r3", "r2", "r1"}, 3, []interface{}{"geo_distance", 45.0, -69.9, "lat", "lon"}) 78 | CallAndCheck(db, t, []string{"r3", "r1", "r2"}, 3, []interface{}{"geo_distance", 20.0, 70.0, "lat", "lon"}) 79 | } 80 | 81 | func RmAllTestData() func(name string) string { 82 | tmpDir := os.TempDir() 83 | dirfd, err := os.Open(tmpDir) 84 | if err == nil { 85 | names, err := dirfd.Readdirnames(0) 86 | if err == nil { 87 | for _, name := range names { 88 | if strings.HasPrefix(name, "scoredbtest.") { 89 | os.RemoveAll(path.Join(tmpDir, name)) 90 | } 91 | } 92 | } 93 | } 94 | return func(name string) string { 95 | fullname := path.Join(tmpDir, "scoredbtest."+name) 96 | return fullname 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /diffdocitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import () 4 | 5 | // (Absolute) difference between a value and a constant 6 | type DiffDocItr struct { 7 | target float32 8 | itr DocItr 9 | } 10 | 11 | func Abs(val float32) float32 { 12 | if val < 0 { 13 | return -val 14 | } else { 15 | return val 16 | } 17 
| } 18 | 19 | func Max(v1, v2 float32) float32 { 20 | if v1 < v2 { 21 | return v2 22 | } else { 23 | return v1 24 | } 25 | } 26 | 27 | func Min(v1, v2 float32) float32 { 28 | if v1 > v2 { 29 | return v2 30 | } else { 31 | return v1 32 | } 33 | } 34 | 35 | func (op *DiffDocItr) Name() string { return "DiffDocItr" } 36 | func (op *DiffDocItr) Cur() (int64, float32) { 37 | docId, score := op.itr.Cur() 38 | return docId, Abs(score - op.target) 39 | } 40 | func (op *DiffDocItr) GetBounds() (min, max float32) { 41 | target := op.target 42 | min, max = op.itr.GetBounds() 43 | d1 := Abs(min - target) 44 | d2 := Abs(max - target) 45 | maxDist := Max(d1, d2) 46 | if min <= target && target <= max { 47 | return 0.0, maxDist 48 | } else { 49 | return Min(d1, d2), maxDist 50 | } 51 | } 52 | func (op *DiffDocItr) Close() { 53 | op.itr.Close() 54 | } 55 | func (op *DiffDocItr) Next(minId int64) bool { 56 | return op.itr.Next(minId) 57 | } 58 | 59 | func (op *DiffDocItr) SetBounds(min, max float32) bool { 60 | // min is not useful to us right now 61 | target := op.target 62 | return op.itr.SetBounds(target-max, target+max) 63 | } 64 | -------------------------------------------------------------------------------- /docitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "math" 5 | ) 6 | 7 | var PositiveInfinity = float32(math.Inf(1)) 8 | var NegativeInfinity = float32(math.Inf(-1)) 9 | 10 | type DocItr interface { 11 | // An iterator over (document id, score) values. 12 | 13 | Name() string 14 | 15 | // return false if the iterator is now known to not produce any more values 16 | SetBounds(min, max float32) bool 17 | 18 | GetBounds() (min, max float32) 19 | 20 | // Next() skips the iterator ahead to at least as far as the given id. 21 | // It always advances the iterator at least one position. 22 | // It Returns false if there are no remaining values. 
23 | // Iterators need a call to Next(0) to intialize them to a real value; they all initially have a docId of -1 24 | Next(minId int64) bool 25 | 26 | Close() // release resources held by this iterator (if any) 27 | 28 | Cur() (int64, float32) // doc id and score of current result, or (-1, 0.0) if the iterator has not been initialized 29 | 30 | } 31 | -------------------------------------------------------------------------------- /elastic.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "fmt" 7 | "io/ioutil" 8 | "log" 9 | "net/http" 10 | "strconv" 11 | "strings" 12 | ) 13 | 14 | type EsScoreDb struct { 15 | BaseURL, Index string 16 | } 17 | 18 | func (db *EsScoreDb) BulkIndex(records []Record) error { 19 | var jsonbuf bytes.Buffer 20 | for _, rec := range records { 21 | jsonbuf.WriteString(fmt.Sprintf("{\"index\":{\"_id\":\"%s\"}}\n", rec.Id)) 22 | buf, err := json.Marshal(rec.Values) 23 | if err != nil { 24 | return err 25 | } 26 | jsonbuf.Write(buf) 27 | jsonbuf.WriteString("\n") 28 | } 29 | payload := jsonbuf.String() 30 | url := db.BaseURL + db.Index + "/external/_bulk" 31 | //fmt.Printf("Bulk: %v @ %v\n", payload, url) 32 | resp, err := http.Post(url, "application/json", strings.NewReader(payload)) 33 | if err != nil { 34 | panic(err) 35 | } 36 | body, _ := ioutil.ReadAll(resp.Body) 37 | resp.Body.Close() 38 | //fmt.Printf("Bulk resp: %+v\n", string(body)) 39 | var parsedResponse struct{ Errors bool } 40 | err = json.Unmarshal(body, &parsedResponse) 41 | if err != nil { 42 | panic(err) 43 | } 44 | if parsedResponse.Errors { 45 | panic(string(body)) 46 | } 47 | 48 | db.RefreshIndex() 49 | 50 | return nil 51 | } 52 | 53 | type EsQueryResponse struct { 54 | Hits struct { 55 | Hits []struct { 56 | Id string `json:"_id"` 57 | } `json:"hits"` 58 | } `json:"hits"` 59 | } 60 | 61 | func (db *EsScoreDb) LinearQuery(numResults int, weights map[string]float32) 
[]string { 62 | var scorefactors bytes.Buffer 63 | first := true 64 | for key, val := range weights { 65 | if !first { 66 | scorefactors.WriteString(",") 67 | } else { 68 | first = false 69 | } 70 | scorefactors.WriteString(fmt.Sprintf(`{"field_value_factor":{"field":"%s","factor":%f}}`, key, val)) 71 | } 72 | data := fmt.Sprintf(`{ 73 | "size":%d, 74 | "fields":[], 75 | "query":{ 76 | "function_score":{ 77 | "functions":[%s], 78 | "score_mode": "sum" 79 | } 80 | } 81 | }`, numResults, scorefactors.String()) 82 | resp, err := http.Post(db.BaseURL+db.Index+"/external/_search?pretty", "application/json", strings.NewReader(data)) 83 | if err != nil { 84 | panic(err) 85 | } 86 | body, _ := ioutil.ReadAll(resp.Body) 87 | resp.Body.Close() 88 | //fmt.Println(string(body)) 89 | queryResp := EsQueryResponse{} 90 | err = json.Unmarshal(body, &queryResp) 91 | if err != nil { 92 | panic(err) 93 | } 94 | hits := queryResp.Hits.Hits 95 | resultIds := make([]string, len(hits)) 96 | for idx, rec := range hits { 97 | resultIds[idx] = rec.Id 98 | } 99 | return resultIds 100 | } 101 | 102 | func (db *EsScoreDb) DeleteIndex() { 103 | req, _ := http.NewRequest("DELETE", db.BaseURL+db.Index, nil) 104 | resp, _ := http.DefaultClient.Do(req) 105 | body, _ := ioutil.ReadAll(resp.Body) 106 | resp.Body.Close() 107 | fmt.Println("Delete Index: " + string(body)) 108 | } 109 | 110 | func (db *EsScoreDb) CreateIndex() { 111 | payload := "{\"settings\": {\"index\": {\"number_of_shards\" : 1}}}" 112 | req, _ := http.NewRequest("PUT", db.BaseURL+db.Index, strings.NewReader(payload)) 113 | resp, _ := http.DefaultClient.Do(req) 114 | body, _ := ioutil.ReadAll(resp.Body) 115 | resp.Body.Close() 116 | fmt.Println("Create Index: " + string(body)) 117 | } 118 | 119 | func (db *EsScoreDb) RefreshIndex() { 120 | req, _ := http.NewRequest("POST", db.BaseURL+db.Index+"/_refresh", nil) 121 | resp, _ := http.DefaultClient.Do(req) 122 | ioutil.ReadAll(resp.Body) 123 | resp.Body.Close() 124 | 
//fmt.Println("Refresh Index: " + string(body)) 125 | } 126 | 127 | func (db *EsScoreDb) ParseQuery(query string) map[string]float32 { 128 | fields := strings.Split(query, ",") 129 | coefs := make(map[string]float32) 130 | for _, f := range fields { 131 | fieldparts := strings.Split(f, "=") 132 | if len(fieldparts) != 2 { 133 | log.Fatalf("ERROR: malformed query\n") 134 | } 135 | val, _ := strconv.ParseFloat(fieldparts[1], 32) 136 | coefs[fieldparts[0]] = float32(val) 137 | } 138 | return coefs 139 | } 140 | 141 | /* 142 | var ( 143 | deleteflag = flag.Bool("delete", false, "delete data from elasticsearch") 144 | queryflag = flag.String("query", "", "column_name=weighting_factor,...") 145 | urlflag = flag.String("esurl", "http://localhost:9200/", "URL to elasticsearch instance with trailing slash") 146 | indexflag = flag.String("index", "scoredb", "Elasticsearch index name") 147 | ) 148 | 149 | func main() { 150 | flag.Parse() 151 | db := NewEsScoreDb{BaseUrl: *urlflag, Index: *indexflag} 152 | if *deleteflag { 153 | db.DeleteData() 154 | } else if len(*queryflag) > 0 { 155 | db.LinearQuery(10, db.ParseQuery(*queryflag)) 156 | } else { 157 | fmt.Println("need to use --query querystring, or --delete") 158 | } 159 | } 160 | */ 161 | -------------------------------------------------------------------------------- /fielddocitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "container/heap" 5 | //"fmt" 6 | "math" 7 | //"time" 8 | ) 9 | 10 | type FieldDocItr struct { 11 | field string 12 | score float32 13 | docId int64 14 | min, max float32 15 | lists FieldDocItrs 16 | } 17 | 18 | func NewFieldDocItr(field string, lists FieldDocItrs) *FieldDocItr { 19 | itr := &FieldDocItr{ 20 | field: field, 21 | score: 0.0, 22 | docId: -1, 23 | lists: lists, 24 | } 25 | min, max := float32(math.Inf(1)), float32(math.Inf(-1)) 26 | for _, docItr := range lists { 27 | curMin, curMax := docItr.GetBounds() 28 | 
if curMin < min { 29 | min = curMin 30 | } 31 | if curMax > max { 32 | max = curMax 33 | } 34 | } 35 | itr.min, itr.max = min, max 36 | return itr 37 | } 38 | 39 | type FieldDocItrs []DocItr // FieldDocItrs implements heap.Interface 40 | func (so FieldDocItrs) Len() int { return len(so) } 41 | func (so FieldDocItrs) Less(i, j int) bool { 42 | d1, _ := so[i].Cur() 43 | d2, _ := so[j].Cur() 44 | return d1 < d2 45 | } 46 | func (so *FieldDocItrs) Pop() interface{} { 47 | old := *so 48 | n := len(old) 49 | item := old[n-1] 50 | *so = old[0 : n-1] 51 | return item 52 | } 53 | func (so *FieldDocItrs) Push(x interface{}) { 54 | *so = append(*so, x.(DocItr)) 55 | } 56 | func (so FieldDocItrs) Swap(i, j int) { 57 | so[i], so[j] = so[j], so[i] 58 | } 59 | 60 | func (op *FieldDocItr) Name() string { return "FieldDocItr" } 61 | func (op *FieldDocItr) Cur() (int64, float32) { 62 | return op.docId, op.score 63 | } 64 | func (op *FieldDocItr) GetBounds() (min, max float32) { 65 | return op.min, op.max 66 | } 67 | func (op *FieldDocItr) SetBounds(min, max float32) bool { 68 | op.min = min 69 | op.max = max 70 | for { 71 | keepGoing := false 72 | anyMore := false 73 | for idx, subOp := range op.lists { 74 | if subOp.SetBounds(min, max) { 75 | anyMore = true 76 | } else { 77 | subOp.Close() 78 | lists := op.lists 79 | lists[idx] = lists[len(lists)-1] 80 | op.lists = lists[:len(lists)-1] 81 | keepGoing = true 82 | //fmt.Printf("%08d Field elim @doc %08d, %05d remain (%s)\n", time.Now().UnixNano() % 100000000, op.docId, len(op.lists), op.field) 83 | break 84 | } 85 | } 86 | if !keepGoing { 87 | return anyMore 88 | } 89 | heap.Init(&op.lists) 90 | } 91 | } 92 | 93 | func (op *FieldDocItr) Close() { 94 | for _, list := range op.lists { 95 | list.Close() 96 | } 97 | } 98 | 99 | func (op *FieldDocItr) Next(minId int64) bool { 100 | if len(op.lists) == 0 { 101 | return false 102 | } 103 | var docId int64 104 | var score float32 105 | for { 106 | docId, score = op.lists[0].Cur() 107 | if 
docId >= minId { 108 | break 109 | } 110 | if !op.lists[0].Next(minId) { 111 | op.lists[0].Close() 112 | heap.Remove(&op.lists, 0) 113 | if len(op.lists) == 0 { 114 | //fmt.Printf("FieldDocItr Next(%v) %v END\n", minId, op.field) 115 | return false 116 | } 117 | } else { 118 | heap.Fix(&op.lists, 0) 119 | } 120 | } 121 | op.docId = docId 122 | op.score = score 123 | //fmt.Printf("FieldDocItr Next(%v) %v %v %v\n", minId, op.field, op.docId, op.score) 124 | return true 125 | } 126 | -------------------------------------------------------------------------------- /fielddocitr_test.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestFieldOp(t *testing.T) { 8 | l1 := NewMemoryDocItr( 9 | []float32{1.0, 1.0, 0.5, 1.0, 0.5}, 10 | []int64{1, 5, 7, 8, 9}, 11 | ) 12 | l2 := NewMemoryDocItr( 13 | []float32{1.0, 1.0}, 14 | []int64{2, 5}, 15 | ) 16 | fieldop := FieldDocItr{lists: FieldDocItrs{l1, l2}} 17 | if !fieldop.Next(0) { 18 | t.FailNow() 19 | } 20 | docId, _ := fieldop.Cur() 21 | if docId != 1 { 22 | t.FailNow() 23 | } 24 | if !fieldop.Next(2) { 25 | t.FailNow() 26 | } 27 | docId, _ = fieldop.Cur() 28 | if docId != 2 { 29 | t.FailNow() 30 | } 31 | if !fieldop.Next(3) { 32 | t.FailNow() 33 | } 34 | docId, _ = fieldop.Cur() 35 | if docId != 5 { 36 | t.FailNow() 37 | } 38 | if !fieldop.SetBounds(0.75, 1.0) { 39 | t.FailNow() 40 | } 41 | if !fieldop.Next(6) { 42 | t.FailNow() 43 | } 44 | docId, _ = fieldop.Cur() 45 | if docId != 8 { 46 | t.FailNow() 47 | } 48 | if fieldop.Next(9) { 49 | t.FailNow() 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /fsscoredb.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "encoding/binary" 5 | "errors" 6 | "fmt" 7 | "io/ioutil" 8 | "math" 9 | "os" 10 | "path" 11 | "strconv" 12 | //"time" 13 | ) 14 | 15 | func 
NewFsScoreDb(dataDir string) *FsScoreDb { 16 | err := EnsureDirectory(dataDir) 17 | if err != nil { 18 | panic(err) 19 | } 20 | fields := make(map[string]OrderedFileInfos) 21 | 22 | // Load pre-existing file headers 23 | highestId := int64(0) 24 | fieldNames, err := ioutil.ReadDir(dataDir) 25 | if err != nil { 26 | panic(err) 27 | } 28 | for _, fieldName := range fieldNames { 29 | fieldPath := path.Join(dataDir, fieldName.Name()) 30 | fields[fieldPath] = make(OrderedFileInfos, 0) 31 | dataFiles, err := ioutil.ReadDir(fieldPath) 32 | if err != nil { 33 | panic(err) 34 | } 35 | for _, dataFile := range dataFiles { 36 | numVarBits := 32 - len(dataFile.Name()) 37 | prefixVal, err := strconv.ParseInt(dataFile.Name(), 2, 32) 38 | if err != nil { 39 | continue 40 | } 41 | dataFilePath := path.Join(fieldPath, dataFile.Name()) 42 | fd, err := os.OpenFile(dataFilePath, os.O_RDONLY, 0) 43 | if err != nil { 44 | panic(err) 45 | } 46 | var header PostingListHeader 47 | err = binary.Read(fd, binary.LittleEndian, &header) 48 | if err != nil { 49 | panic(err) 50 | } 51 | fd.Close() 52 | if header.LastDocId > highestId { 53 | highestId = header.LastDocId 54 | } 55 | fileInfo := &FileInfo{ 56 | header: &header, 57 | path: dataFilePath, 58 | numVariableBits: uint(numVarBits), 59 | minVal: math.Float32frombits(uint32(prefixVal << uint(numVarBits))), 60 | } 61 | fields[fieldName.Name()] = append(fields[fieldName.Name()], fileInfo) 62 | } 63 | 64 | } 65 | 66 | //fmt.Printf("INIT fs score db %v (highest id %d)\n", dataDir, highestId) 67 | return &FsScoreDb{ 68 | dataDir: dataDir, 69 | fields: fields, 70 | nextId: highestId + 1, 71 | } 72 | } 73 | 74 | type FsScoreDb struct { 75 | dataDir string 76 | fields map[string]OrderedFileInfos 77 | nextId int64 78 | } 79 | 80 | type PostingListHeader struct { 81 | FirstDocId int64 82 | LastDocId int64 83 | NumDocs int64 84 | MinVal float32 85 | MaxVal float32 86 | FirstDocScore float32 87 | Version uint8 88 | // padding to make struct 8-byte 
// EnsureDirectory creates dir, along with any missing parent directories,
// with mode 0755; it is a no-op if dir already exists.
//
// Fix: the previous hand-rolled recursion discarded the error from creating
// parent directories and raced between the existence check and the Mkdir
// call; os.MkdirAll provides the same semantics with proper error
// propagation.
func EnsureDirectory(dir string) error {
	return os.MkdirAll(dir, 0755)
}
var err error 148 | fieldDir := path.Join(db.dataDir, key) 149 | files, ok := db.fields[key] 150 | if !ok { 151 | db.fields[key] = make(OrderedFileInfos, 0) 152 | files = db.fields[key] 153 | EnsureDirectory(fieldDir) 154 | } 155 | var fileInfo *FileInfo = nil 156 | bestVarBits := uint(32) 157 | // TODO idea here is that we should be able to use the ordering of OrderedFileInfos to 158 | // binary search for the right one; right now this is just a simplistic linear scan 159 | for _, curFileInfo := range files { 160 | numVar := curFileInfo.numVariableBits 161 | if math.Float32bits(curFileInfo.minVal)>>numVar == math.Float32bits(value)>>numVar { 162 | if numVar < bestVarBits { 163 | fileInfo = curFileInfo 164 | bestVarBits = numVar 165 | } 166 | } 167 | } 168 | if fileInfo == nil { // no matching posting list found 169 | fileInfo, err = MakeFileInfo(fieldDir, value, INITIAL_VAR_BITS, docId) 170 | if err != nil { 171 | return nil, err 172 | } 173 | files = append(files, fileInfo) 174 | db.fields[key] = files 175 | if err != nil { 176 | return nil, err 177 | } 178 | } else { 179 | if fileInfo.header.NumDocs >= MaxDocsForFile(fileInfo) { 180 | newBits := uint(fileInfo.numVariableBits - 3) 181 | if newBits < 0 { 182 | newBits = 0 183 | } 184 | fileInfo, err = MakeFileInfo(fieldDir, value, newBits, docId) 185 | if err != nil { 186 | return nil, err 187 | } 188 | files = append(files, fileInfo) 189 | db.fields[key] = files 190 | } 191 | } 192 | 193 | if fileInfo.writer == nil { 194 | numOpenFiles += 1 195 | fd, err := os.OpenFile(fileInfo.path, os.O_RDWR, 0666) 196 | if err != nil { 197 | return nil, err 198 | } 199 | var header PostingListHeader 200 | err = binary.Read(fd, binary.LittleEndian, &header) 201 | if err != nil { 202 | return nil, err 203 | } 204 | fileInfo.header = &header 205 | writer, err := NewBitWriter(fd) 206 | if err != nil { 207 | return nil, err 208 | } 209 | fileInfo.writer = writer 210 | } 211 | return fileInfo, nil 212 | } 213 | 214 | func 
MakeFileInfo(fieldDir string, value float32, numVarBits uint, docId int64) (*FileInfo, error) { 215 | var fd *os.File 216 | var err error 217 | var header PostingListHeader 218 | 219 | scoreBits := math.Float32bits(value) 220 | minVal := math.Float32frombits((scoreBits >> numVarBits) << numVarBits) 221 | numFixedBits := 32 - numVarBits 222 | scoreBitString := fmt.Sprintf("%032b", int64(scoreBits)) 223 | fixedBits := scoreBitString[:numFixedBits] 224 | filename := path.Join(fieldDir, fixedBits) 225 | 226 | if Exists(filename) { 227 | numOpenFiles += 1 228 | fd, err = os.OpenFile(filename, os.O_RDWR, 0666) 229 | if err != nil { 230 | return nil, err 231 | } 232 | err = binary.Read(fd, binary.LittleEndian, &header) 233 | if err != nil { 234 | return nil, err 235 | } 236 | fd.Seek(0, 2) // Goto EOF (whence=2 means "relative to end") 237 | } else { 238 | numOpenFiles += 1 239 | fd, err = os.Create(filename) 240 | if err != nil { 241 | return nil, err 242 | } 243 | header = PostingListHeader{ 244 | Version: 1, 245 | MinVal: value, 246 | MaxVal: value, 247 | FirstDocId: docId, 248 | FirstDocScore: value, 249 | LastDocId: docId, 250 | NumDocs: 1, 251 | } 252 | err = binary.Write(fd, binary.LittleEndian, header) 253 | if err != nil { 254 | return nil, err 255 | } 256 | } 257 | if header.Version != 1 { 258 | return nil, errors.New("Incorrect file version") 259 | } 260 | writer, err := NewBitWriter(fd) 261 | if err != nil { 262 | return nil, err 263 | } 264 | return &FileInfo{ 265 | header: &header, 266 | writer: writer, 267 | path: filename, 268 | numVariableBits: numVarBits, 269 | minVal: minVal, 270 | }, nil 271 | } 272 | 273 | func WritePostingListEntry(fileInfo *FileInfo, docId int64, score float32) { 274 | header := fileInfo.header 275 | docIncr := docId - header.LastDocId 276 | 277 | if docIncr == 0 { 278 | // special case for first entry (it exists in the header, so do not write here) 279 | return 280 | } 281 | 282 | // header maintenance 283 | header.LastDocId = 
docId 284 | header.NumDocs += 1 285 | if score < header.MinVal { 286 | header.MinVal = score 287 | } 288 | if score > header.MaxVal { 289 | header.MaxVal = score 290 | } 291 | scoreBits := math.Float32bits(score) 292 | scoreMask := uint32(0xffffffff) >> (32 - fileInfo.numVariableBits) 293 | scoreRemainder := uint64(scoreBits & scoreMask) 294 | 295 | if scoreRemainder == 0 { 296 | fileInfo.writer.WriteVarUInt32(uint32(docIncr << 1)) 297 | } else { 298 | fileInfo.writer.WriteVarUInt32(uint32((docIncr << 1) | 1)) 299 | fileInfo.writer.WriteBits(scoreRemainder, fileInfo.numVariableBits) 300 | } 301 | 302 | } 303 | 304 | func (op *PostingListDocItr) Close() { 305 | if op.reader != nil { 306 | numOpenFiles -= 1 307 | err := op.reader.Close() 308 | if err != nil { 309 | panic(fmt.Sprintf("%v", err)) 310 | } 311 | } 312 | } 313 | 314 | func (op *PostingListDocItr) Next(minId int64) bool { 315 | reader := op.reader 316 | if reader == nil { 317 | if op.docId == -1 && minId <= op.header.FirstDocId { 318 | op.docId = op.header.FirstDocId 319 | op.score = op.header.FirstDocScore 320 | return true 321 | } else { 322 | //fmt.Printf("%08d Open @doc %08d %s\n", time.Now().UnixNano() % 100000000, minId, op.path) 323 | fd, err := os.OpenFile(op.path, os.O_RDONLY, 0) 324 | numOpenFiles += 1 325 | if err != nil { 326 | panic(fmt.Sprintf("%v", err)) 327 | } 328 | _, err = fd.Seek(HEADER_SIZE, 0) 329 | if err != nil { 330 | panic(fmt.Sprintf("%v", err)) 331 | } 332 | reader, err = NewBitReader(fd) 333 | if err != nil { 334 | panic(fmt.Sprintf("%v", err)) 335 | } 336 | op.reader = reader 337 | } 338 | } 339 | docId := op.docId 340 | for { 341 | if docId == op.maxDocId { 342 | return false 343 | } 344 | pair, err := reader.ReadVarUInt32() 345 | if err != nil { 346 | panic(fmt.Sprintf("%v", err)) 347 | } 348 | docIncr := pair >> 1 349 | var valueBits uint64 350 | if pair&1 == 1 { 351 | valueBits, err = reader.ReadBits(op.numVarBits) 352 | if err != nil { 353 | panic(fmt.Sprintf("%v", err)) 
354 | } 355 | } 356 | if docIncr == 0 { 357 | panic(fmt.Sprintf("Inconsistent file data @ %v %v", reader.MmapPtr*8, op.path)) 358 | } 359 | docId += int64(docIncr) 360 | if docId < minId { 361 | continue 362 | } 363 | score := math.Float32frombits(op.rangePrefix | uint32(valueBits)) 364 | op.docId = docId 365 | op.score = score 366 | return true 367 | } 368 | } 369 | 370 | func (db *FsScoreDb) BulkIndex(records []map[string]float32) ([]int64, error) { 371 | ids := make([]int64, len(records)) 372 | for idx, record := range records { 373 | docid := db.nextId 374 | db.nextId += 1 375 | for key, value := range record { 376 | fileInfo, err := FindPostingListFileForWrite(db, docid, key, value) 377 | if err != nil { 378 | return nil, err 379 | } 380 | WritePostingListEntry(fileInfo, docid, value) 381 | ids[idx] = docid 382 | } 383 | } 384 | CloseWriters(db) 385 | return ids, nil 386 | } 387 | 388 | func CloseWriters(db *FsScoreDb) error { 389 | for _, fieldIndex := range db.fields { 390 | for idx, fileInfo := range fieldIndex { 391 | writer := fileInfo.writer 392 | if writer == nil { 393 | continue 394 | } 395 | origPos, err := writer.File.Seek(0, 1) // save position to restore later 396 | if err != nil { 397 | return err 398 | } 399 | _, err = writer.File.Seek(0, 0) 400 | if err != nil { 401 | return err 402 | } 403 | err = binary.Write(writer.File, binary.LittleEndian, fileInfo.header) 404 | if err != nil { 405 | return err 406 | } 407 | _, err = writer.File.Seek(origPos, 0) 408 | if err != nil { 409 | return err 410 | } 411 | err = writer.Close() 412 | if err != nil { 413 | return err 414 | } 415 | numOpenFiles -= 1 416 | fieldIndex[idx].writer = nil 417 | } 418 | } 419 | return nil 420 | } 421 | 422 | func (db *FsScoreDb) Index(record map[string]float32) (int64, error) { 423 | docid := db.nextId 424 | db.nextId += 1 425 | for key, value := range record { 426 | fileInfo, err := FindPostingListFileForWrite(db, docid, key, value) 427 | if err != nil { 428 | return -1, 
err 429 | } 430 | WritePostingListEntry(fileInfo, docid, value) 431 | } 432 | CloseWriters(db) 433 | return docid, nil 434 | } 435 | 436 | func (db *FsScoreDb) FieldDocItr(fieldName string) DocItr { 437 | files, ok := db.fields[fieldName] 438 | if !ok { 439 | return NewMemoryScoreDocItr([]float32{}) 440 | } 441 | itrs := make([]DocItr, len(files)) 442 | for fileIdx, fileInfo := range files { 443 | itrs[fileIdx] = NewPostingListDocItr(math.Float32bits(fileInfo.minVal), fileInfo.path, fileInfo.header, fileInfo.numVariableBits) 444 | } 445 | return NewFieldDocItr(fieldName, itrs) 446 | } 447 | 448 | type PostingListDocItr struct { 449 | score float32 450 | docId int64 451 | maxDocId int64 452 | min, max float32 453 | numVarBits uint 454 | rangePrefix uint32 455 | path string 456 | reader *BitReader 457 | header *PostingListHeader 458 | } 459 | 460 | func NewPostingListDocItr(rangePrefix uint32, path string, header *PostingListHeader, numVarBits uint) DocItr { 461 | itr := &PostingListDocItr{ 462 | score: 0.0, 463 | docId: -1, 464 | maxDocId: header.LastDocId, 465 | min: header.MinVal, 466 | max: header.MaxVal, 467 | numVarBits: numVarBits, 468 | rangePrefix: rangePrefix, 469 | path: path, 470 | header: header, 471 | } 472 | return itr 473 | } 474 | 475 | func (op *PostingListDocItr) Name() string { return "PostingListDocItr" } 476 | func (op *PostingListDocItr) Cur() (int64, float32) { 477 | return op.docId, op.score 478 | } 479 | func (op *PostingListDocItr) GetBounds() (min, max float32) { 480 | return op.min, op.max 481 | } 482 | func (op *PostingListDocItr) SetBounds(min, max float32) bool { 483 | if min > op.min { 484 | op.min = min 485 | } 486 | if max < op.max { 487 | op.max = max 488 | } 489 | if op.min > op.max { 490 | return false 491 | } 492 | return true 493 | } 494 | -------------------------------------------------------------------------------- /fsscoredb_test.go: -------------------------------------------------------------------------------- 1 | 
// serializeIds renders a list of internal doc ids as a JSON array string.
func serializeIds(ids []int64) (string, error) {
	encoded, err := json.Marshal(ids)
	if err != nil {
		return "", err
	}
	return string(encoded), nil
}
else { 44 | return float32(f64), nil 45 | } 46 | } 47 | 48 | func (sds *ScoreDbServer) ServeHTTP(w http.ResponseWriter, req *http.Request) { 49 | p := req.URL.Path 50 | if p[0] == '/' { 51 | p = p[1:] 52 | } 53 | 54 | if req.Method == "PUT" && !sds.ReadOnly { 55 | 56 | b, err := ioutil.ReadAll(req.Body) 57 | if err != nil { 58 | http.Error(w, "Could not read request body", 400) 59 | return 60 | } 61 | var records []Record 62 | if len(p) > 0 { 63 | var values map[string]float32 64 | err = json.Unmarshal(b, &values) 65 | if err == nil { 66 | records = append(records, Record{Id: p, Values: values}) 67 | } 68 | } else { 69 | err = json.Unmarshal(b, &records) 70 | } 71 | if err != nil { 72 | http.Error(w, fmt.Sprintf("Could not parse json: %v", err), 400) 73 | return 74 | } 75 | err = sds.Db.BulkIndex(records) 76 | if err != nil { 77 | http.Error(w, "Could not index data", 500) 78 | return 79 | } 80 | 81 | } else if req.Method == "GET" && len(p) == 0 { 82 | 83 | queryParams := req.URL.Query() 84 | 85 | offset, err := QueryIntVal(queryParams, "offset", 0) 86 | if err != nil { 87 | http.Error(w, "Invalid value for offset", 400) 88 | return 89 | } 90 | 91 | limit, err := QueryIntVal(queryParams, "limit", 10) 92 | if err != nil { 93 | http.Error(w, "Invalid value for limit", 400) 94 | return 95 | } 96 | 97 | minScore, err := QueryFloatVal(queryParams, "minScore", float32(math.Inf(-1))) 98 | if err != nil { 99 | http.Error(w, "Invalid value for minscore", 400) 100 | return 101 | } 102 | 103 | scorerStrings, ok := queryParams["score"] 104 | if !ok || len(scorerStrings) == 0 { 105 | http.Error(w, "No score function was specified", 400) 106 | return 107 | } 108 | scorer := new([]interface{}) 109 | err = json.Unmarshal([]byte(scorerStrings[0]), scorer) 110 | if err != nil { 111 | http.Error(w, "Score parameter is not a valid JSON array", 400) 112 | return 113 | } 114 | 115 | query := Query{ 116 | Offset: offset, 117 | Limit: limit, 118 | MinScore: minScore, 119 | Scorer: 
*scorer, 120 | } 121 | 122 | results, err := sds.Db.Query(query) 123 | if err != nil { 124 | fmt.Printf("Internal error. %+v: %v\n", query, err) 125 | http.Error(w, "Internal Error in ScoreDB; please report", 500) 126 | return 127 | } 128 | response, err := json.Marshal(results) 129 | if err != nil { 130 | fmt.Printf("Internal error. %+v: %v\n", query, err) 131 | http.Error(w, "Internal Error in ScoreDB; please report", 500) 132 | return 133 | } 134 | fmt.Fprintf(w, "%s\n", response) 135 | 136 | } else { 137 | 138 | http.NotFound(w, req) 139 | return 140 | 141 | } 142 | } 143 | 144 | func ServeHttp(addr string, db Db, readOnly bool) error { 145 | scoreDbServer := ScoreDbServer{Db: db, ReadOnly: readOnly} 146 | return http.ListenAndServe(addr, &scoreDbServer) 147 | } 148 | -------------------------------------------------------------------------------- /memorydb.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | ) 7 | 8 | func NewMemoryIdDb() MemoryIdDb { 9 | return MemoryIdDb{make(map[int64]string)} 10 | } 11 | 12 | type MemoryIdDb struct { 13 | bindings map[int64]string 14 | } 15 | 16 | func (db MemoryIdDb) Put(scoreIds []int64, clientIds []string) error { 17 | for idx, scoreId := range scoreIds { 18 | db.bindings[scoreId] = clientIds[idx] 19 | } 20 | return nil 21 | } 22 | 23 | func (db MemoryIdDb) Get(scoreIds []int64) ([]string, error) { 24 | result := make([]string, len(scoreIds)) 25 | for idx, scoreId := range scoreIds { 26 | clientId, ok := db.bindings[scoreId] 27 | if !ok { 28 | return nil, fmt.Errorf("Unable to find client id for internal id %d", scoreId) 29 | 30 | } 31 | result[idx] = clientId 32 | } 33 | return result, nil 34 | } 35 | 36 | type MemoryScoreDb struct { 37 | Fields map[string][]float32 38 | nextId int64 39 | } 40 | 41 | func NewMemoryScoreDb() *MemoryScoreDb { 42 | return &MemoryScoreDb{ 43 | Fields: make(map[string][]float32), 44 | nextId: 1, 45 
| } 46 | } 47 | 48 | func (db *MemoryScoreDb) BulkIndex(records []map[string]float32) ([]int64, error) { 49 | fields := db.Fields 50 | ids := make([]int64, len(records)) 51 | for idx, record := range records { 52 | ids[idx] = db.nextId 53 | db.nextId += 1 54 | for key, value := range record { 55 | _, ok := fields[key] 56 | if !ok { 57 | fields[key] = make([]float32, 0, 64) 58 | } 59 | fields[key] = append(fields[key], value) 60 | } 61 | } 62 | return ids, nil 63 | } 64 | 65 | func (db *MemoryScoreDb) FieldDocItr(fieldName string) DocItr { 66 | scores := db.Fields[fieldName] 67 | return NewMemoryScoreDocItr(scores) 68 | } 69 | 70 | func NewMemoryScoreDocItr(scores []float32) *MemoryScoreDocItr { 71 | min, max := float32(math.Inf(1)), float32(math.Inf(-1)) 72 | for _, score := range scores { 73 | if score < min { 74 | min = score 75 | } 76 | if score > max { 77 | max = score 78 | } 79 | } 80 | return &MemoryScoreDocItr{ 81 | scores: scores, 82 | idx: -1, 83 | min: min, 84 | max: max, 85 | } 86 | } 87 | 88 | type MemoryScoreDocItr struct { 89 | scores []float32 90 | idx int 91 | min, max float32 92 | } 93 | 94 | func (op *MemoryScoreDocItr) Name() string { return "MemoryScoreDocItr" } 95 | func (op *MemoryScoreDocItr) Cur() (int64, float32) { 96 | idx := op.idx 97 | if idx < 0 || idx >= len(op.scores) { 98 | return -1, 0.0 99 | } 100 | return int64(idx + 1), op.scores[idx] 101 | 102 | } 103 | func (op *MemoryScoreDocItr) GetBounds() (min, max float32) { 104 | return op.min, op.max 105 | } 106 | func (op *MemoryScoreDocItr) SetBounds(min, max float32) bool { 107 | op.min = Max(op.min, min) 108 | op.max = Min(op.max, max) 109 | return true 110 | } 111 | 112 | func (op *MemoryScoreDocItr) Close() { 113 | } 114 | 115 | func (op *MemoryScoreDocItr) Next(minId int64) bool { 116 | if minId == 0 { 117 | minId = 1 118 | } 119 | op.idx = int(minId - 1) 120 | return op.idx < len(op.scores) 121 | } 122 | 
// MemoryDocItr iterates a fixed in-memory posting list: docs[i] (ascending
// doc ids) scored by scores[i]. Entries whose score falls outside the
// current [min, max] bounds are skipped during iteration.
type MemoryDocItr struct {
	score    float32
	docId    int64
	min, max float32

	scores []float32
	docs   []int64
	index  int
}

// NewMemoryDocItr creates an uninitialized iterator — Cur reports
// (-1, 0.0) until the first successful Next — with unbounded score limits.
func NewMemoryDocItr(scores []float32, docs []int64) *MemoryDocItr {
	return &MemoryDocItr{
		score:  0.0,
		docId:  -1,
		min:    float32(math.Inf(-1)),
		max:    float32(math.Inf(1)),
		scores: scores,
		docs:   docs,
		index:  -1,
	}
}

// Cur returns the current doc id and score.
func (op *MemoryDocItr) Cur() (int64, float32) {
	return op.docId, op.score
}

func (op *MemoryDocItr) GetBounds() (min, max float32) { return op.min, op.max }

// SetBounds narrows the score window; it returns false (leaving the
// iterator untouched) when the requested window is disjoint from the
// current one.
func (op *MemoryDocItr) SetBounds(min, max float32) bool {
	if min > op.max || max < op.min {
		return false
	}
	op.min = float32(math.Max(float64(op.min), float64(min)))
	op.max = float32(math.Min(float64(op.max), float64(max)))
	return true
}

func (op *MemoryDocItr) Name() string { return "MemoryDocItr" }

func (op *MemoryDocItr) Close() {}

// Next advances to the first entry whose doc id is >= minId and whose score
// lies within the current bounds, returning false once the list is
// exhausted.
func (op *MemoryDocItr) Next(minId int64) bool {
	for op.index++; op.index < len(op.docs); op.index++ {
		id := op.docs[op.index]
		if id < minId {
			continue
		}
		s := op.scores[op.index]
		if s < op.min || s > op.max {
			continue
		}
		op.docId = id
		op.score = s
		return true
	}
	return false
}
| func (op *MinDocItr) Name() string { return "MinDocItr" } 49 | func (op *MinDocItr) Cur() (int64, float32) { 50 | return op.docId, op.score 51 | } 52 | func (op *MinDocItr) GetBounds() (min, max float32) { return op.min, op.max } 53 | func (op *MinDocItr) Close() { 54 | for _, part := range op.parts { 55 | part.Close() 56 | } 57 | } 58 | 59 | func (op *MinDocItr) Next(minId int64) bool { 60 | min, max := op.min, op.max 61 | keepGoing := true 62 | var score float32 63 | for keepGoing { 64 | //fmt.Printf("MinDocItr Next itr (%v) [%v:%v] = %v score:%v\n", minId, op.min, op.max, op.docId, score) 65 | keepGoing = false 66 | score = PositiveInfinity 67 | for _, part := range op.parts { 68 | var curDocId int64 69 | var curScore float32 70 | for { 71 | curDocId, curScore = part.Cur() 72 | if curDocId >= minId { 73 | break 74 | } 75 | if !part.Next(minId) { 76 | return false 77 | } 78 | } 79 | if curDocId > minId { 80 | minId = curDocId 81 | keepGoing = true 82 | break 83 | } 84 | score = Min(score, curScore) 85 | } 86 | if !keepGoing { 87 | if score < min || score > max { 88 | minId += 1 89 | keepGoing = true 90 | } 91 | } 92 | } 93 | op.docId = minId 94 | op.score = score 95 | //fmt.Printf("MinDocItr Next(%v) [%v:%v] = %v score:%v\n", minId, op.min, op.max, op.docId, score) 96 | return true 97 | } 98 | 99 | func (op *MinDocItr) SetBounds(min, max float32) bool { 100 | fmt.Printf("MinDocItr SetBounds %v %v\n", min, max) 101 | op.min = min 102 | for _, component := range op.parts { 103 | curMin, curMax := component.GetBounds() 104 | if curMin < min { 105 | //fmt.Printf("MinDocItr SetBounds for component %v %v\n", min, curMax) 106 | if !component.SetBounds(min, curMax) { 107 | return false 108 | } 109 | } 110 | } 111 | return true 112 | } 113 | -------------------------------------------------------------------------------- /powdocitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "math" 5 | ) 6 
| 7 | // Takes a constant power of a value. 8 | // Important: for bounds caluclation reasons, assumes only positive values are provided as inputs! 9 | type PowDocItr struct { 10 | exp, oneOverExp float32 11 | itr DocItr 12 | } 13 | 14 | func NewPowDocItr(itr DocItr, exp float32) *PowDocItr { 15 | return &PowDocItr{exp: exp, oneOverExp: 1.0 / exp, itr: itr} 16 | } 17 | 18 | func Pow(val, exp float32) float32 { 19 | return float32(math.Pow(float64(val), float64(exp))) 20 | } 21 | 22 | func (op *PowDocItr) Name() string { return "PowDocItr" } 23 | func (op *PowDocItr) Cur() (int64, float32) { 24 | docId, score := op.itr.Cur() 25 | return docId, Pow(score, op.exp) 26 | } 27 | func (op *PowDocItr) Close() { 28 | op.itr.Close() 29 | } 30 | func (op *PowDocItr) Next(minId int64) bool { 31 | ret := op.itr.Next(minId) 32 | return ret 33 | } 34 | func (op *PowDocItr) GetBounds() (min, max float32) { 35 | exp := op.exp 36 | min, max = op.itr.GetBounds() 37 | v1 := Pow(min, exp) 38 | v2 := Pow(max, exp) 39 | if v1 < v2 { 40 | return v1, v2 41 | } else { 42 | return v2, v1 43 | } 44 | } 45 | func (op *PowDocItr) SetBounds(min, max float32) bool { 46 | min = Max(0, min) 47 | max = Max(0, max) 48 | oneOverExp := op.oneOverExp 49 | v1 := Pow(min, oneOverExp) 50 | v2 := Pow(max, oneOverExp) 51 | if v1 < v2 { 52 | return op.itr.SetBounds(v1, v2) 53 | } else { 54 | return op.itr.SetBounds(v2, v1) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /productdocitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "sort" 5 | ) 6 | 7 | type ProductComponents []DocItr 8 | 9 | func (a ProductComponents) Len() int { return len(a) } 10 | func (a ProductComponents) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 11 | func (a ProductComponents) Less(i, j int) bool { 12 | min1, max1 := a[i].GetBounds() 13 | min2, max2 := a[j].GetBounds() 14 | return max1-min1 > max2-min2 15 | } 
// ProductDocItr combines several component iterators by multiplying their
// scores together. Bounds math assumes positive component scores (see the
// note in NewProductDocItr).
type ProductDocItr struct {
	score float32
	docId int64
	min, max float32
	parts ProductComponents
}

// NewProductDocItr builds a product iterator over itrs. Overall bounds are
// the products of the component bounds, and the components are sorted
// (widest score range first) so the most selective iterator is consulted
// first in Next.
func NewProductDocItr(itrs []DocItr) *ProductDocItr {
	min, max := float32(0.0), float32(0.0)
	components := make(ProductComponents, len(itrs))
	for idx, part := range itrs {
		curMin, curMax := part.GetBounds()
		//fmt.Printf("Init %v %v %v\n", idx, curMin, curMax)
		components[idx] = part
		if idx == 0 {
			min, max = curMin, curMax
		} else {
			// assumes positive inputs:
			min *= curMin
			max *= curMax
		}
	}
	sort.Sort(components)
	return &ProductDocItr{
		score: 0.0,
		docId: -1,
		min:   min,
		max:   max,
		parts: components,
	}
}

// Name identifies this iterator type.
func (op *ProductDocItr) Name() string { return "ProductDocItr" }

// Cur returns the current document id and its product score.
func (op *ProductDocItr) Cur() (int64, float32) {
	return op.docId, op.score
}

// GetBounds reports the currently known [min, max] score range.
func (op *ProductDocItr) GetBounds() (min, max float32) { return op.min, op.max }

// Close releases all component iterators.
func (op *ProductDocItr) Close() {
	for _, part := range op.parts {
		part.Close()
	}
}

// Next advances all components to a common document id >= minId and
// multiplies their scores, skipping documents whose product falls outside
// [op.min, op.max]. Returns false once any component is exhausted.
func (op *ProductDocItr) Next(minId int64) bool {
	min, max := op.min, op.max
	keepGoing := true
	var score float32
	for keepGoing {
		//fmt.Printf("ProductDocItr Next itr (%v) [%v:%v] = %v score:%v\n", minId, op.min, op.max, op.docId, score)
		keepGoing = false
		score = float32(1.0)
		for _, part := range op.parts {
			var curDocId int64
			var curScore float32
			for {
				// Advance this component until it reaches or passes minId.
				curDocId, curScore = part.Cur()
				if curDocId >= minId {
					break
				}
				if !part.Next(minId) {
					return false
				}
			}
			if curDocId > minId {
				// This component skipped past minId; restart the scan there.
				minId = curDocId
				keepGoing = true
				break
			}
			score *= curScore
		}
		if !keepGoing {
			if score < min || score > max {
				// Product out of bounds; try the next document id.
				minId += 1
				keepGoing = true
			}
		}
	}
	op.docId = minId
	op.score = score
	//fmt.Printf("ProductDocItr Next(%v) [%v:%v] = %v score:%v\n", minId, op.min, op.max, op.docId, score)
	return true
}

// SetBounds narrows the score range of interest to [min, max] and derives
// per-component bounds by dividing out the other components' extremes
// (newMin/newMax accumulate across the inner loop, i.e. they are divided by
// the product of the others' max/min respectively).
func (op *ProductDocItr) SetBounds(min, max float32) bool {
	//fmt.Printf("ProductDocItr SetBounds %v %v\n", min, max)
	op.min = min
	op.max = max

	for curfield, component := range op.parts {
		newMin, newMax := min, max
		for otherfactor, otherComponent := range op.parts {
			// Then divide by the other maxes or mins
			if curfield != otherfactor {
				otherMin, otherMax := otherComponent.GetBounds()
				if otherMax == 0.0 {
					// Avoid dividing by zero; no lower constraint derivable.
					newMin = 0.0
				} else {
					newMin /= otherMax
				}
				if otherMin == 0.0 {
					// Avoid dividing by zero; no upper constraint derivable.
					newMax = PositiveInfinity
				} else {
					newMax /= otherMin
				}
			}
		}
		// Clamp the derived range to the component's own bounds.
		curMin, curMax := component.GetBounds()
		if newMin < curMin {
			newMin = curMin
		}
		if newMax > curMax {
			newMax = curMax
		}
		if newMin != curMin || newMax != curMax {
			//fmt.Printf("ProductDocItr SetBounds for component %v %v\n", newMin, newMax)
			component.SetBounds(newMin, newMax)
		}
	}
	return true
}
-------------------------------------------------------------------------------- /scale_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pschanely/scoredb/57beea075b4b5a53ee0a27b9752a0ca544c4510d/scale_performance.png -------------------------------------------------------------------------------- /scaledocitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import () 4 | 5 | // Multiplies a value by a constant 6 | type ScaleDocItr struct { 7 | factor float32 8 | docItr DocItr 9 | } 10 | 11 | func (op *ScaleDocItr) Name() string { return "ScaleDocItr" } 12 | func (op *ScaleDocItr) Cur() (int64, float32) { 13 | docId, score := op.docItr.Cur() 14 | return docId, score * op.factor 15 | } 16 | func (op *ScaleDocItr) GetBounds() (min, max float32) { 17 | min, max = op.docItr.GetBounds() 18 | factor := op.factor 19 | if factor >= 0 { 20 | return min * op.factor, max * op.factor 21 | } else { 22 | return max * op.factor, min * op.factor 23 | } 24 | } 25 | func (op *ScaleDocItr) Close() { 26 | op.docItr.Close() 27 | } 28 | func (op *ScaleDocItr) Next(minId int64) bool { 29 | return op.docItr.Next(minId) 30 | } 31 | 32 | func (op *ScaleDocItr) SetBounds(min, max float32) bool { 33 | factor := op.factor 34 | if factor >= 0 { 35 | return op.docItr.SetBounds(min/op.factor, max/op.factor) 36 | } else { 37 | return op.docItr.SetBounds(max/op.factor, min/op.factor) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /scoredb/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "encoding/json" 6 | "flag" 7 | "fmt" 8 | "github.com/pschanely/scoredb" 9 | "log" 10 | "os" 11 | "path" 12 | "runtime" 13 | "strings" 14 | "time" 15 | ) 16 | 17 | func MakeStandardDb(dataDir string, numShards int) 
(*scoredb.BaseDb, error) { 18 | var shards []scoredb.StreamingDb 19 | 20 | if scoredb.Exists(dataDir) && scoredb.Exists(path.Join(dataDir, "shard.0")) { 21 | i := 0 22 | shards = make([]scoredb.StreamingDb, 0, numShards) 23 | for { 24 | shardDir := path.Join(dataDir, fmt.Sprintf("shard.%d", i)) 25 | if scoredb.Exists(shardDir) { 26 | shards = append(shards, scoredb.BaseStreamingDb{scoredb.NewFsScoreDb(shardDir)}) 27 | } else { 28 | break 29 | } 30 | i += 1 31 | } 32 | } else { 33 | shards = make([]scoredb.StreamingDb, numShards) 34 | for i := range shards { 35 | shardDir := path.Join(dataDir, fmt.Sprintf("shard.%d", i)) 36 | shards[i] = scoredb.BaseStreamingDb{scoredb.NewFsScoreDb(shardDir)} 37 | } 38 | } 39 | idDb, err := scoredb.NewBoltIdDb(path.Join(dataDir, "iddb")) 40 | if err != nil { 41 | return nil, err 42 | } 43 | return &scoredb.BaseDb{ 44 | StreamingDb: scoredb.ShardedDb{ 45 | Shards: shards, 46 | }, 47 | IdDb: idDb, 48 | }, nil 49 | } 50 | 51 | func watchDir(db *scoredb.MigratableDb, baseDir string, namePrefix string) { 52 | log.Printf("Watching for databases at %s%s*\n", baseDir, namePrefix) 53 | var lastName = "" 54 | for { 55 | dir, err := os.Open(baseDir) 56 | var fileInfos []os.FileInfo 57 | if err == nil { 58 | fileInfos, err = dir.Readdir(0) 59 | dir.Close() 60 | } 61 | if err != nil { 62 | log.Printf("Unable to read %v: %v\n", dir, err) 63 | time.Sleep(55 * time.Second) 64 | } else { 65 | var newDbName = "" 66 | for _, fileInfo := range fileInfos { 67 | name := fileInfo.Name() 68 | if strings.HasPrefix(name, namePrefix) { 69 | if name > newDbName { 70 | newDbName = name 71 | } 72 | } 73 | } 74 | if newDbName > lastName { 75 | fmt.Printf("Detected database at %s%s\n", baseDir, newDbName) 76 | fullDbName := path.Join(baseDir, newDbName) 77 | newDb, err := MakeStandardDb(fullDbName, 1) 78 | if err != nil { 79 | log.Printf("Unable to load database at %s%s (%v); ignoring\n", dir, fullDbName, err) 80 | } else { 81 | fmt.Printf("The database at %s%s is 
live at %v\n", baseDir, fullDbName, time.Now().Unix()) 82 | db.Current = newDb 83 | lastName = newDbName 84 | } 85 | } 86 | } 87 | time.Sleep(10 * time.Second) 88 | } 89 | } 90 | 91 | func SetupDirLoading(databaseDir string) *scoredb.MigratableDb { 92 | migratable := scoredb.MigratableDb{Current: nil} 93 | baseDir, namePrefix := path.Split(databaseDir) 94 | fmt.Printf("Watching for new databases named %s* in %s\n", namePrefix, baseDir) 95 | go watchDir(&migratable, baseDir, namePrefix) 96 | return &migratable 97 | } 98 | 99 | func main() { 100 | 101 | serveCommand := flag.NewFlagSet("serve", flag.ExitOnError) 102 | servePort := serveCommand.Int("port", 11625, "listening port in http mode, defaults to 11625") 103 | serveIntf := serveCommand.String("interface", "", "network interface to listen on in http mode, defaults to empty string (any interface)") 104 | serveDataDir := serveCommand.String("datadir", "./data", "Storage directory for database") 105 | serveNumShards := serveCommand.Int("numshards", 4, "Number of shards") 106 | serveReadOnly := serveCommand.Bool("readonly", false, "Only allow GET requests") 107 | serveAutoMigrate := serveCommand.Bool("automigrate", false, "When new directories appear matching *, atomically swap in the database at that directory. 
(lexigraphically last)") 108 | 109 | loadCommand := flag.NewFlagSet("load", flag.ExitOnError) 110 | loadDataDir := loadCommand.String("datadir", "./data", "Storage directory for database") 111 | loadNumShards := loadCommand.Int("numshards", 4, "Number of shards (ignored if db already exists)") 112 | 113 | benchCommand := flag.NewFlagSet("benchmark", flag.ExitOnError) 114 | benchCsvFilename := benchCommand.String("csv", "", "csv filename of census data") 115 | benchMaxRecords := benchCommand.Int64("maxrecords", 1000*1000, "Maximum size of database to benchmark (in # of records)") 116 | benchCsvOutput := benchCommand.String("out", "output.csv", "csv of performance data to output") 117 | benchEsUrl := benchCommand.String("esurl", "http://localhost:9200/", "URL of elasticsearch instance") 118 | benchEsIndex := benchCommand.String("esindex", "benchmark_scoredb", "Index name to use for elasticsearch") 119 | benchFsDataDir := benchCommand.String("fsdatadir", "./benchmark_data", "Storage directory for native scoredb database") 120 | 121 | /* 122 | for cmd := range([]*flag.FlagSet{serveCommand, benchCommand}) { 123 | // common args here 124 | } 125 | */ 126 | 127 | if len(os.Args) <= 1 { 128 | fmt.Println("usage: scoredb []") 129 | fmt.Println("Commands:") 130 | fmt.Println(" serve Run a scoredb server") 131 | fmt.Println(" load Load json lines from stdin") 132 | fmt.Println(" benchmark Run performance benchmarks") 133 | fmt.Println("For more help, run scoredb -h") 134 | os.Exit(1) 135 | } 136 | var db scoredb.Db 137 | var err error 138 | switch os.Args[1] { 139 | case "serve": 140 | serveCommand.Parse(os.Args[2:]) 141 | if *serveAutoMigrate { 142 | db = SetupDirLoading(*serveDataDir) 143 | } else { 144 | db, err = MakeStandardDb(*serveDataDir, *serveNumShards) 145 | if err != nil { 146 | log.Fatalf("Failed to initialize database at %v: %v\n", *serveDataDir, err) 147 | } 148 | } 149 | addr := fmt.Sprintf("%s:%d", *serveIntf, *servePort) 150 | fmt.Printf("Serving on %s\n", 
addr) 151 | log.Fatal(scoredb.ServeHttp(addr, db, *serveReadOnly)) 152 | case "load": 153 | loadCommand.Parse(os.Args[2:]) 154 | db, err := MakeStandardDb(*loadDataDir, *loadNumShards) 155 | if err != nil { 156 | log.Fatal(fmt.Sprintf("Failed to initialize database at %v: %v\n", *loadDataDir, err)) 157 | } 158 | scanner := bufio.NewScanner(os.Stdin) 159 | batchSize := 200 160 | batchIndex := 0 161 | var batch = make([]scoredb.Record, batchSize) 162 | for scanner.Scan() { 163 | record := scoredb.Record{} 164 | line := scanner.Bytes() 165 | json.Unmarshal(line, &record) 166 | batch[batchIndex] = record 167 | batchIndex += 1 168 | if batchIndex >= batchSize { 169 | db.BulkIndex(batch) 170 | batchIndex = 0 171 | batch = make([]scoredb.Record, batchSize) 172 | } 173 | } 174 | if batchIndex > 0 { 175 | db.BulkIndex(batch[:batchIndex]) 176 | } 177 | case "benchmark": 178 | outputFd, err := os.Create(*benchCsvOutput) 179 | if err != nil { 180 | log.Fatal(fmt.Sprintf("Failed to output output csv file at %v: %v\n", *benchCsvOutput, err)) 181 | } 182 | 183 | runtime.GOMAXPROCS(runtime.NumCPU()) 184 | benchCommand.Parse(os.Args[2:]) 185 | esDb := &scoredb.EsScoreDb{BaseURL: *benchEsUrl, Index: *benchEsIndex} 186 | fsDb, err := MakeStandardDb(*benchFsDataDir, 4) 187 | if err != nil { 188 | log.Fatal(fmt.Sprintf("Failed to initialize database at %v: %v\n", *benchFsDataDir, err)) 189 | } 190 | if !scoredb.Exists(*benchCsvFilename) { 191 | log.Fatal(fmt.Sprintf("Cannot find source csv data file at '%s'", *benchCsvFilename)) 192 | } 193 | 194 | fmt.Printf("Running es benchmarks\n") 195 | esDb.DeleteIndex() 196 | esDb.CreateIndex() 197 | counts, esIndexTimes, esQueryTimes, err := scoredb.RunBenchmark(esDb, *benchCsvFilename, *benchMaxRecords) 198 | //esDb.DeleteIndex() 199 | if err != nil { 200 | log.Fatal(fmt.Sprintf("Failed to run es benchmark: %v\n", err)) 201 | } 202 | 203 | fmt.Printf("Running native benchmarks\n") 204 | _, fsIndexTimes, fsQueryTimes, err := 
scoredb.RunBenchmark(fsDb, *benchCsvFilename, *benchMaxRecords) 205 | if err != nil { 206 | log.Fatal(fmt.Sprintf("Failed to run native benchmark: %v\n", err)) 207 | } 208 | 209 | fmt.Fprintf(outputFd, "records,es_index,native_index,es_query_1,native_query_1,es_query_2,native_query_2\n") 210 | for idx := 0; idx < len(esIndexTimes); idx++ { 211 | fmt.Fprintf(outputFd, "%v,%v,%v", counts[idx], esIndexTimes[idx], fsIndexTimes[idx]) 212 | for idx2 := 0; idx2 < len(esQueryTimes[idx]); idx2++ { 213 | fmt.Fprintf(outputFd, ",%v,%v", esQueryTimes[idx][idx2], fsQueryTimes[idx][idx2]) 214 | } 215 | fmt.Fprintf(outputFd, "\n") 216 | } 217 | outputFd.Close() 218 | default: 219 | fmt.Printf("%q is not valid command.\n", os.Args[1]) 220 | os.Exit(2) 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /shardeddb.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "math/rand" 7 | ) 8 | 9 | type ShardedDb struct { 10 | Shards []StreamingDb 11 | } 12 | 13 | var reservedShardBits = uint(14) 14 | 15 | func NewShardedDb(shards []StreamingDb) (*ShardedDb, error) { 16 | maxShards := (1 << reservedShardBits) - 1 17 | if len(shards) >= 1<= bounds.max { 82 | continue 83 | } 84 | resultChannel <- CandidateResult{DocId: docId, Score: score, WorkerNum: myWorkerNum} 85 | /* 86 | select { 87 | case newBounds, ok := <- boundsChannel: 88 | if ok { 89 | if bounds != newBounds { 90 | bounds = newBounds 91 | itr.SetBounds(bounds.min, bounds.max) 92 | } 93 | } 94 | } 95 | */ 96 | 97 | newBounds := <-boundsChannel 98 | 99 | if bounds != newBounds { 100 | bounds = newBounds 101 | itr.SetBounds(bounds.min, bounds.max) 102 | } 103 | 104 | } 105 | itr.Close() 106 | resultChannel <- CandidateResult{DocId: -1} 107 | } 108 | 109 | func NewParallelDocItr(parts []DocItr) *ParallelDocItr { 110 | op := ParallelDocItr{ 111 | score: 0.0, 112 | docId: -1, 113 | NumAlive: 
// Name identifies this iterator type.
func (op *ParallelDocItr) Name() string {
	return "ParallelDocItr"
}

// SetBounds records the new score range; each worker goroutine picks it up
// the next time it reports a candidate (see Next, which echoes op.Bounds
// back on the worker's channel).
func (op *ParallelDocItr) SetBounds(min, max float32) bool {
	op.Bounds.min, op.Bounds.max = min, max
	return true
}

// GetBounds reports the currently known [min, max] score range.
func (op *ParallelDocItr) GetBounds() (min, max float32) {
	return op.Bounds.min, op.Bounds.max
}

// Next receives candidate results from the worker goroutines until one
// falls strictly inside the current bounds. A DocId of -1 is a worker's
// exhaustion signal; Next returns false once all workers have finished.
// After every candidate (accepted or not), the current bounds are sent back
// to the reporting worker so it can tighten its own iterator. Note the
// minId argument is not used here: ordering is driven by the workers.
func (op *ParallelDocItr) Next(minId int64) bool {
	for {
		result := <-op.ResultChannel
		if result.DocId == -1 {
			op.NumAlive -= 1
			if op.NumAlive <= 0 {
				return false
			}
		} else {
			workerNum := result.WorkerNum
			if result.Score > op.Bounds.min && result.Score < op.Bounds.max {
				// Translate the shard-local doc id into the external id space.
				op.docId = ShardIdToExt(result.DocId, workerNum)
				op.score = result.Score
				op.Comms[workerNum] <- op.Bounds
				return true
			} else {
				// Out-of-bounds candidate: just release the worker.
				op.Comms[workerNum] <- op.Bounds
			}
		}
	}
}

// Close is a no-op: the worker goroutines exit on their own when their
// iterators are exhausted. NOTE(review): workers blocked sending on
// ResultChannel appear to leak if the consumer stops calling Next early --
// confirm.
func (op *ParallelDocItr) Close() {} // unsure...
166 | 167 | func (op *ParallelDocItr) Cur() (int64, float32) { 168 | return op.docId, op.score 169 | } 170 | -------------------------------------------------------------------------------- /shardeddb_test.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestShardedDb(t *testing.T) { 8 | pathmaker := RmAllTestData() 9 | defer RmAllTestData() 10 | idDb, err := NewBoltIdDb(pathmaker("shard_ids")) 11 | if err != nil { 12 | t.Fatal(err) 13 | } 14 | db := BaseDb{ 15 | StreamingDb: ShardedDb{ 16 | Shards: []StreamingDb{ 17 | BaseStreamingDb{NewFsScoreDb(pathmaker("shard_1"))}, 18 | BaseStreamingDb{NewFsScoreDb(pathmaker("shard_2"))}, 19 | }, 20 | }, 21 | IdDb: idDb, 22 | } 23 | DbBasicsTest(db, t) 24 | } 25 | -------------------------------------------------------------------------------- /stub.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | type StubDb struct { 4 | idx int64 5 | } 6 | 7 | func (sdb *StubDb) Index(record map[string]float32) (int64, error) { 8 | sdb.idx += 1 9 | return sdb.idx, nil 10 | } 11 | 12 | func (sdb *StubDb) BulkIndex(records []map[string]float32) ([]int64, error) { 13 | ids := make([]int64, len(records)) 14 | for i, _ := range records { 15 | sdb.idx++ 16 | ids[i] = sdb.idx 17 | } 18 | return ids, nil 19 | } 20 | 21 | func (db *StubDb) Query(query Query) (QueryResult, error) { 22 | return QueryResult{Ids: []string{"7", "42"}}, nil 23 | } 24 | 25 | func (db *StubDb) LinearQuery(numResults int, coefs map[string]float32) []string { 26 | return []string{"7", "42"} 27 | } 28 | -------------------------------------------------------------------------------- /sumdocitr.go: -------------------------------------------------------------------------------- 1 | package scoredb 2 | 3 | import ( 4 | "math" 5 | "sort" 6 | ) 7 | 8 | type SumComponent struct { 9 | docItr DocItr 10 | scoreRange 
// SumComponents sorts components by descending score range so the
// component with the most influence on the total is visited first.
type SumComponents []SumComponent

func (a SumComponents) Len() int           { return len(a) }
func (a SumComponents) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a SumComponents) Less(i, j int) bool { return a[i].scoreRange > a[j].scoreRange }

// SumDocItr combines several component iterators by summing their scores.
type SumDocItr struct {
	score    float32
	docId    int64
	min, max float32
	parts    []SumComponent
}

// NewSumDocItr builds a sum iterator over itrs. Overall bounds are the sums
// of the component bounds, and components are sorted widest-range first.
func NewSumDocItr(itrs []DocItr) *SumDocItr {
	min, max := float32(0.0), float32(0.0)
	components := make(SumComponents, len(itrs))
	for idx, part := range itrs {
		curMin, curMax := part.GetBounds()
		components[idx].docItr = part
		components[idx].scoreRange = float32(math.Abs(float64(curMax - curMin)))
		min += curMin
		max += curMax
	}
	sort.Sort(components)
	return &SumDocItr{
		score: 0.0,
		docId: -1,
		min:   min,
		max:   max,
		parts: components,
	}
}

// Name identifies this iterator type.
func (op *SumDocItr) Name() string { return "SumDocItr" }

// Cur returns the current document id and its summed score.
func (op *SumDocItr) Cur() (int64, float32) {
	return op.docId, op.score
}

// GetBounds reports the currently known [min, max] score range.
func (op *SumDocItr) GetBounds() (min, max float32) { return op.min, op.max }

// Close releases all component iterators.
func (op *SumDocItr) Close() {
	for _, part := range op.parts {
		part.docItr.Close()
	}
}

// Next advances all components to a common document id >= minId and sums
// their scores, skipping documents whose total falls outside [op.min,
// op.max]. Returns false once any component is exhausted.
func (op *SumDocItr) Next(minId int64) bool {
	min, max := op.min, op.max
	keepGoing := true
	var score float32
	for keepGoing {
		keepGoing = false
		score = float32(0.0)
		for _, part := range op.parts {
			var curDocId int64
			var curScore float32
			for {
				// Advance this component until it reaches or passes minId.
				curDocId, curScore = part.docItr.Cur()
				if curDocId >= minId {
					break
				}
				if !part.docItr.Next(minId) {
					return false
				}
			}
			if curDocId > minId {
				// This component skipped past minId; restart the scan there.
				minId = curDocId
				keepGoing = true
				break
			}
			score += curScore
		}
		if !keepGoing {
			if score < min || score > max {
				// Total out of bounds; try the next document id.
				minId += 1
				keepGoing = true
			}
		}
	}
	op.docId = minId
	op.score = score
	//fmt.Printf("SumDocItr Next(%v) [%v:%v] = %v score:%v\n", minId, op.min, op.max, op.docId, score)
	return true
}

// SetBounds narrows the score range of interest to [min, max] and derives
// per-component bounds by subtracting the other components' extremes.
func (op *SumDocItr) SetBounds(min, max float32) bool {
	//fmt.Printf("SumDocItr SetBounds %v %v\n", min, max)
	op.min = min
	op.max = max

	for curfield, component := range op.parts {
		newMin, newMax := min, max
		// subtract out the ranges of all the other components (the remaining range will be mine)
		for otherfactor, otherComponent := range op.parts {
			//Then subtract the other maxes or mins
			if curfield != otherfactor {
				otherMin, otherMax := otherComponent.docItr.GetBounds()
				newMin -= otherMax
				newMax -= otherMin
			}
		}
		// Clamp the derived range to the component's own bounds.
		curMin, curMax := component.docItr.GetBounds()
		if newMin < curMin {
			newMin = curMin
		}
		if newMax > curMax {
			newMax = curMax
		}
		if newMin != curMin || newMax != curMax {
			//fmt.Printf("SumDocItr SetBounds for component %v %v\n", newMin, newMax)
			component.docItr.SetBounds(newMin, newMax)
		}
	}
	return true
}