├── qci ├── __init__.py ├── manual_run.sh ├── test_loadgen.ipy ├── test_loadgen2.ipy ├── test_loadgen3.ipy ├── test_endpoint.ipy ├── test_readwrite1.ipy ├── test_readwrite2.ipy ├── test_changedrange.ipy ├── utils.ipy ├── runtests.ipy └── test_readstat1.ipy ├── .gitignore ├── internal ├── cephprovider │ ├── test │ ├── cephprovider.h │ ├── cephcache.go │ ├── cephprovider.c │ └── cephprovider.go ├── bstore │ ├── bstore.go │ ├── linker.go │ ├── blockcache.go │ ├── blocktypes_test.go │ ├── bstore_test.go │ ├── blockstore.go │ └── blocktypes.go ├── bprovider │ ├── bprovider.go │ └── bprovider_test.go └── fileprovider │ └── fileprovider.go ├── cpinterface ├── go.capnp ├── interface.capnp └── cpinterface.go ├── Makefile ├── .project ├── tools ├── addtarget └── scrub ├── btrdb.conf ├── quasar.conf ├── logconfig.xml ├── README.md ├── btrdbd ├── main.go └── config.go ├── qtree ├── operators.go ├── qtree_test.go ├── qtree_utils.go └── qtree2_test.go ├── quasar.go └── quasar_test.go /qci/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin/* 2 | *.pyc 3 | pkg/* 4 | src/* 5 | *~ 6 | .idea 7 | quasar.iml 8 | *.log 9 | -------------------------------------------------------------------------------- /internal/cephprovider/test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UlricQin/btrdb/master/internal/cephprovider/test -------------------------------------------------------------------------------- /cpinterface/go.capnp: -------------------------------------------------------------------------------- 1 | @0xd12a1c51fedd6c88; 2 | annotation package(file) :Text; 3 | annotation import(file) :Text; 4 | annotation doc(struct, field, enum) :Text; 5 | annotation tag(enumerant) : Text; 6 | annotation notag(enumerant) : Void; 7 | annotation customtype(field) : Text; 8 | $package("capn"); 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | bqserver: 3 | go build -o bin/qserver github.com/SoftwareDefinedBuildings/quasar/qserver 4 | 5 | cleanbins: 6 | rm -f bin/qserver bin/qtool 7 | 8 | bins: cleanbins bqserver 9 | 10 | cleandb: 11 | rm -f /srv/quasar/*.db 12 | rm -f /srv/quasartestdb/* 13 | mongo quasar2 --eval 'db.superblocks.remove({})' 14 | 15 | newdbs: cleandb bins 16 | ./bin/qserver -makedb 17 | -------------------------------------------------------------------------------- /qci/manual_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | rm -f profile.* 3 | rm -f log.* 4 | export goversion=go_64_1.4.2 5 | export GOROOT=/srv/$goversion 6 | export GO=$GOROOT/bin/go 7 | mkdir -p gopath 8 | export GOPATH=`pwd`/gopath 9 | export PATH=$PATH:$GOROOT/bin/ 10 | git pull 11 | $GO get -v -d ./... 
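# Build the server binary as ./exe; qci/runtests.ipy later invokes it as ./exe -makedb and ./exe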
12 | $GO build -a -v -o exe ./qserver 13 | export CEPHTYPE=filestore 14 | export TEST_TYPE=loadgen2 15 | ipython qci/runtests.ipy 16 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | quasar 4 | 5 | 6 | 7 | 8 | 9 | com.googlecode.goclipse.goBuilder 10 | 11 | 12 | 13 | 14 | 15 | com.googlecode.goclipse.core.goNature 16 | 17 | 18 | -------------------------------------------------------------------------------- /tools/addtarget: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | if len(sys.argv) != 3: 3 | print "usage: addtarget " 4 | sys.exit(1) 5 | 6 | _client = pymongo.MongoClient() 7 | db = _client[sys.argv[1]][sys.argv[2]] 8 | 9 | outf = open("targets","a") 10 | print >>outf, "#addtargets %s:",sys.argv[3] 11 | for r in db.find({"Path":{"$regex":sys.argv[3]}}): 12 | print >>outf, "# %s : %s" % (r["Metadata"]["SourceName"], r["Path"]) 13 | print >>outf, r["uuid"] 14 | printf >>outf, "\n" 15 | 16 | outf.close() -------------------------------------------------------------------------------- /internal/bstore/bstore.go: -------------------------------------------------------------------------------- 1 | package bstore 2 | 3 | import ( 4 | "github.com/op/go-logging" 5 | ) 6 | 7 | var lg *logging.Logger 8 | 9 | func init() { 10 | lg = logging.MustGetLogger("log") 11 | } 12 | 13 | //Note to self, if you bump VSIZE such that the max blob goes past 2^16, make sure to adapt 14 | //providers 15 | const ( 16 | VSIZE = 1024 17 | KFACTOR = 64 18 | VBSIZE = 2 + 9*VSIZE + 9*VSIZE + 2*VSIZE //Worst case with huffman 19 | CBSIZE = 1 + KFACTOR*9*6 20 | DBSIZE = VBSIZE 21 | PWFACTOR = uint8(6) //1<<6 == 64 22 | RELOCATION_BASE = 0xFF00000000000000 23 | ) 24 | -------------------------------------------------------------------------------- /internal/cephprovider/cephprovider.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | typedef struct 5 | { 6 | rados_ioctx_t ctx; 7 | rados_completion_t *comps; 8 | int comp_len; 9 | int comp_cap; 10 | } cephprovider_handle_t; 11 | 12 | typedef cephprovider_handle_t* phandle_t; 13 | 14 | void initialize_provider(const char* conffile, const char* pool); 15 | phandle_t handle_create(); 16 | void handle_write(phandle_t seg, uint8_t *uuid, uint64_t address, const char* data, int len, int trunc); 17 | uint64_t handle_obtainrange(cephprovider_handle_t *h); 18 | void handle_init_allocator(cephprovider_handle_t *h); 19 | int handle_read(phandle_t seg, uint8_t *uuid, uint64_t address, char* dest, int len); 20 | void handle_close(phandle_t seg); 21 | -------------------------------------------------------------------------------- /qci/test_loadgen.ipy: -------------------------------------------------------------------------------- 1 | def start_loadgen(): 2 | stdout=open("log.lg.stdout","w") 3 | uuids = [str(uuid.uuid4()) for x in xrange(20)] 4 | rc = subprocess.call(["./loadgen", "-i"]+uuids, stdout=stdout, stderr=subprocess.STDOUT) 5 | print "INSERT RV:",rc 6 | if rc != 0: 7 | sys.exit(rc) 8 | sys.stdout.flush() 9 | rc = subprocess.call(["./loadgen", "-v"]+uuids, stdout=stdout, stderr=subprocess.STDOUT) 10 | print "VERIFY RV:",rc 11 | if rc != 0: 12 | sys.exit(rc) 13 | 14 | 15 | p2 = Process(target=start_loadgen) 16 | p2.start() 17 | p2.join() 18 | if p2.exitcode != 0: 19 | print "EXITCODE FROM LOADGEN:", p2.exitcode 20 
| os.kill(p.pid, 9) 21 | sys.exit(p2.exitcode) 22 | else: 23 | !rm FAILURE 24 | -------------------------------------------------------------------------------- /btrdb.conf: -------------------------------------------------------------------------------- 1 | # This is the configuration file for QUASAR version 2 2 | # without this file, it will not start. It should be 3 | # located either in the directory from which quasar is 4 | # started, or in /etc/quasar/quasar.conf 5 | 6 | [storage] 7 | # Either file-based or Ceph-based storage can be used 8 | provider=file 9 | filepath=/srv/quasar/ 10 | 11 | #provider=ceph 12 | #cephconf=/etc/ceph/ceph.conf 13 | #cephpool=data 14 | 15 | [http] 16 | enabled=true 17 | port=9000 18 | address=0.0.0.0 19 | 20 | [capnp] 21 | enabled=true 22 | port=4410 23 | address=0.0.0.0 24 | 25 | [mongo] 26 | server=localhost 27 | collection=quasar 28 | 29 | [cache] 30 | # Configure the RADOS and block caches. If you have a choice, rather 31 | # spend memory on the block cache. 32 | 33 | # This is measured in blocks, which are at most ~16K 34 | # blockcache=4000000 #64 GB 35 | blockcache=2000000 #32 GB 36 | # blockcache=1000000 #16 GB 37 | # blockcache=500000 #8 GB 38 | # blockcache=250000 #4 GB 39 | # blockcache=62500 #1 GB 40 | 41 | radosreadcache=2048 #in MB 42 | radoswritecache=256 #in MB 43 | 44 | [coalescence] 45 | earlytrip=16384 #readings 46 | interval=5000 #ms 47 | -------------------------------------------------------------------------------- /quasar.conf: -------------------------------------------------------------------------------- 1 | # This is the configuration file for QUASAR version 2 2 | # without this file, it will not start. It should be 3 | # located either in the directory from which quasar is 4 | # started, or in /etc/quasar/quasar.conf 5 | 6 | [storage] 7 | # Either file-based or Ceph-based storage can be used 8 | provider=file 9 | filepath=/srv/quasar/ 10 | 11 | #provider=ceph 12 | #cephconf=/etc/ceph/ceph.conf 13 | #cephpool=data 14 | 15 | [http] 16 | enabled=true 17 | port=9000 18 | address=0.0.0.0 19 | 20 | [capnp] 21 | enabled=true 22 | port=4410 23 | address=0.0.0.0 24 | 25 | [mongo] 26 | server=localhost 27 | collection=quasar 28 | 29 | [cache] 30 | # Configure the RADOS and block caches. If you have a choice, rather 31 | # spend memory on the block cache. 
32 | 33 | # This is measured in blocks, which are at most ~16K 34 | # blockcache=4000000 #64 GB 35 | blockcache=2000000 #32 GB 36 | # blockcache=1000000 #16 GB 37 | # blockcache=500000 #8 GB 38 | # blockcache=250000 #4 GB 39 | # blockcache=62500 #1 GB 40 | 41 | radosreadcache=2048 #in MB 42 | radoswritecache=256 #in MB 43 | 44 | [coalescence] 45 | earlytrip=16384 #readings 46 | interval=5000 #ms 47 | -------------------------------------------------------------------------------- /qci/test_loadgen2.ipy: -------------------------------------------------------------------------------- 1 | import random 2 | import uuid 3 | import subprocess 4 | import sys 5 | 6 | num_streams = 1 7 | def start_loadgen(): 8 | global num_streams 9 | cf = open("loadConfig.ini", "w") 10 | random_seed = random.randint(0,10000) 11 | print "USING RANDOM SEED ", random_seed 12 | cf.write("""TOTAL_RECORDS=24000000 13 | TCP_CONNECTIONS={0} 14 | POINTS_PER_MESSAGE=5000 15 | NANOS_BETWEEN_POINTS=9000000 16 | MAX_TIME_RANDOM_OFFSET = 8999999 17 | FIRST_TIME=1420582220083869629 18 | DB_ADDR=localhost:4410 19 | NUM_STREAMS={0} 20 | RAND_SEED={1} 21 | """.format(num_streams, random_seed) 22 | ) 23 | 24 | for i in xrange(num_streams): 25 | cf.write("UUID%d=%s\n" % (i+1, uuid.uuid4())) 26 | cf.close() 27 | stdout=open("log.lg.1.stdout","w") 28 | rc = subprocess.call(["./loadgen", "-i"], stdout=stdout, stderr=subprocess.STDOUT) 29 | print "INSERT RV:",rc 30 | if rc != 0: 31 | sys.exit(rc) 32 | sys.stdout.flush() 33 | term_quasar() 34 | time.sleep(2) 35 | proc_profiles("ins") 36 | start_quasar() 37 | time.sleep(10) 38 | stdout2=open("log.lg.2.stdout","w") 39 | rc = subprocess.call(["./loadgen", "-v"], stdout=stdout2, stderr=subprocess.STDOUT) 40 | print "VERIFY RV:",rc 41 | if rc != 0: 42 | sys.exit(rc) 43 | !rm FAILURE 44 | 45 | start_loadgen() 46 | -------------------------------------------------------------------------------- /qci/test_loadgen3.ipy: -------------------------------------------------------------------------------- 1 | import random 2 | import uuid 3 | import subprocess 4 | import sys 5 | import time 6 | num_streams = 1 7 | def start_loadgen(): 8 | global num_streams 9 | cf = open("loadConfig.ini", "w") 10 | random_seed = random.randint(0,10000) 11 | print "USING RANDOM SEED ", random_seed 12 | cf.write("""TOTAL_RECORDS=24000000 13 | TCP_CONNECTIONS={0} 14 | POINTS_PER_MESSAGE=5000 15 | NANOS_BETWEEN_POINTS=9000000 16 | MAX_TIME_RANDOM_OFFSET = 8999999 17 | FIRST_TIME=1420582220083869629 18 | DB_ADDR=localhost:4410 19 | NUM_STREAMS={0} 20 | RAND_SEED={1} 21 | """.format(num_streams, random_seed) 22 | ) 23 | 24 | for i in xrange(num_streams): 25 | cf.write("UUID%d=%s\n" % (i+1, uuid.uuid4())) 26 | cf.close() 27 | stdout=open("log.lg.stdout1","w") 28 | rc = subprocess.call(["./loadgen", "-i"], stdout=stdout, stderr=subprocess.STDOUT) 29 | print "INSERT RV:",rc 30 | if rc != 0: 31 | sys.exit(rc) 32 | sys.stdout.flush() 33 | term_quasar() 34 | time.sleep(2) 35 | proc_profiles("inst") 36 | start_quasar() 37 | time.sleep(4) 38 | stdout2=open("log.lg.stdout2","w") 39 | rc = subprocess.call(["./loadgen", "-d"], stdout=stdout2, stderr=subprocess.STDOUT) 40 | print "DELETE RV:",rc 41 | if rc != 0: 42 | sys.exit(rc) 43 | !rm FAILURE 44 | 45 | 46 | start_loadgen() 47 | -------------------------------------------------------------------------------- /qci/test_endpoint.ipy: -------------------------------------------------------------------------------- 1 | import random 2 | import uuid 3 | import subprocess 4 | import sys 5 
| import time 6 | import json 7 | def start_loadgen(): 8 | global num_streams 9 | cf = open("loadConfig.ini", "w") 10 | cf.write("""TOTAL_RECORDS=120000000 11 | TCP_CONNECTIONS=1 12 | POINTS_PER_MESSAGE=5000 13 | NANOS_BETWEEN_POINTS=9000000 14 | MAX_TIME_RANDOM_OFFSET = 8999999 15 | FIRST_TIME=1420582220083869629 16 | DB_ADDR=localhost:4410 17 | NUM_STREAMS=1 18 | UUID1=9f67541c-95ee-11e4-a7ac-0026b6df9cf2 19 | RAND_SEED=15 20 | """) 21 | cf.close() 22 | stdout=open("log.lg.stdout1","w") 23 | rc = subprocess.call(["./loadgen", "-i"], stdout=stdout, stderr=subprocess.STDOUT) 24 | print "INSERT RV:",rc 25 | if rc != 0: 26 | sys.exit(rc) 27 | sys.stdout.flush() 28 | term_quasar() 29 | time.sleep(2) 30 | proc_profiles("inst") 31 | start_quasar() 32 | time.sleep(4) 33 | 34 | # Check whether we have extra points 35 | dstr = !curl -s "http://localhost:9000/data/uuid/9f67541c-95ee-11e4-a7ac-0026b6df9cf2?starttime=1421395993269633024&endtime=1421455504336486400&unitoftime=ns&pw=37" 36 | data = json.loads(dstr[0]) 37 | lastpoint = data[0]["XReadings"][-1] 38 | lasttime = (lastpoint[0] * 1000000) + lastpoint[1] 39 | if lasttime != (1421455504336486400 - (2 ** 37)): 40 | print "Extra or missing points detected at end of statistical query" 41 | print "last time:", lasttime 42 | sys.exit(1) 43 | !rm FAILURE 44 | 45 | 46 | start_loadgen() 47 | -------------------------------------------------------------------------------- /logconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | stdout 4 | console 5 | 6 | DEBUG 7 | 8 | 9 | file 10 | file 11 | FINEST 12 | quasar.log 13 | [%D %T] [%L] (%S) %M 14 | false 15 | 20M 16 | 0K 17 | true 18 | 19 | 20 | file 21 | file 22 | WARNING 23 | quasar.serious.log 24 | [%D %T] [%L] (%S) %M 25 | false 26 | 20M 27 | 0K 28 | true 29 | 30 | 31 | -------------------------------------------------------------------------------- /qci/test_readwrite1.ipy: -------------------------------------------------------------------------------- 1 | 2 | import qdf 3 | import qdf.quasar 4 | import sys 5 | import random 6 | import uuid 7 | import time 8 | from twisted.internet import defer, protocol, reactor 9 | print "entered test readwrite1" 10 | EXIT_CODE = None 11 | def setexit(code): 12 | global EXIT_CODE 13 | EXIT_CODE = code 14 | reactor.stop() 15 | 16 | @defer.inlineCallbacks 17 | def testbody(db): 18 | print "connected" 19 | TOTALPOINTS = 1000000 20 | PERINSERT = 1000 21 | INTERVAL = int(1E9/120.) 
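# i.e. ~8,333,333 ns between points, a nominal 120 Hz stream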
22 | UID = str(uuid.uuid4()) 23 | randomdata = [(x*INTERVAL, random.random()) for x in xrange(TOTALPOINTS)] 24 | idx = 0 25 | print "random data generated" 26 | for i in xrange(TOTALPOINTS/PERINSERT): 27 | yield db.insertValues(UID, randomdata[idx:idx+PERINSERT]) 28 | idx += PERINSERT 29 | time.sleep(20) 30 | readdata = [] 31 | idx = 0 32 | print "reading data" 33 | for i in xrange(TOTALPOINTS/PERINSERT): 34 | (status, rv) = yield db.queryStandardValues(UID, i*INTERVAL*PERINSERT, (i+1)*INTERVAL*PERINSERT) 35 | (version, values) = rv 36 | readdata += [(v.time, v.value) for v in values] 37 | print "len readdata:",len(readdata) 38 | print "len insert:",len(randomdata) 39 | for i in xrange(len(randomdata)): 40 | if randomdata[i][0] != readdata[i][0]: 41 | print "time mismatch index",i 42 | break 43 | if randomdata[i][1] != readdata[i][1]: 44 | print "value mismatch index",i 45 | break 46 | else: 47 | print "lists match" 48 | setexit(0) 49 | return 50 | setexit(1) 51 | return 52 | 53 | def onFail(param): 54 | print "Encountered error: ", param 55 | setexit(2) 56 | 57 | def entrypoint(): 58 | print "in entrypoint" 59 | try: 60 | q = qdf.quasar.connectToArchiver("localhost", 4410) 61 | q.addCallback(testbody) 62 | q.addErrback(onFail) 63 | except Exception as e: 64 | print "ex: ",e 65 | setexit(1) 66 | 67 | reactor.callWhenRunning(entrypoint) 68 | reactor.run() 69 | if EXIT_CODE == None: 70 | EXIT_CODE = 42 71 | if EXIT_CODE != 0: 72 | sys.exit(EXIT_CODE) 73 | else: 74 | !rm FAILURE 75 | -------------------------------------------------------------------------------- /tools/scrub: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import pymongo 3 | import uuid 4 | import rados 5 | import sys 6 | import time 7 | 8 | if len(sys.argv) != 4: 9 | print "usage: scrub " 10 | sys.exit(1) 11 | 12 | _client = pymongo.MongoClient() 13 | db = _client[sys.argv[2]].superblocks 14 | 15 | 16 | uuids = [] 17 | #Get target uuids 18 | with open(sys.argv[3],"r") as uf: 19 | for l in uf.readlines(): 20 | l = l.strip() 21 | if l.startswith("#") or len(l) == 0: 22 | continue 23 | uuids.append(uuid.UUID(l)) 24 | 25 | #Get all metadata uuids 26 | known_uuids = [uuid.UUID(x) for x in db.distinct("uuid")] 27 | print "There are %d known uuids" % len(known_uuids) 28 | 29 | #Get all object names for these uuids 30 | cluster = rados.Rados(conffile="/etc/ceph/ceph.conf") 31 | print "Will attempt to connect to: " + str(cluster.conf_get('mon initial members')) 32 | 33 | cluster.connect() 34 | time.sleep(1) 35 | for i in xrange(10): 36 | try: 37 | cluster.require_state("connected") 38 | break 39 | except rados.RadosStateError as e: 40 | print e 41 | print "Not connected yet" 42 | time.sleep(1) 43 | 44 | ioctx = cluster.open_ioctx(sys.argv[1]) 45 | obj_iter = ioctx.list_objects() 46 | 47 | rogue_uuids = set() 48 | toremove = [] 49 | total = 0 50 | for obj in obj_iter: 51 | if obj.key == "allocator": 52 | continue 53 | total += 1 54 | uid = uuid.UUID(obj.key[:32]) 55 | if uid not in known_uuids: 56 | rogue_uuids.add(uid) 57 | if uid in uuids: 58 | toremove.append(obj.key) 59 | if total != 0: 60 | print "A total of %d objects matched (%.2f%%)" % (len(toremove), (float(len(toremove))/total)*100) 61 | else: 62 | print "No objects" 63 | print "There are %d rogue uuids" % len(rogue_uuids) 64 | 65 | print "If you wish to continue and delete the quasar objects, type 'yes i really do' exactly" 66 | inp = raw_input(">") 67 | if inp != "yes i really do": 68 | print "Aborting" 69 | 
sys.exit(1) 70 | 71 | for key in toremove: 72 | print "Removing: ",key 73 | ioctx.remove_object(key) 74 | 75 | print "If you wish to continue and delete the metadata, type 'yes I really do' exactly" 76 | inp = raw_input(">") 77 | if inp != "yes I really do": 78 | print "Aborting" 79 | sys.exit(1) 80 | 81 | for u in uuids: 82 | print "Removing: ", str(u) 83 | rv = db.remove({"uuid":str(u)}) 84 | print "OK, %d generations nuked" % rv["n"] 85 | 86 | print "Success" 87 | -------------------------------------------------------------------------------- /internal/bprovider/bprovider.go: -------------------------------------------------------------------------------- 1 | package bprovider 2 | 3 | //A blob provider implements a simple interface for storing blobs 4 | //An address base gets locked in the form of a segment, and then an arbitrary number of 5 | //blobs are written sequentially from that base, with each write call returning the address 6 | //of the base of the next write. At the end, the segment is unlocked. 7 | //For reading, the blob provider needs to work out its own framing, as it gets given 8 | //a start address and must magically return the blob corresponding to that address 9 | //The addresses have no special form*, other than being uint64s. It is up to the provider 10 | //to encode whatever metadata it requires inside that uint64 11 | 12 | //*I lied, addresses must not have the top byte as FF, those are reserved for relocation addresses 13 | 14 | //In case it is not obvious, the challenge a bprovider faces is being able to hand out an address 15 | //and support an arbitrary sized blob being written to that address. At the moment the max size of 16 | //a blob can be determined by max(CBSIZE, VBSIZE) which is under 32k, but may be as little as 1k 17 | //for well compressed blocks. 18 | 19 | import ( 20 | "errors" 21 | ) 22 | 23 | var ErrNoSpace = errors.New("No more space") 24 | var ErrInvalidArgument = errors.New("Invalid argument") 25 | var ErrExists = errors.New("File exists") 26 | 27 | type Segment interface { 28 | //Returns the address of the first free word in the segment when it was locked 29 | BaseAddress() uint64 30 | 31 | //Unlocks the segment for the StorageProvider to give to other consumers 32 | //Implies a flush 33 | Unlock() 34 | 35 | //Writes a slice to the segment, returns immediately 36 | //Returns nil if op is OK, otherwise ErrNoSpace or ErrInvalidArgument 37 | //It is up to the implementer to work out how to report no space immediately 38 | //The uint64 is the address to be used for the next write 39 | Write(uuid []byte, address uint64, data []byte) (uint64, error) 40 | 41 | //Block until all writes are complete. Note this does not imply a flush of the underlying files. 
42 | Flush() 43 | } 44 | type StorageProvider interface { 45 | 46 | //Called at startup of a normal run 47 | Initialize(opts map[string]string) 48 | 49 | //Called to create the database for the first time 50 | //Note that initialize is not called before this function call 51 | //and you can assume the program will exit shortly after this 52 | //function call 53 | CreateDatabase(opts map[string]string) error 54 | 55 | // Lock a segment, or block until a segment can be locked 56 | // Returns a Segment struct 57 | LockSegment(uuid []byte) Segment 58 | 59 | // Read the blob into the given buffer 60 | Read(uuid []byte, address uint64, buffer []byte) []byte 61 | } 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | BTrDB 2 | ===== 3 | 4 | The Berkeley TRee DataBase is a high performance time series 5 | database designed to support high density data storage applications. 6 | This project used to be called QUASAR, but we have changed the name 7 | partly to match publications, and partly as a flag day. The capnp interface 8 | in BTrDB is designed to better support large queries and clusters and is not 9 | backwards compatible with the quasar interface. 10 | 11 | ### Dependencies 12 | 13 | BTrDB uses a MongoDB collection to store metadata. Also, if installed in High Availability 14 | mode, it requires a ceph pool. Note that even if not using ceph, librados needs to be 15 | installed. 16 | 17 | ### Installation 18 | 19 | To run an archiver, make sure that you have Go >= 1.4 installed and then 20 | run the following: 21 | 22 | ``` 23 | apt-get install librados-dev 24 | go get github.com/SoftwareDefinedBuildings/btrdb/btrdbd 25 | ``` 26 | 27 | This will install the tools into your 28 | $GOPATH/bin directory. If you have this directory on your $PATH then you do 29 | not need to do anything further. Otherwise you will need to add the binaries 30 | to your $PATH variable manually. 31 | 32 | Note that in order to run the btrdb server, you will need to copy btrdb.conf 33 | from the github repository to /etc/btrdb/btrdb.conf (or the directory that 34 | you are in). 35 | 36 | An alternative to 'go get'ing to your GOPATH is to clone the repository then do: 37 | 38 | ``` 39 | apt-get install librados-dev 40 | go get -d ./... && go install ./btrdbd 41 | ``` 42 | 43 | This will also put the btrdbd binary in your $GOPATH/bin. 44 | 45 | ### Configuration 46 | 47 | Sensible defaults (for a production deployment) are already found in btrdb.conf. Some things you may need 48 | to adjust: 49 | - The MongoDB server and collection name 50 | - The block cache size (defaults to 32GB). Note that quasar uses more than this, this is just 51 | a primary contributor to the RAM footprint. 52 | - The file storage path or ceph details 53 | 54 | Once your configuration is set up, you can set up the files, and database indices with 55 | 56 | ``` 57 | btrdbd -makedb 58 | ``` 59 | 60 | Which should print out: 61 | ``` 62 | Configuration OK! 63 | Creating a new database 64 | Done 65 | ``` 66 | 67 | You can now run a server with: 68 | ``` 69 | btrdbd 70 | ``` 71 | 72 | ### Using the database 73 | 74 | Note that we are presently working on release engineering, and hope to release the first (public) version in August 2016. If you are using it now, bear in mind it is still in development. 
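As a rough illustration of what a query looks like (taken from the CI test qci/test_endpoint.ipy, so treat it as a sketch rather than a stable API reference), statistical values can be fetched from the HTTP interface with a plain GET; substitute your own stream UUID, time range and point width (`pw`, an aggregation window of 2^pw nanoseconds):

```
curl -s "http://localhost:9000/data/uuid/9f67541c-95ee-11e4-a7ac-0026b6df9cf2?starttime=1421395993269633024&endtime=1421455504336486400&unitoftime=ns&pw=37"
```

The response is JSON; the test parses the `XReadings` array out of it to check the last returned window.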
75 | 76 | To communicate with the database, there are [go bindings](https://github.com/SoftwareDefinedBuildings/btrdb-go) and [python bindings](https://github.com/SoftwareDefinedBuildings/btrdb-python). The go bindings are faster and more maintained. 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /internal/bstore/linker.go: -------------------------------------------------------------------------------- 1 | package bstore 2 | 3 | import ( 4 | "log" 5 | "sort" 6 | "sync" 7 | 8 | "github.com/SoftwareDefinedBuildings/btrdb/internal/bprovider" 9 | ) 10 | 11 | var ser_buf_pool = sync.Pool{ 12 | New: func() interface{} { 13 | return make([]byte, DBSIZE) 14 | }, 15 | } 16 | 17 | type pCBArr []*Coreblock 18 | 19 | func (dca pCBArr) Len() int { 20 | return len(dca) 21 | } 22 | 23 | func (dca pCBArr) Swap(i, j int) { 24 | dca[i], dca[j] = dca[j], dca[i] 25 | } 26 | 27 | func (dca pCBArr) Less(i, j int) bool { 28 | return dca[i].PointWidth < dca[j].PointWidth 29 | } 30 | 31 | func LinkAndStore(uuid []byte, bs *BlockStore, bp bprovider.StorageProvider, vblocks []*Vectorblock, cblocks []*Coreblock) map[uint64]uint64 { 32 | loaned_sercbufs := make([][]byte, len(cblocks)) 33 | loaned_servbufs := make([][]byte, len(vblocks)) 34 | 35 | //First sort the vblock array (time before lock costs less) 36 | sort.Sort(pCBArr(cblocks)) 37 | 38 | //Then lets lock a segment 39 | seg := bp.LockSegment(uuid) 40 | 41 | backpatch := make(map[uint64]uint64, len(cblocks)+len(vblocks)+1) 42 | backpatch[0] = 0 //Null address is still null 43 | 44 | ptr := seg.BaseAddress() 45 | 46 | //First step is to write all the vector blocks, order is not important 47 | for i := 0; i < len(vblocks); i++ { 48 | vb := vblocks[i] 49 | 50 | //Store relocation for cb backpatch 51 | backpatch[vb.Identifier] = ptr 52 | 53 | //Update the block. VB should now look as if it were read from disk 54 | vb.Identifier = ptr 55 | //So we can cache it 56 | bs.cachePut(ptr, vb) 57 | 58 | //Now write it 59 | serbuf := ser_buf_pool.Get().([]byte) 60 | cutdown := vb.Serialize(serbuf) 61 | loaned_servbufs[i] = serbuf 62 | nptr, err := seg.Write(uuid, ptr, cutdown) 63 | if err != nil { 64 | log.Panicf("Got error on segment write: %v", err) 65 | } 66 | ptr = nptr 67 | } 68 | 69 | //Now we need to write the coreblocks out 70 | for i := 0; i < len(cblocks); i++ { 71 | cb := cblocks[i] 72 | 73 | //Relocate and backpatch 74 | for k := 0; k < KFACTOR; k++ { 75 | if cb.Addr[k] < RELOCATION_BASE { 76 | continue 77 | } 78 | nval, ok := backpatch[cb.Addr[k]] 79 | if !ok { 80 | log.Panicf("Failed to backpatch! 
(trying to find addr 0x%016x)", cb.Addr[k]) 81 | } 82 | cb.Addr[k] = nval 83 | } 84 | backpatch[cb.Identifier] = ptr 85 | cb.Identifier = ptr 86 | bs.cachePut(ptr, cb) 87 | 88 | serbuf := ser_buf_pool.Get().([]byte) 89 | cutdown := cb.Serialize(serbuf) 90 | loaned_sercbufs[i] = serbuf 91 | nptr, err := seg.Write(uuid, ptr, cutdown) 92 | if err != nil { 93 | log.Panicf("Got error on segment write: %v", err) 94 | } 95 | ptr = nptr 96 | } 97 | seg.Unlock() 98 | //Return buffers to pool 99 | for _, v := range loaned_sercbufs { 100 | ser_buf_pool.Put(v) 101 | } 102 | for _, v := range loaned_servbufs { 103 | ser_buf_pool.Put(v) 104 | } 105 | return backpatch 106 | } 107 | -------------------------------------------------------------------------------- /internal/bstore/blockcache.go: -------------------------------------------------------------------------------- 1 | package bstore 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | type CacheItem struct { 8 | val Datablock 9 | vaddr uint64 10 | newer *CacheItem 11 | older *CacheItem 12 | } 13 | 14 | func (bs *BlockStore) initCache(size uint64) { 15 | bs.cachemax = size 16 | bs.cachemap = make(map[uint64]*CacheItem, size) 17 | go func() { 18 | for { 19 | lg.Info("Cachestats: %d misses, %d hits, %.2f %%", 20 | bs.cachemiss, bs.cachehit, (float64(bs.cachehit*100) / float64(bs.cachemiss+bs.cachehit))) 21 | time.Sleep(5 * time.Second) 22 | } 23 | }() 24 | } 25 | 26 | //This function must be called with the mutex held 27 | func (bs *BlockStore) cachePromote(i *CacheItem) { 28 | if bs.cachenew == i { 29 | //Already at front 30 | return 31 | } 32 | if i.newer != nil { 33 | i.newer.older = i.older 34 | } 35 | if i.older != nil { 36 | i.older.newer = i.newer 37 | } 38 | if bs.cacheold == i && i.newer != nil { 39 | //This was the tail of a list longer than 1 40 | bs.cacheold = i.newer 41 | } else if bs.cacheold == nil { 42 | //This was/is the only item in the list 43 | bs.cacheold = i 44 | } 45 | 46 | i.newer = nil 47 | i.older = bs.cachenew 48 | if bs.cachenew != nil { 49 | bs.cachenew.newer = i 50 | } 51 | bs.cachenew = i 52 | } 53 | func (bs *BlockStore) cachePut(vaddr uint64, item Datablock) { 54 | if bs.cachemax == 0 { 55 | return 56 | } 57 | bs.cachemtx.Lock() 58 | i, ok := bs.cachemap[vaddr] 59 | if ok { 60 | bs.cachePromote(i) 61 | } else { 62 | i = &CacheItem{ 63 | val: item, 64 | vaddr: vaddr, 65 | } 66 | bs.cachemap[vaddr] = i 67 | bs.cachePromote(i) 68 | bs.cachelen++ 69 | bs.cacheCheckCap() 70 | } 71 | bs.cachemtx.Unlock() 72 | } 73 | 74 | func (bs *BlockStore) cacheGet(vaddr uint64) Datablock { 75 | if bs.cachemax == 0 { 76 | bs.cachemiss++ 77 | return nil 78 | } 79 | bs.cachemtx.Lock() 80 | rv, ok := bs.cachemap[vaddr] 81 | if ok { 82 | bs.cachePromote(rv) 83 | } 84 | bs.cachemtx.Unlock() 85 | if ok { 86 | bs.cachehit++ 87 | return rv.val 88 | } else { 89 | bs.cachemiss++ 90 | return nil 91 | } 92 | } 93 | 94 | //debug function 95 | func (bs *BlockStore) walkCache() { 96 | fw := 0 97 | bw := 0 98 | it := bs.cachenew 99 | for { 100 | if it == nil { 101 | break 102 | } 103 | fw++ 104 | if it.older == nil { 105 | lg.Info("fw walked to end, compare %p/%p", it, bs.cacheold) 106 | } 107 | it = it.older 108 | } 109 | it = bs.cacheold 110 | for { 111 | if it == nil { 112 | break 113 | } 114 | bw++ 115 | if it.newer == nil { 116 | lg.Info("bw walked to end, compare %p/%p", it, bs.cachenew) 117 | } 118 | it = it.newer 119 | } 120 | lg.Info("Walked cache fw=%v, bw=%v, map=%v", fw, bw, len(bs.cachemap)) 121 | } 122 | 123 | //This must be called with the mutex 
held 124 | func (bs *BlockStore) cacheCheckCap() { 125 | for bs.cachelen > bs.cachemax { 126 | i := bs.cacheold 127 | delete(bs.cachemap, i.vaddr) 128 | if i.newer != nil { 129 | i.newer.older = nil 130 | } 131 | bs.cacheold = i.newer 132 | bs.cachelen-- 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /qci/test_readwrite2.ipy: -------------------------------------------------------------------------------- 1 | 2 | import qdf 3 | import qdf.quasar 4 | import sys 5 | import random 6 | import uuid 7 | import time 8 | from twisted.internet import defer, protocol, reactor 9 | print "entered test readwrite1" 10 | EXIT_CODE = None 11 | def setexit(code): 12 | global EXIT_CODE 13 | EXIT_CODE = code 14 | reactor.stop() 15 | 16 | @defer.inlineCallbacks 17 | def testbody(db): 18 | print "connected" 19 | TOTALPOINTS = 1000000 20 | PERINSERT = 1000 21 | INTERVAL = int(1E9/120.) 22 | UID = str(uuid.uuid4()) 23 | randomdata = [(x*INTERVAL, random.random()) for x in xrange(TOTALPOINTS)] 24 | idx = 0 25 | print "random data generated" 26 | for i in xrange(TOTALPOINTS/PERINSERT): 27 | yield db.insertValues(UID, randomdata[idx:idx+PERINSERT]) 28 | idx += PERINSERT 29 | time.sleep(20) 30 | readdata = [] 31 | idx = 0 32 | print "reading data" 33 | for i in xrange(TOTALPOINTS/PERINSERT): 34 | (status, rv) = yield db.queryStandardValues(UID, i*INTERVAL*PERINSERT, (i+1)*INTERVAL*PERINSERT) 35 | (version, values) = rv 36 | readdata += [(v.time, v.value) for v in values] 37 | print "len readdata:",len(readdata) 38 | print "len insert:",len(randomdata) 39 | for i in xrange(len(randomdata)): 40 | if randomdata[i][0] != readdata[i][0]: 41 | print "time mismatch index",i 42 | break 43 | if randomdata[i][1] != readdata[i][1]: 44 | print "value mismatch index",i 45 | break 46 | else: 47 | print "lists match" 48 | #delete middle 1/3 of data 49 | st = randomdata[len(randomdata)/3][0] 50 | et = randomdata[2*len(randomdata)/3][0] 51 | (status, rv) = yield db.deleteRange(UID, st, et) 52 | #also delete it from our data 53 | ndat = randomdata[0:len(randomdata)/3] #exlusive 54 | ndat += randomdata[2*len(randomdata)/3:] #inclusive 55 | 56 | readdata = [] 57 | print "reading data AFTER DELETE" 58 | for i in xrange(TOTALPOINTS/PERINSERT): 59 | (status, rv) = yield db.queryStandardValues(UID, i*INTERVAL*PERINSERT, (i+1)*INTERVAL*PERINSERT) 60 | (version, values) = rv 61 | readdata += [(v.time, v.value) for v in values] 62 | print "len readdata:",len(readdata) 63 | print "len insert:",len(ndat) 64 | odataskip = randomdata[666664:666668] 65 | print "odataskip:",odataskip 66 | for i in xrange(len(ndat)): 67 | if ndat[i][0] != readdata[i][0]: 68 | print "time mismatch index",i 69 | break 70 | if ndat[i][1] != readdata[i][1]: 71 | print "value mismatch index",i 72 | print "received",readdata[i][1] 73 | print "expected",ndat[i][1] 74 | print "nearby expected", ndat[i-2:i+2] 75 | print "nearby received", readdata[i-2:i+2] 76 | print "nearby ODAT", randomdata[i-2:i+2] 77 | break 78 | else: 79 | print "lists match" 80 | setexit(0) 81 | return 82 | 83 | setexit(1) 84 | return 85 | 86 | def onFail(param): 87 | print "Encountered error: ", param 88 | setexit(2) 89 | 90 | def entrypoint(): 91 | print "in entrypoint" 92 | try: 93 | q = qdf.quasar.connectToArchiver("localhost", 4410) 94 | q.addCallback(testbody) 95 | q.addErrback(onFail) 96 | except Exception as e: 97 | print "ex: ",e 98 | setexit(1) 99 | 100 | reactor.callWhenRunning(entrypoint) 101 | reactor.run() 102 | if EXIT_CODE == None: 
103 | EXIT_CODE = 42 104 | if EXIT_CODE != 0: 105 | sys.exit(EXIT_CODE) 106 | else: 107 | !rm FAILURE 108 | -------------------------------------------------------------------------------- /qci/test_changedrange.ipy: -------------------------------------------------------------------------------- 1 | 2 | import qdf 3 | import qdf.quasar 4 | import sys 5 | import random 6 | import uuid 7 | import time 8 | from twisted.internet import defer, protocol, reactor 9 | print "entered test changedrange" 10 | EXIT_CODE = None 11 | def setexit(code): 12 | global EXIT_CODE 13 | EXIT_CODE = code 14 | reactor.stop() 15 | 16 | @defer.inlineCallbacks 17 | def testbody(db): 18 | print "connected" 19 | TOTALPOINTS = 1000000 20 | PERINSERT = 1000 21 | INTERVAL = int(1E9/120.) 22 | UID = str(uuid.uuid4()) 23 | OFFSET = random.randrange(100,1000000000000) 24 | randomdata = [(OFFSET + x*INTERVAL, random.random()) for x in xrange(TOTALPOINTS)] 25 | e_t = randomdata[-1][0] 26 | s_t = OFFSET 27 | print "SET: ", randomdata[0], randomdata[-1] 28 | print "e_t:", e_t 29 | print "s_t:", s_t 30 | idx = 0 31 | print "random data generated" 32 | for i in xrange(TOTALPOINTS/PERINSERT): 33 | yield db.insertValues(UID, randomdata[idx:idx+PERINSERT]) 34 | idx += PERINSERT 35 | 36 | #immediate query 37 | srep = [] 38 | (status, rv) = yield db.queryStatisticalValues(UID, 0, (1<<55), 55) 39 | print "status: ", status 40 | (version, values) = rv 41 | for v in values: 42 | srep.append([v.time, v.min, v.mean, v.max, v.count]) 43 | print "preflush:", srep 44 | print "version:", version 45 | #preflush_count = srep[0][4] 46 | preflush_count = 0 47 | print "flushing" 48 | yield db.flush(UID) 49 | 50 | srep = [] 51 | (status, rv) = yield db.queryStatisticalValues(UID, 0, (1<<55), 55) 52 | print "status: ", status 53 | (version, values) = rv 54 | for v in values: 55 | srep.append([v.time, v.min, v.mean, v.max, v.count]) 56 | 57 | #postflush_count = srep[0][4] 58 | print "postflush:", srep 59 | print "version:", version 60 | 61 | print "flushing2" 62 | yield db.flush(UID) 63 | 64 | srep = [] 65 | (status, rv) = yield db.queryStatisticalValues(UID, 0, (1<<55), 55) 66 | print "status2: ", status 67 | (version, values) = rv 68 | for v in values: 69 | srep.append([v.time, v.min, v.mean, v.max, v.count]) 70 | 71 | #postflush_count = srep[0][4] 72 | print "postflush2:", srep 73 | print "version2:", version 74 | 75 | #print "prepost counts: ",preflush_count, postflush_count 76 | 77 | def expected_cr(st, et, res): 78 | return st & ~((1<>>> USING %v AS SEED <<<<<", sd) 12 | rand.Seed(sd) 13 | } 14 | 15 | func Test_DeCompose(t *testing.T) { 16 | for i := 0; i < 16; i++ { 17 | x := rand.Float64() 18 | packed_m, packed_e := decompose(x) 19 | //log.Warning("x= %v m=%v e=%v",x, packed_m, packed_e) 20 | rv := recompose(packed_m, packed_e) 21 | if rv != x { 22 | t.Errorf("Number did not convert: +v", x) 23 | } 24 | } 25 | for i := 0; i < 10000000; i++ { 26 | x := rand.Float64() 27 | packed_m, packed_e := decompose(x) 28 | rv := recompose(packed_m, packed_e) 29 | if rv != x { 30 | t.Errorf("Number did not convert: +v", x) 31 | } 32 | } 33 | } 34 | 35 | func Test_2DeCompose(t *testing.T) { 36 | log.Warning("testing") 37 | for i := 0; i < 16; i++ { 38 | x := float64(i * 100000.0) 39 | packed_m, packed_e := decompose(x) 40 | rv := recompose(packed_m, packed_e) 41 | if rv != x { 42 | t.Errorf("Number did not convert: exp %v got %v", x, rv) 43 | } 44 | } 45 | } 46 | 47 | func Test_CB1(t *testing.T) { 48 | c := new(Coreblock) 49 | for i := 0; i < KFACTOR; i++ { 
50 | c.Addr[i] = uint64(i + 1) 51 | } 52 | sarr := make([]byte, CBSIZE) 53 | donearr := c.Serialize(sarr) 54 | cn := new(Coreblock) 55 | cn.Deserialize(donearr) 56 | if !CompareNoTags(*c, *cn, []string{"implicit"}) { 57 | t.Error("Core block SERDES faled") 58 | } 59 | } 60 | 61 | func Test_Pack1(t *testing.T) { 62 | tst := func(x uint64) int { 63 | b := make([]byte, 9) 64 | ln := writeUnsignedHuff(b, x) 65 | for i := ln; i < 9; i++ { 66 | if b[i] != 0 { 67 | t.Errorf("Unexpected non-null byte") 68 | } 69 | } 70 | xr, _, _ := readUnsignedHuff(b) 71 | if xr != x { 72 | t.Errorf("Number did not match:", x, xr) 73 | } 74 | return ln 75 | } 76 | //First test around the boundaries 77 | var order uint64 78 | for order = 0; order < 64; order++ { 79 | for offset := -4; offset < 4; offset++ { 80 | x := uint64((1 << order) + offset) 81 | tst(x) 82 | } 83 | } 84 | 85 | //Now test that the huff boundaries have the write number of chars 86 | bcheck := []struct { 87 | n uint64 88 | exp int 89 | }{ 90 | {(1 << 7) - 1, 1}, 91 | {(1 << 7), 2}, 92 | {(1 << 14) - 1, 2}, 93 | {(1 << 14), 3}, 94 | {(1 << 20) - 1, 3}, 95 | {(1 << 20), 4}, 96 | {(1 << 28) - 1, 4}, 97 | {(1 << 28), 5}, 98 | {(1 << 36) - 1, 5}, 99 | {(1 << 36), 6}, 100 | {(1 << 42) - 1, 6}, 101 | {(1 << 42), 7}, 102 | {(1 << 50) - 1, 7}, 103 | {(1 << 50), 8}, 104 | {(1 << 58) - 1, 8}, 105 | {(1 << 58), 9}, 106 | {0xFFFFFFFFFFFFFFFF, 9}, 107 | } 108 | for _, ob := range bcheck { 109 | l := tst(ob.n) 110 | if l != ob.exp { 111 | t.Errorf("Did not get expected number of bytes out test=", ob, l) 112 | } 113 | } 114 | 115 | //Check the big number 116 | tst(0xFFFFFFFFFFFFFFFF) 117 | 118 | //Check the small number 119 | tst(0) 120 | 121 | //Check random numbers 122 | for i := 0; i < 100000; i++ { 123 | x := uint64(rand.Int63()) 124 | tst(x) 125 | } 126 | } 127 | 128 | func Test_Pack2(t *testing.T) { 129 | //Unsigned numbers are probably covered ok, lets try a few signed numbers 130 | //Check random numbers 131 | tst := func(x int64) int { 132 | b := make([]byte, 9) 133 | ln := writeSignedHuff(b, x) 134 | for i := ln; i < 9; i++ { 135 | if b[i] != 0 { 136 | t.Errorf("Unexpected non-null byte") 137 | } 138 | } 139 | xr, _, _ := readSignedHuff(b) 140 | if xr != x { 141 | t.Errorf("Number did not match:", x, xr) 142 | } 143 | return ln 144 | } 145 | for i := 0; i < 10000000; i++ { 146 | x := rand.Int63() 147 | tst(x) 148 | } 149 | tst(-1) 150 | tst(-0x7FFFFFFFFFFFFFFF) 151 | tst(0x7FFFFFFFFFFFFFFF) 152 | } 153 | -------------------------------------------------------------------------------- /qci/utils.ipy: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | def getid(): 4 | return int(time.time() - 1423015475) 5 | 6 | def build_loadgen(totalrecords, numstreams, pointspermessage): 7 | !go get github.com/SoftwareDefinedBuildings/quasarloadgenerator 8 | !git clone https://github.com/SoftwareDefinedBuildings/quasarloadgenerator 9 | !cd quasarloadgenerator && git checkout delete-data 10 | !cd quasarloadgenerator && go get -d ... 11 | !cd quasarloadgenerator && go build -o ../loadgen . 
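# Typical flow (see qci/runtests.ipy): build_loadgen() fetches and builds the load generator
# as ./loadgen, mkceph_*() prepares a ceph pool (or none for the file-store case), mkconf()
# writes quasar.conf, and the tests then drive the freshly built ./exe server.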
12 | 13 | def mkconf(cephpool, collection, filepath="/srv/quasar/"): 14 | if cephpool is not None: 15 | conf=""" 16 | [storage] 17 | provider=ceph 18 | cephconf=/etc/ceph/ceph.conf 19 | cephpool={cephpool} 20 | """.format(cephpool=cephpool) 21 | else: 22 | conf=""" 23 | [storage] 24 | provider=file 25 | filepath={filepath} 26 | """.format(filepath=filepath) 27 | conf = conf + """ 28 | [http] 29 | enabled=true 30 | port=9000 31 | address=0.0.0.0 32 | 33 | [capnp] 34 | enabled=true 35 | port=4410 36 | address=0.0.0.0 37 | 38 | [mongo] 39 | server=localhost 40 | collection={collection} 41 | 42 | [debug] 43 | heapprofile=true 44 | cpuprofile=true 45 | 46 | [cache] 47 | # Configure the RADOS and block caches. If you have a choice, rather 48 | # spend memory on the block cache. 49 | 50 | # This is measured in blocks, which are at most ~16K 51 | blockcache=62500 #1 GB 52 | 53 | # Choose a RADOS cache roughly equal to (num concurrent reads) * (object size) 54 | # the transaction size is at most 16 MB, but is usually around 1.6MB. The 55 | # objects can vary in size, so the cache can be capped either in quantity or 56 | # in total size (or both) 57 | radoscachecount=2048 #in objects 58 | radoscachesize=256 #in MB 59 | 60 | [coalescence] 61 | earlytrip=16384 #readings 62 | interval=5000 #ms 63 | """.format(collection=collection) 64 | with open("quasar.conf","w") as f: 65 | f.write(conf) 66 | 67 | def wait_for_stable_ceph(): 68 | x = !ceph -s 69 | while any(("creating" in y) or ("peering" in y) or ("unclean" in y) for y in x): 70 | print "Waiting for creation:" 71 | print x 72 | time.sleep(5) 73 | x = !ceph -s 74 | 75 | def mkceph_local(cephpool): 76 | !ceph osd pool create $cephpool 4096 4096 replicated local 2 77 | time.sleep(5) 78 | wait_for_stable_ceph() 79 | 80 | def mkceph_remote(cephpool): 81 | !ceph osd pool create $cephpool 4096 4096 replicated remote 2 82 | time.sleep(5) 83 | wait_for_stable_ceph() 84 | 85 | def mkceph_tier(cephpool): 86 | cache = cephpool+"-cache" 87 | !ceph osd pool create $cephpool 4096 4096 replicated remote 2 88 | time.sleep(5) 89 | !ceph osd pool create $cache 4096 4096 replicated local 2 90 | time.sleep(5) 91 | wait_for_stable_ceph() 92 | !ceph osd tier add $cephpool $cache 93 | !ceph osd tier cache-mode $cache writeback 94 | !ceph osd tier set-overlay $cephpool $cache 95 | !ceph osd pool set $cache hit_set_type bloom 96 | !ceph osd pool set $cache hit_set_period 7200 97 | !ceph osd pool set $cache cache_min_flush_age 120 98 | wait_for_stable_ceph() 99 | 100 | def mkceph_primary(cephpool): 101 | !ceph osd pool create $cephpool 4096 4096 replicated primary 2 102 | time.sleep(5) 103 | wait_for_stable_ceph() 104 | 105 | def delceph_pool(cephpool): 106 | !ceph osd pool delete $cephpool $cephpool --yes-i-really-really-mean-it 107 | 108 | def delceph_tier(cephpool): 109 | cache = cephpool+"-cache" 110 | !ceph osd tier cache-mode $cache forward 111 | !rados -p $cache cache-flush-evict-all > log.evict 112 | !ceph osd tier remove-overlay $cephpool 113 | !ceph osd tier remove $cephpool $cache 114 | delceph_pool(cache) 115 | delceph_pool(cephpool) 116 | 117 | #get QDF pulled 118 | !git clone https://github.com/SoftwareDefinedBuildings/QDF.git 119 | !mv QDF/qdf . 
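# qdf supplies the Twisted-based client bindings (qdf.quasar.connectToArchiver) imported by
# test_readwrite1.ipy, test_readwrite2.ipy and test_changedrange.ipy.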
120 | -------------------------------------------------------------------------------- /btrdbd/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "os" 7 | "os/signal" 8 | "runtime" 9 | "runtime/pprof" 10 | "strconv" 11 | "time" 12 | 13 | "github.com/SoftwareDefinedBuildings/btrdb" 14 | "github.com/SoftwareDefinedBuildings/btrdb/cpinterface" 15 | "github.com/SoftwareDefinedBuildings/btrdb/httpinterface" 16 | "github.com/SoftwareDefinedBuildings/btrdb/internal/bstore" 17 | "github.com/op/go-logging" 18 | ) 19 | 20 | var log *logging.Logger 21 | 22 | func init() { 23 | logging.SetFormatter(logging.MustStringFormatter("%{color}%{shortfile} ▶%{color:reset} %{message}")) 24 | log = logging.MustGetLogger("log") 25 | 26 | } 27 | 28 | var createDB = flag.Bool("makedb", false, "create a new database") 29 | 30 | func main() { 31 | loadConfig() 32 | flag.Parse() 33 | 34 | go func() { 35 | for { 36 | time.Sleep(10 * time.Second) 37 | fmt.Println("Num goroutines: ", runtime.NumGoroutine()) 38 | } 39 | }() 40 | if Configuration.Debug.Cpuprofile { 41 | f, err := os.Create("profile.cpu") 42 | if err != nil { 43 | log.Panicf("Error creating CPU profile: %v", err) 44 | } 45 | f2, err := os.Create("profile.block") 46 | if err != nil { 47 | log.Panicf("Error creating Block profile: %v", err) 48 | } 49 | pprof.StartCPUProfile(f) 50 | runtime.SetBlockProfileRate(1) 51 | defer runtime.SetBlockProfileRate(0) 52 | defer pprof.Lookup("block").WriteTo(f2, 1) 53 | defer pprof.StopCPUProfile() 54 | } 55 | 56 | if *createDB { 57 | fmt.Printf("Creating a new database\n") 58 | bstore.CreateDatabase(Params) 59 | fmt.Printf("Done\n") 60 | os.Exit(0) 61 | } 62 | nCPU := runtime.NumCPU() 63 | runtime.GOMAXPROCS(nCPU) 64 | cfg := btrdb.QuasarConfig{ 65 | DatablockCacheSize: uint64(Configuration.Cache.BlockCache), 66 | TransactionCoalesceEnable: true, 67 | TransactionCoalesceInterval: uint64(*Configuration.Coalescence.Interval), 68 | TransactionCoalesceEarlyTrip: uint64(*Configuration.Coalescence.Earlytrip), 69 | Params: Params, 70 | } 71 | q, err := btrdb.NewQuasar(&cfg) 72 | if err != nil { 73 | log.Panicf("error: ", err) 74 | } 75 | 76 | if Configuration.Http.Enabled { 77 | go httpinterface.QuasarServeHTTP(q, *Configuration.Http.Address+":"+strconv.FormatInt(int64(*Configuration.Http.Port), 10)) 78 | } 79 | if Configuration.Capnp.Enabled { 80 | go cpinterface.ServeCPNP(q, "tcp", *Configuration.Capnp.Address+":"+strconv.FormatInt(int64(*Configuration.Capnp.Port), 10)) 81 | } 82 | 83 | if Configuration.Debug.Heapprofile { 84 | go func() { 85 | idx := 0 86 | for { 87 | f, err := os.Create(fmt.Sprintf("profile.heap.%05d", idx)) 88 | if err != nil { 89 | log.Panicf("Could not create memory profile %v", err) 90 | } 91 | idx = idx + 1 92 | pprof.WriteHeapProfile(f) 93 | f.Close() 94 | time.Sleep(30 * time.Second) 95 | } 96 | }() 97 | } 98 | 99 | sigchan := make(chan os.Signal, 1) 100 | signal.Notify(sigchan, os.Interrupt) 101 | 102 | for { 103 | time.Sleep(5 * time.Second) 104 | log.Info("Still alive") 105 | 106 | select { 107 | case _ = <-sigchan: 108 | log.Warning("Received Ctrl-C, waiting for graceful shutdown") 109 | time.Sleep(4 * time.Second) //Allow http some time 110 | log.Warning("Checking for pending inserts") 111 | for { 112 | if q.IsPending() { 113 | log.Warning("Pending inserts... waiting... 
") 114 | time.Sleep(2 * time.Second) 115 | } else { 116 | log.Warning("No pending inserts") 117 | break 118 | } 119 | } 120 | if Configuration.Debug.Heapprofile { 121 | log.Warning("writing heap profile") 122 | f, err := os.Create("profile.heap.FIN") 123 | if err != nil { 124 | log.Panicf("Could not create memory profile %v", err) 125 | } 126 | pprof.WriteHeapProfile(f) 127 | f.Close() 128 | 129 | } 130 | return //end the program 131 | default: 132 | 133 | } 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /internal/cephprovider/cephcache.go: -------------------------------------------------------------------------------- 1 | package cephprovider 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | //"runtime" 7 | ) 8 | 9 | //We are caching 1MB blocks for read, so the address should have the bottom 20 bits clear 10 | const R_ADDRMASK = ^((uint64(1) << 20) - 1) 11 | const R_OFFSETMASK = (uint64(1) << 20) - 1 12 | 13 | type CephCache struct { 14 | cachemap map[uint64]*CacheItem 15 | cachemiss uint64 16 | cachehit uint64 17 | cacheold *CacheItem 18 | cachenew *CacheItem 19 | cachemtx sync.Mutex 20 | cachelen uint64 21 | cachemax uint64 22 | cacheinv uint64 23 | pool *sync.Pool 24 | } 25 | type CacheItem struct { 26 | val []byte 27 | addr uint64 28 | newer *CacheItem 29 | older *CacheItem 30 | } 31 | 32 | func (cc *CephCache) initCache(size uint64) { 33 | cc.cachemax = size 34 | cc.cachemap = make(map[uint64]*CacheItem, size) 35 | cc.pool = &sync.Pool{ 36 | New: func() interface{} { 37 | return make([]byte, R_CHUNKSIZE) 38 | }, 39 | } 40 | 41 | go func() { 42 | for { 43 | log.Info("Ceph BlockCache: %d invs %d misses, %d hits, %.2f %%", 44 | cc.cacheinv, cc.cachemiss, cc.cachehit, (float64(cc.cachehit*100) / float64(cc.cachemiss+cc.cachehit))) 45 | time.Sleep(5 * time.Second) 46 | } 47 | }() 48 | } 49 | 50 | //This function must be called with the mutex held 51 | func (cc *CephCache) cachePromote(i *CacheItem) { 52 | if cc.cachenew == i { 53 | //Already at front 54 | return 55 | } 56 | if i.newer != nil { 57 | i.newer.older = i.older 58 | } 59 | if i.older != nil { 60 | i.older.newer = i.newer 61 | } 62 | if cc.cacheold == i && i.newer != nil { 63 | //This was the tail of a list longer than 1 64 | cc.cacheold = i.newer 65 | } else if cc.cacheold == nil { 66 | //This was/is the only item in the list 67 | cc.cacheold = i 68 | } 69 | 70 | i.newer = nil 71 | i.older = cc.cachenew 72 | if cc.cachenew != nil { 73 | cc.cachenew.newer = i 74 | } 75 | cc.cachenew = i 76 | } 77 | 78 | func (cc *CephCache) cachePut(addr uint64, item []byte) { 79 | if cc.cachemax == 0 { 80 | return 81 | } 82 | cc.cachemtx.Lock() 83 | i, ok := cc.cachemap[addr] 84 | if ok { 85 | cc.cachePromote(i) 86 | } else { 87 | i = &CacheItem{ 88 | val: item, 89 | addr: addr, 90 | } 91 | cc.cachemap[addr] = i 92 | cc.cachePromote(i) 93 | cc.cachelen++ 94 | cc.cacheCheckCap() 95 | } 96 | cc.cachemtx.Unlock() 97 | } 98 | 99 | func (cc *CephCache) getBlank() []byte { 100 | rv := cc.pool.Get().([]byte) 101 | rv = rv[0:R_CHUNKSIZE] 102 | 103 | return rv 104 | } 105 | 106 | func (cc *CephCache) cacheGet(addr uint64) []byte { 107 | if cc.cachemax == 0 { 108 | cc.cachemiss++ 109 | return nil 110 | } 111 | cc.cachemtx.Lock() 112 | rv, ok := cc.cachemap[addr] 113 | if ok { 114 | cc.cachePromote(rv) 115 | } 116 | cc.cachemtx.Unlock() 117 | if ok { 118 | cc.cachehit++ 119 | return rv.val 120 | } else { 121 | cc.cachemiss++ 122 | return nil 123 | } 124 | } 125 | 126 | //This is rare and only happens if 
the block cache is too small 127 | func (cc *CephCache) cacheInvalidate(addr uint64) { 128 | if cc.cachemax == 0 { 129 | return 130 | } 131 | cc.cachemtx.Lock() 132 | i, ok := cc.cachemap[addr] 133 | if ok { 134 | if i.newer != nil { 135 | i.newer.older = i.older 136 | } 137 | if i.older != nil { 138 | i.older.newer = i.newer 139 | } 140 | if cc.cacheold == i { 141 | //This was the tail of a list longer than 1 142 | cc.cacheold = i.newer 143 | } 144 | if cc.cachenew == i { 145 | cc.cachenew = i.older 146 | } 147 | cc.cachelen-- 148 | cc.cacheinv++ 149 | delete(cc.cachemap, addr) 150 | } 151 | cc.cachemtx.Unlock() 152 | } 153 | 154 | //This must be called with the mutex held 155 | func (cc *CephCache) cacheCheckCap() { 156 | for cc.cachelen > cc.cachemax { 157 | i := cc.cacheold 158 | 159 | delete(cc.cachemap, i.addr) 160 | if i.newer != nil { 161 | i.newer.older = nil 162 | } 163 | cc.cacheold = i.newer 164 | cc.cachelen-- 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /qci/runtests.ipy: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ipython 2 | from multiprocessing import Process, Pipe 3 | import os 4 | import time 5 | import sys 6 | import subprocess 7 | import uuid 8 | import pymongo 9 | %run qci/utils.ipy 10 | runid = getid() 11 | print "RUN ID IS", runid 12 | build_loadgen(10000000,20,1000) 13 | cephpool = "q"+str(runid) 14 | collection = "q"+str(runid) 15 | filestore = "q"+str(runid) 16 | if "CEPHTYPE" not in os.environ or os.environ["CEPHTYPE"] == "local": 17 | mkceph_local(cephpool) 18 | elif os.environ["CEPHTYPE"] == "remote": 19 | mkceph_remote(cephpool) 20 | elif os.environ["CEPHTYPE"] == "primary": 21 | mkceph_primary(cephpool) 22 | elif os.environ["CEPHTYPE"] == "tier": 23 | mkceph_tier(cephpool) 24 | elif os.environ["CEPHTYPE"] == "filestore": 25 | cephpool = None 26 | 27 | mkconf(cephpool, collection, filestore) 28 | 29 | #Create database 30 | rc = subprocess.call(["./exe","-makedb"]) 31 | print "rc0", rc 32 | if rc != 0: 33 | sys.exit(rc) 34 | 35 | #start quasar 36 | def start_q_(): 37 | stdout=open("log.q.stdout.%d" % (int(time.time())),"w") 38 | rc = subprocess.call(["./exe"],stdout=stdout, stderr=subprocess.STDOUT) 39 | print "rc1", rc 40 | if rc != 0: 41 | sys.exit(rc) 42 | 43 | 44 | def start_quasar(): 45 | global p 46 | p = Process(target=start_q_) 47 | p.start() 48 | 49 | def term_quasar(): 50 | #send sigint 51 | #os.kill(p.pid, 2) 52 | !pkill --signal 2 exe 53 | 54 | time.sleep(120) 55 | #os.kill(p.pid, 9) 56 | !pkill --signal 9 exe 57 | 58 | def kill_quasar(): 59 | !pkill --signal 9 exe 60 | 61 | def proc_profiles(pfx): 62 | !go tool pprof -text -cum exe profile.cpu > log.profile.cpu.cum 63 | !go tool pprof -text exe profile.cpu > log.profile.cpu 64 | hps = !ls profile.heap.* 65 | for hp in hps: 66 | num = hp.split(".")[-1] 67 | !go tool pprof -text exe $hp > temp 68 | tot = !cat temp | head -n 1 | cut -d ' ' -f 3 69 | tot = tot[0] 70 | fname = "log.heap."+pfx+"."+num+"___"+tot 71 | !mv temp $fname 72 | !rm -f profile.heap.* 73 | 74 | start_quasar() 75 | #wait a bit 76 | time.sleep(10) 77 | 78 | if not p.is_alive(): 79 | print "quasar died:", p.exitcode 80 | sys.exit(1) 81 | 82 | !rm FAILURE 83 | !touch FAILURE 84 | if "TEST_TYPE" not in os.environ or os.environ["TEST_TYPE"] == "loadgen": 85 | %run qci/test_loadgen.ipy 86 | elif os.environ["TEST_TYPE"] == "readwrite1": 87 | print "running reqdwrite1" 88 | %run qci/test_readwrite1.ipy 89 | print "run 
complete" 90 | elif os.environ["TEST_TYPE"] == "readwrite2": 91 | print "running reqdwrite2" 92 | %run qci/test_readwrite2.ipy 93 | print "run complete" 94 | elif os.environ["TEST_TYPE"] == "readstat1": 95 | print "running readstat1" 96 | %run qci/test_readstat1.ipy 97 | print "run complete" 98 | elif os.environ["TEST_TYPE"] == "loadgen2": 99 | print "running loadgen2" 100 | %run qci/test_loadgen2.ipy 101 | print "run complete" 102 | elif os.environ["TEST_TYPE"] == "loadgen3": 103 | print "running loadgen3" 104 | %run qci/test_loadgen3.ipy 105 | print "run complete" 106 | elif os.environ["TEST_TYPE"] == "endpoint": 107 | print "running endpoint" 108 | %run qci/test_endpoint.ipy 109 | print "run complete" 110 | elif os.environ["TEST_TYPE"] == "changedrange": 111 | print "running changedrange" 112 | %run qci/test_changedrange.ipy 113 | print "run complete" 114 | 115 | failed = !cat FAILURE; echo $? 116 | failed = (failed[-1] == "0") 117 | 118 | if not p.is_alive(): 119 | print "quasar died:", p.exitcode 120 | sys.exit(1) 121 | 122 | if not failed: 123 | print "WRITING SUCCESS FILE" 124 | with open("success","w") as f: 125 | f.write("OK\n") 126 | 127 | term_quasar() 128 | 129 | proc_profiles("end") 130 | 131 | if os.environ["CEPHTYPE"] == "tier": 132 | delceph_tier(cephpool) 133 | elif os.environ["CEPHTYPE"] == "filestore": 134 | !rm -r {filestore} 135 | cl = pymongo.MongoClient() 136 | cl.drop_database(collection) 137 | cl.disconnect() 138 | else: 139 | delceph_pool(cephpool) 140 | 141 | print "done" 142 | 143 | if failed: 144 | sys.exit(1) 145 | 146 | -------------------------------------------------------------------------------- /qtree/operators.go: -------------------------------------------------------------------------------- 1 | package qtree 2 | 3 | import ( 4 | "math" 5 | 6 | "github.com/SoftwareDefinedBuildings/btrdb/internal/bstore" 7 | ) 8 | 9 | func (n *QTreeNode) OpCountMean() (uint64, float64) { 10 | total := 0.0 11 | cnt := uint64(0) 12 | if n.isLeaf { 13 | for i := 0; i < int(n.vector_block.Len); i++ { 14 | total += n.vector_block.Value[i] 15 | } 16 | return uint64(n.vector_block.Len), total / float64(n.vector_block.Len) 17 | } else { 18 | for i := 0; i < bstore.KFACTOR; i++ { 19 | if n.core_block.Count[i] == 0 { 20 | continue 21 | } 22 | cnt += n.core_block.Count[i] 23 | total += n.core_block.Mean[i] * float64(n.core_block.Count[i]) 24 | } 25 | return cnt, total / float64(cnt) 26 | } 27 | } 28 | 29 | func (n *QTreeNode) OpMin() float64 { 30 | if n.isLeaf { 31 | min := n.vector_block.Value[0] 32 | for i := 0; i < int(n.vector_block.Len); i++ { 33 | if n.vector_block.Value[i] < min { 34 | min = n.vector_block.Value[i] 35 | } 36 | } 37 | return min 38 | } else { 39 | min := float64(0) 40 | minset := false 41 | for i := 0; i < len(n.core_block.Min); i++ { 42 | if n.core_block.Count[i] == 0 { 43 | continue 44 | } 45 | if !minset || n.core_block.Min[i] < min { 46 | min = n.core_block.Min[i] 47 | minset = true 48 | } 49 | } 50 | return min 51 | } 52 | } 53 | 54 | func (n *QTreeNode) OpMax() float64 { 55 | if n.isLeaf { 56 | max := n.vector_block.Value[0] 57 | for i := 0; i < int(n.vector_block.Len); i++ { 58 | if n.vector_block.Value[i] > max { 59 | max = n.vector_block.Value[i] 60 | } 61 | } 62 | return max 63 | } else { 64 | max := float64(0) 65 | maxset := false 66 | for i := 0; i < len(n.core_block.Max); i++ { 67 | if n.core_block.Count[i] == 0 { 68 | continue 69 | } 70 | if !maxset || n.core_block.Max[i] > max { 71 | max = n.core_block.Max[i] 72 | maxset = true 73 | } 74 | } 
75 | return max 76 | } 77 | } 78 | 79 | /* 80 | 81 | ok so here is the problem. If we call opreduce on a core node, then we can only deliver 82 | pointwidths GREATER than our pointwidth and less than pointwidth + 6 right? 83 | but as a leaf we can potentially deliver pointwidths down to 0... 84 | */ 85 | func (n *QTreeNode) OpReduce(pointwidth uint8, index uint64) (uint64, float64, float64, float64) { 86 | if !n.isLeaf && pointwidth < n.PointWidth() { 87 | log.Panic("Bad pointwidth for core. See code comment") 88 | } 89 | if pointwidth > n.PointWidth()+PWFACTOR { 90 | log.Panic("Can't guarantee this PW") 91 | } 92 | maxpw := n.PointWidth() + PWFACTOR 93 | pwdelta := pointwidth - n.PointWidth() 94 | width := int64(1) << pointwidth 95 | maxidx := 1 << (maxpw - pointwidth) 96 | if maxidx <= 0 || index >= uint64(maxidx) { 97 | log.Critical("node is %s", n.TreePath()) 98 | log.Panic("bad index", maxidx, index) 99 | } 100 | sum := 0.0 101 | min := math.NaN() 102 | max := math.NaN() 103 | minset := false 104 | maxset := false 105 | count := uint64(0) 106 | if n.isLeaf { 107 | st := n.StartTime() + int64(index)*width 108 | et := st + width 109 | if n.vector_block.Len != 0 { 110 | for i := 0; i < int(n.vector_block.Len); i++ { 111 | if n.vector_block.Time[i] < st { 112 | continue 113 | } 114 | if n.vector_block.Time[i] >= et { 115 | break 116 | } 117 | v := n.vector_block.Value[i] 118 | sum += v 119 | if !minset || v < min { 120 | minset = true 121 | min = v 122 | } 123 | if !maxset || v > max { 124 | maxset = true 125 | max = v 126 | } 127 | count++ 128 | } 129 | } 130 | return count, min, sum / float64(count), max 131 | } else { 132 | s := index << pwdelta 133 | e := (index + 1) << pwdelta 134 | for i := s; i < e; i++ { 135 | if n.core_block.Count[i] == 0 { 136 | continue 137 | } 138 | count += n.core_block.Count[i] 139 | sum += n.core_block.Mean[i] * float64(n.core_block.Count[i]) 140 | if !minset || n.core_block.Min[i] < min { 141 | minset = true 142 | min = n.core_block.Min[i] 143 | } 144 | if !maxset || n.core_block.Max[i] > max { 145 | maxset = true 146 | max = n.core_block.Max[i] 147 | } 148 | } 149 | mean := sum / float64(count) 150 | return count, min, mean, max 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /internal/bprovider/bprovider_test.go: -------------------------------------------------------------------------------- 1 | package bprovider_test 2 | 3 | import ( 4 | "math/rand" 5 | "sync" 6 | "testing" 7 | "time" 8 | 9 | "github.com/SoftwareDefinedBuildings/btrdb/internal/bprovider" 10 | "github.com/SoftwareDefinedBuildings/btrdb/internal/cephprovider" 11 | "github.com/SoftwareDefinedBuildings/btrdb/internal/fileprovider" 12 | "github.com/op/go-logging" 13 | ) 14 | 15 | var log *logging.Logger 16 | 17 | func init() { 18 | log = logging.MustGetLogger("log") 19 | } 20 | 21 | func makeFileProvider() *fileprovider.FileStorageProvider { 22 | params := map[string]string{ 23 | "dbpath": "/srv/quasartestdb/", 24 | } 25 | fp := new(fileprovider.FileStorageProvider) 26 | err := fp.CreateDatabase(params) 27 | if err != nil { 28 | log.Panicf("Error on create %v", err) 29 | } 30 | fp.Initialize(params) 31 | return fp 32 | } 33 | 34 | func makeCephProvider() *cephprovider.CephStorageProvider { 35 | params := map[string]string{} 36 | cp := new(cephprovider.CephStorageProvider) 37 | /*err := cp.CreateDatabase(params) 38 | if err != nil { 39 | log.Panicf("Error on create %v",err) 40 | }*/ 41 | cp.Initialize(params) 42 | return cp 43 | } 44 | 
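// The two helpers above construct concrete providers for the generic tests
// below (x_RW1 and x_RWFuzz), which exercise the bprovider.Segment contract:
// LockSegment hands out an exclusive segment, Write returns the address at
// which the *next* write should happen, Unlock implies a flush, and Read
// returns a slice of the supplied buffer trimmed to the stored length.
//
// An illustrative sketch of that flow (hypothetical code, `payload` is a
// placeholder and not part of this file):
//
//	seg := sp.LockSegment()
//	addr := seg.BaseAddress()
//	next, err := seg.Write(addr, payload) // check err; use next for the following write
//	seg.Unlock()                          // Unlock implies a flush
//	echo := sp.Read(addr, make([]byte, 30000))
//	// len(echo) should equal len(payload) if the round trip worked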
45 | func TestCephInitDB(t *testing.T) { 46 | params := map[string]string{} 47 | cp := new(cephprovider.CephStorageProvider) 48 | err := cp.CreateDatabase(params) 49 | if err != nil { 50 | log.Panicf("Error on create %v", err) 51 | } 52 | } 53 | 54 | func x_RW1(t *testing.T, sp bprovider.StorageProvider) { 55 | seg := sp.LockSegment() 56 | addr := seg.BaseAddress() 57 | data := make([]byte, 1024) 58 | for i := 0; i < 1024; i++ { 59 | data[i] = byte(i) 60 | } 61 | _, err := seg.Write(addr, data) 62 | if err != nil { 63 | t.Fatalf("Got error on write: %v", err) 64 | } 65 | seg.Unlock() 66 | 67 | //Read back 68 | rdata := make([]byte, 30000) 69 | rslice := sp.Read(addr, rdata) 70 | if len(rslice) != len(data) { 71 | t.Fatalf("Got wrong slice len back") 72 | } 73 | for i := 0; i < 1024; i++ { 74 | if rslice[i] != data[i] { 75 | t.Fatalf("Index %v differed got %v, expected %v", i, rslice[i], data[i]) 76 | } 77 | } 78 | } 79 | 80 | func x_RWFuzz(t *testing.T, sp bprovider.StorageProvider) { 81 | wg := sync.WaitGroup{} 82 | const par = 2096 83 | const seglimlim = 50 84 | const arrszlim = 20482 85 | const maxseeds = 1 86 | for si := 1; si <= maxseeds; si++ { 87 | log.Warning("Trying seed %v", si) 88 | rand.Seed(int64(si)) 89 | wg.Add(par) 90 | for li := 0; li < par; li++ { 91 | lic := li 92 | go func() { 93 | 94 | seg := sp.LockSegment() 95 | addr := seg.BaseAddress() 96 | log.Warning("Segment %v base addr 0x%016x", lic, addr) 97 | seglimit := 1 //rand.Int() % seglimlim 98 | stored_data := make([][]byte, seglimit) 99 | stored_addrs := make([]uint64, seglimit) 100 | for k := 0; k < seglimit; k++ { 101 | arrsize := rand.Int() % arrszlim 102 | data := make([]byte, arrsize) 103 | for i := 0; i < arrsize; i++ { 104 | data[i] = byte(rand.Int()) 105 | } 106 | stored_data[k] = data 107 | naddr, err := seg.Write(addr, data) 108 | if err != nil { 109 | log.Error("ea %v", err) 110 | t.Errorf("Got error on write: %v", err) 111 | return 112 | } 113 | stored_addrs[k] = addr 114 | addr = naddr 115 | } 116 | seg.Unlock() 117 | sleeptime := time.Duration(rand.Int() % 2000) 118 | time.Sleep(sleeptime * time.Millisecond) 119 | //Read back 120 | for k := 0; k < seglimit; k++ { 121 | rdata := make([]byte, 33000) 122 | rslice := sp.Read(stored_addrs[k], rdata) 123 | if len(rslice) != len(stored_data[k]) { 124 | log.Error("eb") 125 | t.Errorf("Got wrong slice len back") 126 | return 127 | } 128 | for j := 0; j < len(stored_data[k]); j++ { 129 | if rslice[j] != stored_data[k][j] { 130 | log.Error("ec") 131 | t.Errorf("Index %v differed got %v, expected %v", j, rslice[j], stored_data[k][j]) 132 | } 133 | } 134 | } 135 | wg.Done() 136 | }() 137 | } 138 | wg.Wait() 139 | } 140 | } 141 | 142 | func Test_FP_RW1(t *testing.T) { 143 | fp := makeFileProvider() 144 | x_RW1(t, fp) 145 | } 146 | 147 | func Test_FP_FUZZ(t *testing.T) { 148 | fp := makeFileProvider() 149 | x_RWFuzz(t, fp) 150 | } 151 | 152 | func Test_CP_RW1(t *testing.T) { 153 | cp := makeCephProvider() 154 | x_RW1(t, cp) 155 | } 156 | 157 | func Test_CP_FUZZ(t *testing.T) { 158 | cp := makeCephProvider() 159 | x_RWFuzz(t, cp) 160 | } 161 | -------------------------------------------------------------------------------- /btrdbd/config.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "strconv" 7 | 8 | gcfg "gopkg.in/gcfg.v1" 9 | ) 10 | 11 | type Config struct { 12 | Http struct { 13 | Port *int 14 | Address *string 15 | Enabled bool 16 | } 17 | Capnp struct { 18 | Port *int 
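		// Pointer-typed fields (Port, Address, and the others below) let
		// loadConfig distinguish "not present in btrdb.conf" (nil) from an
		// explicit zero value; the mandatory ones are nil-checked before the
		// daemon starts.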
19 | Address *string 20 | Enabled bool 21 | } 22 | Mongo struct { 23 | Server *string 24 | Collection *string 25 | } 26 | Storage struct { 27 | Provider string 28 | Filepath *string 29 | Cephconf *string 30 | Cephpool *string 31 | } 32 | Cache struct { 33 | BlockCache int 34 | RadosWriteCache *int 35 | RadosReadCache *int 36 | } 37 | Debug struct { 38 | Cpuprofile bool 39 | Heapprofile bool 40 | } 41 | Coalescence struct { 42 | Earlytrip *int 43 | Interval *int 44 | } 45 | } 46 | 47 | var Configuration Config 48 | var Params map[string]string 49 | 50 | func loadConfig() { 51 | found := false 52 | err := gcfg.ReadFileInto(&Configuration, "./btrdb.conf") 53 | if err != nil { 54 | fmt.Printf("Could not load configuration file './btrdb.conf':\n%v\n", err) 55 | } else { 56 | found = true 57 | } 58 | 59 | if !found { 60 | err := gcfg.ReadFileInto(&Configuration, "/etc/btrdb/btrdb.conf") 61 | if err != nil { 62 | fmt.Printf("Could not load configuration file '/etc/btrdb/btrdb.conf':\n%v\n", err) 63 | } else { 64 | found = true 65 | } 66 | } 67 | 68 | if !found { 69 | fmt.Printf("Aborting: no configuration found!\n") 70 | os.Exit(1) 71 | } 72 | 73 | if Configuration.Mongo.Server == nil || *Configuration.Mongo.Server == "" { 74 | fmt.Printf("Aborting: configuration missing MongoDB server address\n") 75 | os.Exit(1) 76 | } 77 | if Configuration.Mongo.Collection == nil || *Configuration.Mongo.Collection == "" { 78 | fmt.Printf("Aborting: configuration missing MongoDB collection\n") 79 | os.Exit(1) 80 | } 81 | 82 | if Configuration.Storage.Provider == "file" { 83 | if Configuration.Storage.Filepath == nil { 84 | fmt.Printf("Aborting: using Files for storage, but no filepath specified\n") 85 | os.Exit(1) 86 | } 87 | } else if Configuration.Storage.Provider == "ceph" { 88 | if Configuration.Storage.Cephconf == nil { 89 | fmt.Printf("Aborting: using Ceph for storage, but no cephconf specified\n") 90 | os.Exit(1) 91 | } 92 | if Configuration.Storage.Cephpool == nil { 93 | fmt.Printf("Aborting: using Ceph for storage, but no cephpool specified\n") 94 | os.Exit(1) 95 | } 96 | } else { 97 | fmt.Printf("Aborting: unknown storage provider specified\n") 98 | os.Exit(1) 99 | } 100 | 101 | if Configuration.Cache.RadosWriteCache == nil { 102 | z := 0 103 | Configuration.Cache.RadosWriteCache = &z 104 | } 105 | if Configuration.Cache.RadosReadCache == nil { 106 | z := 0 107 | Configuration.Cache.RadosReadCache = &z 108 | } 109 | 110 | if Configuration.Http.Enabled && Configuration.Http.Port == nil { 111 | fmt.Printf("Aborting: http server enabled, but no port specified\n") 112 | os.Exit(1) 113 | } 114 | 115 | if Configuration.Http.Enabled && Configuration.Http.Address == nil { 116 | fmt.Printf("Aborting: http server enabled, but no address specified\n") 117 | os.Exit(1) 118 | } 119 | 120 | if Configuration.Capnp.Enabled && Configuration.Capnp.Port == nil { 121 | fmt.Printf("Aborting: capn proto server enabled, but no port specified\n") 122 | os.Exit(1) 123 | } 124 | 125 | if Configuration.Capnp.Enabled && Configuration.Capnp.Address == nil { 126 | fmt.Printf("Aborting: capn proto server enabled, but no address specified\n") 127 | os.Exit(1) 128 | } 129 | 130 | if Configuration.Coalescence.Earlytrip == nil { 131 | fmt.Printf("Aborting: transaction coalescence early trip object count not set\n") 132 | os.Exit(1) 133 | } 134 | 135 | if Configuration.Coalescence.Interval == nil { 136 | fmt.Printf("Aborting: transaction coalescence commit interval not set\n") 137 | os.Exit(1) 138 | } 139 | 140 | Params = 
map[string]string{ 141 | "mongoserver": *Configuration.Mongo.Server, 142 | "provider": Configuration.Storage.Provider, 143 | "cachesize": strconv.FormatInt(int64(Configuration.Cache.BlockCache), 10), 144 | "collection": *Configuration.Mongo.Collection, 145 | } 146 | if Configuration.Storage.Provider == "ceph" { 147 | Params["cephconf"] = *Configuration.Storage.Cephconf 148 | Params["cephpool"] = *Configuration.Storage.Cephpool 149 | Params["cephrcache"] = strconv.FormatInt(int64(*Configuration.Cache.RadosReadCache), 10) 150 | Params["cephwcache"] = strconv.FormatInt(int64(*Configuration.Cache.RadosWriteCache), 10) 151 | } 152 | if Configuration.Storage.Provider == "file" { 153 | Params["dbpath"] = *Configuration.Storage.Filepath 154 | } 155 | 156 | fmt.Printf("Configuration OK!\n") 157 | } 158 | -------------------------------------------------------------------------------- /qci/test_readstat1.ipy: -------------------------------------------------------------------------------- 1 | 2 | import qdf 3 | import qdf.quasar 4 | import sys 5 | import random 6 | import uuid 7 | import time 8 | import numpy as np 9 | from twisted.internet import defer, protocol, reactor 10 | print "entered test readwrite1" 11 | EXIT_CODE = None 12 | def setexit(code): 13 | global EXIT_CODE 14 | EXIT_CODE = code 15 | reactor.stop() 16 | 17 | def statify(data, pw, starttime, endtime): 18 | rv = {} 19 | mask = ~((1< t { 107 | t = nt 108 | } 109 | } 110 | return rv 111 | } 112 | 113 | func MakeWTree() (*QTree, uuid.UUID) { 114 | id := uuid.NewRandom() 115 | mBS() 116 | tr, err := NewWriteQTree(_bs, id) 117 | if err != nil { 118 | log.Panic(err) 119 | } 120 | return tr, id 121 | } 122 | func CompareData(lhs []Record, rhs []Record) { 123 | if len(lhs) != len(rhs) { 124 | log.Panic("lhs != rhs len") 125 | } 126 | for i, v := range lhs { 127 | if rhs[i] != v { 128 | log.Panic("data differs") 129 | } 130 | } 131 | } 132 | func TestTreeSWriteLarge(t *testing.T) { 133 | mBS() 134 | testuuid := uuid.NewRandom() 135 | tr, err := NewWriteQTree(_bs, testuuid) 136 | log.Printf("Generated tree %v", testuuid.String()) 137 | if err != nil { 138 | t.Error(err) 139 | } 140 | log.Printf("Generating dummy records") 141 | records := GenData(0, 40*DAY, HOUR, 2*MINUTE, func(t int64) float64 { 142 | return float64(t) 143 | }) 144 | log.Printf("We generated %v records", len(records)) 145 | 146 | tr.InsertValues(records) 147 | tr.Commit() 148 | 149 | tr, err = NewReadQTree(_bs, testuuid, bstore.LatestGeneration) 150 | if err != nil { 151 | log.Panic(err) 152 | } 153 | rrec, err := tr.ReadStandardValuesBlock(0, 40*DAY+2*MINUTE) 154 | if err != nil { 155 | log.Panic(err) 156 | } 157 | log.Printf("We read %v records", len(rrec)) 158 | if len(rrec) != len(records) { 159 | t.FailNow() 160 | } 161 | for i := 0; i < len(rrec); i++ { 162 | if records[i].Time != rrec[i].Time || 163 | records[i].Val != rrec[i].Val { 164 | t.FailNow() 165 | } 166 | //log.Printf("[%5d] w=%v r=%v d=%v", i, records[i].Time, rrec[i].Time, 167 | // int64(records[i].Time- rrec[i].Time)) 168 | } 169 | 170 | } 171 | 172 | func BenchmarkMultiSWrite(b *testing.B) { 173 | mBS() 174 | testuuid := uuid.NewRandom() 175 | log.Printf("MultiSWrite is using %v", testuuid.String()) 176 | log.Printf("Generating dummy records") 177 | records := GenData(0, 1*DAY, SECOND, 100*MILLISECOND, func(t int64) float64 { 178 | return float64(t) 179 | }) 180 | log.Printf("We generated %v records, randomizing a copy", len(records)) 181 | rec_copy_orig := make([]Record, len(records)) 182 | perm := 
rand.Perm(len(records)) 183 | for i, v := range perm { 184 | rec_copy_orig[v] = records[i] 185 | } 186 | b.ResetTimer() 187 | for iter := 0; iter < b.N; iter++ { 188 | rec_copy := make([]Record, len(rec_copy_orig)) 189 | copy(rec_copy, rec_copy_orig) 190 | iperstage := 4000 191 | idx := 0 192 | for { 193 | tr, err := NewWriteQTree(_bs, testuuid) 194 | if err != nil { 195 | b.Error(err) 196 | } 197 | end := idx + iperstage 198 | if end > len(rec_copy) { 199 | end = len(rec_copy) 200 | } 201 | tr.InsertValues(rec_copy[idx:end]) 202 | tr.Commit() 203 | idx = end 204 | if idx == len(rec_copy) { 205 | break 206 | } 207 | } 208 | /* 209 | //Read back the records 210 | tr, err := NewReadQTree(_bs, testuuid, bstore.LatestGeneration) 211 | if err != nil { 212 | log.Panic(err) 213 | } 214 | rrec, err := tr.ReadStandardValuesBlock(0, 40*DAY+2*MINUTE) 215 | if err != nil { 216 | log.Panic(err) 217 | } 218 | */ 219 | } 220 | } 221 | func TestTreeMultiSWrite(t *testing.T) { 222 | mBS() 223 | testuuid := uuid.NewRandom() 224 | log.Printf("MultiSWrite is going into %v", testuuid.String()) 225 | log.Printf("Generating dummy records") 226 | records := GenData(0, 1*HOUR, 1*MINUTE, 2*SECOND, func(t int64) float64 { 227 | return float64(t) 228 | }) 229 | log.Printf("We generated %v records, randomizing a copy", len(records)) 230 | rec_copy := make([]Record, len(records)) 231 | perm := rand.Perm(len(records)) 232 | for i, v := range perm { 233 | rec_copy[v] = records[i] 234 | } 235 | iperstage := 30 236 | idx := 0 237 | for { 238 | tr, err := NewWriteQTree(_bs, testuuid) 239 | if err != nil { 240 | t.Error(err) 241 | } 242 | end := idx + iperstage 243 | if end > len(rec_copy) { 244 | end = len(rec_copy) 245 | } 246 | tr.InsertValues(rec_copy[idx:end]) 247 | tr.root.PrintCounts(2) 248 | tr.Commit() 249 | idx = end 250 | if idx == len(rec_copy) { 251 | break 252 | } 253 | } 254 | 255 | //Read back the records 256 | tr, err := NewReadQTree(_bs, testuuid, bstore.LatestGeneration) 257 | if err != nil { 258 | log.Panic(err) 259 | } 260 | rrec, err := tr.ReadStandardValuesBlock(0, 40*DAY+2*MINUTE) 261 | if err != nil { 262 | log.Panic(err) 263 | } 264 | //Verify we have the same number (for now) 265 | log.Printf("wrote %v, read %v", len(records), len(rrec)) 266 | tr.root.PrintCounts(0) 267 | if len(records) != len(rrec) { 268 | t.FailNow() 269 | } 270 | } 271 | -------------------------------------------------------------------------------- /cpinterface/interface.capnp: -------------------------------------------------------------------------------- 1 | using Go = import "go.capnp"; 2 | $Go.package("cpinterface"); 3 | $Go.import("github.com/SoftwareDefinedBuildings/btrdb/cpinterface"); 4 | 5 | @0x85360901bcc4bed2; 6 | 7 | ### 8 | # Request type, each request gives back exactly one response 9 | ### 10 | struct Request { 11 | # This will be added to the response, so that requests can be mapped 12 | # to responses as they can come back out of order. 13 | echoTag @0 : UInt64; 14 | union { 15 | void @1 : Void; 16 | queryStandardValues @2 : CmdQueryStandardValues; 17 | queryStatisticalValues @3 : CmdQueryStatisticalValues; 18 | queryWindowValues @9 : CmdQueryWindowValues; 19 | queryVersion @4 : CmdQueryVersion; 20 | queryNearestValue @5 : CmdQueryNearestValue; 21 | queryChangedRanges @6 : CmdQueryChangedRanges; 22 | insertValues @7 : CmdInsertValues; 23 | deleteValues @8 : CmdDeleteValues; 24 | } 25 | } 26 | 27 | # The basic record type. Times are measured in nanoseconds 28 | # since the Epoch. 
At the time of writing, BTrDB is only 29 | # capable of storing dates from approx 1935 to 2078... 30 | struct Record { 31 | time @0 : Int64; 32 | value @1 : Float64; 33 | } 34 | 35 | # Query pre-aggregated statistical records from the database. 36 | # these are particularly useful for plotting applications 37 | # and locating where data is. 38 | struct StatisticalRecord { 39 | time @0 : Int64; 40 | count @1 : UInt64; 41 | min @2 : Float64; 42 | mean @3 : Float64; 43 | max @4 : Float64; 44 | } 45 | 46 | # Query from startTime (inclusive) to endTime (exclusive) in 47 | # nanoseconds. 48 | # If you want consistent values over a series of 49 | # reads, or you wish to view a stream as it was in the past 50 | # then you can specify a nonzero version. Repeating a query 51 | # with the same version is guaranteed to return the same results 52 | # irrespective of any deletes or adds that take place. 53 | # returns many RecordLists 54 | struct CmdQueryStandardValues { 55 | uuid @0 : Data; 56 | version @1 : UInt64; 57 | startTime @2 : Int64; 58 | endTime @3 : Int64; 59 | } 60 | 61 | 62 | # Query from startTime (inclusive) to endTime (exclusive) in 63 | # nanoseconds. Note that both of those times will be rounded 64 | # down if they have set bits in the bottom pointWidth bits. 65 | # pointWidth is the log of the number of records to aggregate 66 | # per result. A PW of 30 therefore means (1<<30) ns per record 67 | # which is about a second. 68 | # If you want consistent values over a series of 69 | # reads, or you wish to view a stream as it was in the past 70 | # then you can specify a nonzero version 71 | # returns many StatisticalRecordLists 72 | struct CmdQueryStatisticalValues { 73 | uuid @0 : Data; 74 | version @1 : UInt64; 75 | startTime @2 : Int64; 76 | endTime @3 : Int64; 77 | pointWidth @4 : UInt8; 78 | } 79 | 80 | # Query from startTime (inclusive) to endTime (exclusive) in 81 | # nanoseconds. Aggregate windows with an end time less than or equal 82 | # to endTime will be returned. Windows start from exactly startTime and 83 | # increase by Width. Leap seconds etc are your problem. The depth 84 | # (currently unimplemented) represents the minimum PW to descend to 85 | # while computing windows. 86 | # If you want consistent values over a series of 87 | # reads, or you wish to view a stream as it was in the past 88 | # then you can specify a nonzero version 89 | # returns many StatisticalRecordLists 90 | struct CmdQueryWindowValues { 91 | uuid @0 : Data; 92 | version @1 : UInt64; 93 | startTime @2 : Int64; 94 | endTime @3 : Int64; 95 | width @4 : UInt64; 96 | depth @5 : UInt8; 97 | } 98 | 99 | # For every UUID given, return the current version and last 100 | # modified time of the stream. 101 | # returns VersionList 102 | struct CmdQueryVersion { 103 | uuids @0 : List(Data); 104 | } 105 | 106 | # Query the next (or previous if backward=true) value in the 107 | # stream, starting from time. 108 | # returns a RecordList 109 | struct CmdQueryNearestValue { 110 | uuid @0 : Data; 111 | version @1 : UInt64; 112 | time @2 : Int64; 113 | backward @3 : Bool; 114 | } 115 | 116 | # For the given UUID, return all the time ranges that have 117 | # changed between the given generations. toGeneration is 118 | # not included. Note that depending on how full the stream is, 119 | # the returned result may be rounded off. A sparsely populated 120 | # stream returns less accurate results than a densely populated 121 | # one. 
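# fromGeneration and toGeneration name the two stream versions being diffed;
# generation 0 is reserved and 1 effectively means "before the first write"
# (see QueryChangedRanges in quasar.go). resolution controls how far down the
# tree the search descends: higher values answer faster but return coarser
# ranges.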
122 | # returns many RangeLists 123 | struct CmdQueryChangedRanges { 124 | uuid @0 : Data; 125 | fromGeneration @1 : UInt64; 126 | toGeneration @2 : UInt64; 127 | unused @3 : UInt64; 128 | resolution @4 : UInt8; 129 | } 130 | 131 | # Insert values. If sync is true, the database will flush the 132 | # results to disk before returning success. Please PLEASE don't 133 | # use that without seriously considering if you need it, as it 134 | # disables transaction coalescence and reduces performance 135 | # by several orders of magnitude. 136 | # returns Void 137 | struct CmdInsertValues { 138 | uuid @0 : Data; 139 | values @1 : List(Record); 140 | sync @2 : Bool; 141 | } 142 | 143 | # Delete the values between the given times. 144 | # returns Void 145 | struct CmdDeleteValues { 146 | uuid @0 : Data; 147 | startTime @1 : Int64; 148 | endTime @2 : Int64; 149 | } 150 | 151 | ### 152 | # Response type 153 | ### 154 | struct Response { 155 | echoTag @0 : UInt64; 156 | statusCode @1 : StatusCode; 157 | final @2 : Bool; 158 | union { 159 | void @3 : Void; 160 | records @4 : Records; 161 | statisticalRecords @5 : StatisticalRecords; 162 | versionList @6 : Versions; 163 | changedRngList @7 : Ranges; 164 | } 165 | } 166 | 167 | # Contains all the error codes that are emitted by Quasar 168 | enum StatusCode { 169 | ok @0; 170 | 171 | # Returned (ATM) for almost everything 172 | internalError @1; 173 | 174 | # Returned for a bad UUID or a bad version 175 | noSuchStreamOrVersion @2; 176 | 177 | # Returned for a bad parameter, like time range 178 | invalidParameter @3; 179 | 180 | # Returned from nearest value when it doesn't exist 181 | noSuchPoint @4; 182 | } 183 | 184 | # Contains a list of records, and the version of the stream 185 | # used to satisfy the request. 186 | struct Records { 187 | version @0 : UInt64; 188 | values @1 : List(Record); 189 | } 190 | 191 | # Contains a list of statistical records and the version of 192 | # the stream used to satisfy the request. 
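# The count/min/mean/max fields correspond to the per-child aggregates
# (Count, Min, Mean, Max) that core blocks store internally; see OpReduce in
# qtree/operators.go.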
193 | struct StatisticalRecords { 194 | version @0 : UInt64; 195 | values @1 : List(StatisticalRecord); 196 | } 197 | 198 | # Contains the latest version numbers for the requested 199 | # streams 200 | struct Versions { 201 | uuids @0 : List(Data); 202 | versions @1 : List(UInt64); 203 | } 204 | 205 | # Represents a range of time that has been changed 206 | struct ChangedRange { 207 | startTime @0 : Int64; 208 | endTime @1 : Int64; 209 | } 210 | 211 | # Response to the QueryChangedRanges 212 | struct Ranges { 213 | version @0 : UInt64; 214 | values @1 : List(ChangedRange); 215 | } 216 | -------------------------------------------------------------------------------- /internal/cephprovider/cephprovider.c: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "cephprovider.h" 8 | #include 9 | #include 10 | 11 | #define ADDR_LOCK_SIZE 0x1000000000 12 | #define COMP_CAP_STEP 64 13 | #define OID_SIZE 43 //32 for uuid, 10 for id, 1 for nul 14 | 15 | rados_t cluster; 16 | char* pool; 17 | 18 | const char nibbles [] = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'}; 19 | 20 | void make_object_id(uint8_t *uuid, uint64_t address, char* dest) 21 | { 22 | int i; 23 | int dp; 24 | for (i=0;i<16;i++) 25 | { 26 | int nibble; 27 | dest[i*2] = nibbles[uuid[i]>>4]; 28 | dest[i*2+1] = nibbles[uuid[i]&0xF]; 29 | } 30 | for (i=0;i<10;i++) 31 | { 32 | dest[32+i] = nibbles[address >> (4*(9-i)) & 0xF]; 33 | } 34 | dest[OID_SIZE-1] = 0; 35 | } 36 | 37 | void initialize_provider(const char* conffile, const char* cephpool) 38 | { 39 | int err; 40 | err = rados_create(&cluster, NULL); 41 | if (err < 0) 42 | { 43 | fprintf(stderr, "could not create RADOS cluster handle\n"); 44 | errno = -err; 45 | return; 46 | } 47 | 48 | err = rados_conf_read_file(cluster, conffile); 49 | if (err < 0) 50 | { 51 | fprintf(stderr, "could not create load ceph conf\n"); 52 | errno = -err; 53 | return; 54 | } 55 | 56 | err = rados_connect(cluster); 57 | if (err < 0) 58 | { 59 | fprintf(stderr, "could not create connect to cluster\n"); 60 | errno = -err; 61 | return; 62 | } 63 | 64 | pool = (char*) malloc(strlen(cephpool)+1); 65 | strcpy(pool, cephpool); 66 | 67 | errno = 0; 68 | } 69 | 70 | cephprovider_handle_t* handle_create() 71 | { 72 | int err; 73 | cephprovider_handle_t *rv = (cephprovider_handle_t*) malloc(sizeof(cephprovider_handle_t)); 74 | rv->comps = (rados_completion_t*) malloc(sizeof(rados_completion_t) *COMP_CAP_STEP); 75 | rv->comp_cap = COMP_CAP_STEP; 76 | rv->comp_len = 0; 77 | 78 | err = rados_ioctx_create(cluster, pool, &rv->ctx); 79 | if (err < 0) 80 | { 81 | fprintf(stderr, "could not create io context\n"); 82 | errno = -err; 83 | rados_ioctx_destroy(rv->ctx); 84 | free(rv); 85 | return NULL; 86 | } 87 | errno = 0; 88 | return rv; 89 | } 90 | 91 | void handle_write(cephprovider_handle_t *h, uint8_t *uuid, uint64_t address, const char *data, int len, int trunc) 92 | { 93 | //The ceph provider uses 24 bits of address per object, and the top 40 bits as an object ID 94 | int offset = address & 0xFFFFFF; 95 | uint64_t id = address >> 24; 96 | int err; 97 | char oid [OID_SIZE]; 98 | make_object_id(uuid, id, &oid[0]); 99 | if (trunc) 100 | { 101 | err = rados_trunc(h->ctx, oid, len + offset); 102 | if (err < 0) 103 | { 104 | fprintf(stderr, "could not trunc\n"); 105 | errno = -err; 106 | return; 107 | } 108 | } 109 | //Check we have a completion we can use 110 | if (h->comp_len == h->comp_cap) 111 | { 112 | 
h->comp_cap += COMP_CAP_STEP; 113 | h->comps = realloc(h->comps, (h->comp_cap * sizeof(rados_completion_t))); 114 | if (!h->comps) 115 | { 116 | return; 117 | } 118 | } 119 | err = rados_aio_create_completion(NULL, NULL, NULL, &(h->comps[h->comp_len])); 120 | if (err < 0) 121 | { 122 | fprintf(stderr, "could not create completion\n"); 123 | errno = -err; 124 | return; 125 | } 126 | err = rados_aio_write(h->ctx, oid, h->comps[h->comp_len], data, len, offset); 127 | if (err < 0) 128 | { 129 | fprintf(stderr, "could not aio write\n"); 130 | errno = -err; 131 | return; 132 | } 133 | h->comp_len++; 134 | errno = 0; 135 | } 136 | 137 | int handle_read(cephprovider_handle_t *h, uint8_t *uuid, uint64_t address, char* dest, int len) 138 | { 139 | //The ceph provider uses 24 bits of address per object, and the top 40 bits as an object ID 140 | int offset = address & 0xFFFFFF; 141 | uint64_t id = address >> 24; 142 | int rv; 143 | char oid [OID_SIZE]; 144 | make_object_id(uuid, id, &oid[0]); 145 | rv = rados_read(h->ctx, oid, dest, len, offset); 146 | if (rv < 0) 147 | { 148 | fprintf(stderr, "could not read %s\n", oid); 149 | errno = -rv; 150 | return -1; 151 | } 152 | errno = 0; 153 | return rv; 154 | } 155 | 156 | void handle_init_allocator(cephprovider_handle_t *h) 157 | { 158 | int err; 159 | struct timeval dur; 160 | dur.tv_sec = 5; 161 | dur.tv_usec = 0; 162 | uint64_t addr; 163 | if (h->comp_len == h->comp_cap) 164 | { 165 | h->comp_cap += COMP_CAP_STEP; 166 | h->comps = realloc(h->comps, (h->comp_cap * sizeof(rados_completion_t))); 167 | if (!h->comps) 168 | { 169 | errno = -err; 170 | return; 171 | } 172 | } 173 | err = rados_aio_create_completion(NULL, NULL, NULL, &(h->comps[h->comp_len])); 174 | if (err < 0) 175 | { 176 | fprintf(stderr, "could not create completion\n"); 177 | errno = -err; 178 | return; 179 | } 180 | 181 | err = rados_lock_exclusive(h->ctx, "allocator", "alloc_lock", "main", "alloc", &dur, 0); 182 | if (err < 0) { 183 | fprintf(stderr, "could not lock allocator\n"); 184 | errno = -err; 185 | return; 186 | } 187 | addr = 0x1000000; //Not zero!! 
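	/* The starting value is nonzero (see the "Not zero!!" note) presumably
	 * because a zero block address means "no block" elsewhere in the store:
	 * an empty superblock root and unused core-block child slots are both
	 * encoded as address 0, so the allocator must never hand out zero. */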
188 | err = rados_aio_write_full(h->ctx, "allocator", h->comps[h->comp_len], (char *) &addr, 8); 189 | if (err < 0) { 190 | fprintf(stderr, "could not write allocator\n"); 191 | errno = -err; 192 | return; 193 | } 194 | rados_aio_wait_for_safe(h->comps[h->comp_len]); 195 | err = rados_unlock(h->ctx, "allocator", "alloc_lock", "main"); 196 | if (err < 0) { 197 | fprintf(stderr, "could not unlock allocator\n"); 198 | errno = -err; 199 | return; 200 | } 201 | rados_aio_release(h->comps[h->comp_len]); 202 | errno = 0; 203 | } 204 | 205 | //Returns the address of the start of a range that can be 206 | //used 207 | uint64_t handle_obtainrange(cephprovider_handle_t *h) 208 | { 209 | int err; 210 | int rv; 211 | int then; 212 | struct timeval dur; 213 | dur.tv_sec = 60; 214 | dur.tv_usec = 0; 215 | uint64_t addr; 216 | if (h->comp_len == h->comp_cap) 217 | { 218 | h->comp_cap += COMP_CAP_STEP; 219 | h->comps = realloc(h->comps, (h->comp_cap * sizeof(rados_completion_t))); 220 | if (!h->comps) 221 | { 222 | errno = -err; 223 | return 0; 224 | } 225 | } 226 | err = rados_aio_create_completion(NULL, NULL, NULL, &(h->comps[h->comp_len])); 227 | if (err < 0) 228 | { 229 | fprintf(stderr, "could not create completion\n"); 230 | errno = -err; 231 | return 0; 232 | } 233 | then = (int) time(); 234 | while((int)time() - then < 60) 235 | { 236 | err = rados_lock_exclusive(h->ctx, "allocator", "alloc_lock", "main", "alloc", &dur, 0); 237 | if (err == 0) { 238 | break; 239 | } 240 | } 241 | if (err < 0) { 242 | fprintf(stderr, "could not lock allocator\n"); 243 | errno = -err; 244 | return 0; 245 | } 246 | rv = rados_read(h->ctx, "allocator", (char *) &addr, 8, 0); 247 | if (rv < 0 || rv != 8) { 248 | fprintf(stderr, "could not read allocator\n"); 249 | errno = -err; 250 | return 0; 251 | } 252 | printf("read allocation 0x%016" PRIx64 "\n",addr); 253 | addr += ADDR_LOCK_SIZE; 254 | printf("writing allocation 0x%016" PRIx64 "\n",addr); 255 | err = rados_aio_write_full(h->ctx, "allocator", h->comps[h->comp_len], (char *) &addr, 8); 256 | if (err < 0) { 257 | fprintf(stderr, "could not write allocator\n"); 258 | errno = -err; 259 | return 0; 260 | } 261 | rados_aio_wait_for_safe(h->comps[h->comp_len]); 262 | err = rados_unlock(h->ctx, "allocator", "alloc_lock", "main"); 263 | if (err < 0) { 264 | fprintf(stderr, "could not unlock allocator\n"); 265 | errno = -err; 266 | return 0; 267 | } 268 | rados_aio_release(h->comps[h->comp_len]); 269 | errno = 0; 270 | printf("Returning %016" PRIx64 "\n", addr - ADDR_LOCK_SIZE); 271 | return addr - ADDR_LOCK_SIZE; 272 | } 273 | 274 | void handle_close(cephprovider_handle_t *h) 275 | { 276 | int i; 277 | for (i=0; i < h->comp_len; i++) 278 | { 279 | rados_aio_wait_for_complete(h->comps[i]); 280 | rados_aio_release(h->comps[i]); 281 | } 282 | free(h->comps); 283 | rados_ioctx_destroy(h->ctx); 284 | free(h); 285 | 286 | errno = 0; 287 | } 288 | -------------------------------------------------------------------------------- /internal/fileprovider/fileprovider.go: -------------------------------------------------------------------------------- 1 | package fileprovider 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "os" 7 | "sync" 8 | 9 | "github.com/SoftwareDefinedBuildings/btrdb/internal/bprovider" 10 | "github.com/op/go-logging" 11 | ) 12 | 13 | var log *logging.Logger 14 | 15 | func init() { 16 | log = logging.MustGetLogger("log") 17 | } 18 | 19 | const NUMFILES = 256 20 | 21 | type writeparams struct { 22 | Address uint64 23 | Data []byte 24 | } 25 | 26 | type FileProviderSegment 
struct { 27 | sp *FileStorageProvider 28 | fidx int 29 | f *os.File 30 | base int64 31 | ptr int64 32 | wchan chan writeparams 33 | wg sync.WaitGroup 34 | } 35 | 36 | type FileStorageProvider struct { 37 | fidx chan int 38 | retfidx chan int 39 | dbf []*os.File 40 | dbrf []*os.File 41 | dbrf_mtx []sync.Mutex 42 | favail []bool 43 | } 44 | 45 | func (seg *FileProviderSegment) writer() { 46 | 47 | for args := range seg.wchan { 48 | off := int64(args.Address & ((1 << 50) - 1)) 49 | lenarr := make([]byte, 2) 50 | lenarr[0] = byte(len(args.Data)) 51 | lenarr[1] = byte(len(args.Data) >> 8) 52 | _, err := seg.f.WriteAt(lenarr, off) 53 | if err != nil { 54 | log.Panic("File writing error %v", err) 55 | } 56 | _, err = seg.f.WriteAt(args.Data, off+2) 57 | if err != nil { 58 | log.Panic("File writing error %v", err) 59 | } 60 | } 61 | seg.wg.Done() 62 | } 63 | func (seg *FileProviderSegment) init() { 64 | seg.wchan = make(chan writeparams, 16) 65 | seg.wg.Add(1) 66 | go seg.writer() 67 | } 68 | 69 | //Returns the address of the first free word in the segment when it was locked 70 | func (seg *FileProviderSegment) BaseAddress() uint64 { 71 | //This seems arbitrary, why not go with the top 8 bits? The reason is this: 72 | //a) this still leaves 1PB per file 73 | //b) The huffman encoding can do 58 bits in 8 bytes, but anything more is 9 74 | //c) if we later decide to more than 256 files, we can 75 | return (uint64(seg.fidx) << 50) + uint64(seg.base) 76 | } 77 | 78 | //Unlocks the segment for the StorageProvider to give to other consumers 79 | //Implies a flush 80 | func (seg *FileProviderSegment) Unlock() { 81 | seg.Flush() 82 | seg.sp.retfidx <- seg.fidx 83 | } 84 | 85 | //Writes a slice to the segment, returns immediately 86 | //Returns nil if op is OK, otherwise ErrNoSpace or ErrInvalidArgument 87 | //It is up to the implementer to work out how to report no space immediately 88 | //The uint64 rv is the address to be used for the next write 89 | func (seg *FileProviderSegment) Write(uuid []byte, address uint64, data []byte) (uint64, error) { 90 | //TODO remove 91 | if seg.ptr != int64(address&((1<<50)-1)) { 92 | log.Panic("Pointer does not match address %x vs %x", seg.ptr, int64(address&((1<<50)-1))) 93 | } 94 | wp := writeparams{Address: address, Data: data} 95 | seg.wchan <- wp 96 | seg.ptr = int64(address&((1<<50)-1)) + int64(len(data)) + 2 97 | return uint64(seg.ptr) + (uint64(seg.fidx) << 50), nil 98 | } 99 | 100 | //Block until all writes are complete, not 101 | func (seg *FileProviderSegment) Flush() { 102 | close(seg.wchan) 103 | seg.wg.Wait() 104 | } 105 | 106 | //Provide file indices into fidx, does not return 107 | func (sp *FileStorageProvider) provideFiles() { 108 | for { 109 | //Read all returned files 110 | ldretfi: 111 | for { 112 | select { 113 | case fi := <-sp.retfidx: 114 | sp.favail[fi] = true 115 | default: 116 | break ldretfi 117 | } 118 | } 119 | 120 | //Greedily select file 121 | minidx := -1 122 | var minv int64 = 0 123 | for i := 0; i < NUMFILES; i++ { 124 | if !sp.favail[i] { 125 | continue 126 | } 127 | off, err := sp.dbf[i].Seek(0, os.SEEK_CUR) 128 | if err != nil { 129 | log.Panic(err) 130 | } 131 | if minidx == -1 || off < minv { 132 | minidx = i 133 | minv = off 134 | } 135 | } 136 | 137 | //Return it, or do blocking read if not found 138 | if minidx != -1 { 139 | sp.favail[minidx] = false 140 | sp.fidx <- minidx 141 | } else { 142 | //Do a blocking read on retfidx to avoid fast spin on nonblocking 143 | fi := <-sp.retfidx 144 | sp.favail[fi] = true 145 | } 146 | 
147 | } 148 | } 149 | 150 | //Called at startup 151 | func (sp *FileStorageProvider) Initialize(opts map[string]string) { 152 | //Initialize file indices thingy 153 | sp.fidx = make(chan int) 154 | sp.retfidx = make(chan int, NUMFILES+1) 155 | sp.dbf = make([]*os.File, NUMFILES) 156 | sp.dbrf = make([]*os.File, NUMFILES) 157 | sp.dbrf_mtx = make([]sync.Mutex, NUMFILES) 158 | sp.favail = make([]bool, NUMFILES) 159 | for i := 0; i < NUMFILES; i++ { 160 | //Open file 161 | dbpath, ok := opts["dbpath"] 162 | if !ok { 163 | log.Panic("Expected dbpath") 164 | } 165 | fname := fmt.Sprintf("%s/blockstore.%02x.db", dbpath, i) 166 | //write file descriptor 167 | { 168 | f, err := os.OpenFile(fname, os.O_RDWR, 0666) 169 | if err != nil && os.IsNotExist(err) { 170 | log.Critical("Aborting: seems database does not exist. Have you run `btrdbd -makedb`?") 171 | os.Exit(1) 172 | } 173 | if err != nil { 174 | log.Panicf("Problem with blockstore DB: ", err) 175 | } 176 | sp.dbf[i] = f 177 | } 178 | //Read file descriptor 179 | { 180 | f, err := os.OpenFile(fname, os.O_RDONLY, 0666) 181 | if err != nil { 182 | log.Panicf("Problem with blockstore DB: ", err) 183 | } 184 | sp.dbrf[i] = f 185 | } 186 | sp.favail[i] = true 187 | } 188 | go sp.provideFiles() 189 | 190 | } 191 | 192 | // Lock a segment, or block until a segment can be locked 193 | // Returns a Segment struct 194 | func (sp *FileStorageProvider) LockSegment(uuid []byte) bprovider.Segment { 195 | //Grab a file index 196 | fidx := <-sp.fidx 197 | f := sp.dbf[fidx] 198 | l, err := f.Seek(0, os.SEEK_END) 199 | if err != nil { 200 | log.Panicf("Error on lock segment: %v", err) 201 | } 202 | 203 | //Construct segment 204 | seg := &FileProviderSegment{sp: sp, fidx: fidx, f: sp.dbf[fidx], base: l, ptr: l} 205 | seg.init() 206 | 207 | return seg 208 | } 209 | 210 | //This is the size of a maximal size cblock + header 211 | const FIRSTREAD = 3459 212 | 213 | func (sp *FileStorageProvider) Read(uuid []byte, address uint64, buffer []byte) []byte { 214 | fidx := address >> 50 215 | off := int64(address & ((1 << 50) - 1)) 216 | if fidx > NUMFILES { 217 | log.Panic("Encoded file idx too large") 218 | } 219 | sp.dbrf_mtx[fidx].Lock() 220 | nread, err := sp.dbrf[fidx].ReadAt(buffer[:FIRSTREAD], off) 221 | if err != nil && err != io.EOF { 222 | log.Panic("Non EOF read error: %v", err) 223 | } 224 | if nread < 2 { 225 | log.Panic("Unexpected (very) short read") 226 | } 227 | //Now we read the blob size 228 | bsize := int(buffer[0]) + (int(buffer[1]) << 8) 229 | if bsize > nread-2 { 230 | _, err := sp.dbrf[fidx].ReadAt(buffer[nread:bsize+2], off+int64(nread)) 231 | if err != nil { 232 | log.Panic("Read error: %v", err) 233 | } 234 | } 235 | sp.dbrf_mtx[fidx].Unlock() 236 | return buffer[2 : bsize+2] 237 | } 238 | 239 | //Called to create the database for the first time 240 | func (sp *FileStorageProvider) CreateDatabase(opts map[string]string) error { 241 | for i := 0; i < NUMFILES; i++ { 242 | //Open file 243 | dbpath, ok := opts["dbpath"] 244 | if !ok { 245 | log.Panicf("Expected dbpath") 246 | } 247 | fname := fmt.Sprintf("%s/blockstore.%02x.db", dbpath, i) 248 | //write file descriptor 249 | { 250 | f, err := os.OpenFile(fname, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666) 251 | if err != nil && !os.IsExist(err) { 252 | log.Panicf("Problem with blockstore DB: ", err) 253 | } else if os.IsExist(err) { 254 | return bprovider.ErrExists 255 | } 256 | //Add a file tag 257 | //An exercise left for the reader: if you remove this, everything breaks :-) 258 | //Hint: what is the 
physical address of the first byte of file zero? 259 | _, err = f.Write([]byte("QUASARDB")) 260 | if err != nil { 261 | log.Panicf("Could not write to blockstore:", err) 262 | } 263 | 264 | err = f.Close() 265 | if err != nil { 266 | log.Panicf("Error on close %v", err) 267 | } 268 | } 269 | } 270 | return nil 271 | } 272 | -------------------------------------------------------------------------------- /quasar.go: -------------------------------------------------------------------------------- 1 | package btrdb 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | "time" 7 | 8 | "github.com/pborman/uuid" 9 | "github.com/SoftwareDefinedBuildings/btrdb/internal/bstore" 10 | "github.com/SoftwareDefinedBuildings/btrdb/qtree" 11 | "github.com/op/go-logging" 12 | ) 13 | 14 | var log *logging.Logger 15 | 16 | func init() { 17 | log = logging.MustGetLogger("log") 18 | } 19 | 20 | type openTree struct { 21 | store []qtree.Record 22 | id uuid.UUID 23 | sigEC chan bool 24 | } 25 | 26 | const MinimumTime = -(16 << 56) 27 | const MaximumTime = (48 << 56) 28 | const LatestGeneration = bstore.LatestGeneration 29 | 30 | type Quasar struct { 31 | cfg QuasarConfig 32 | bs *bstore.BlockStore 33 | 34 | //Transaction coalescence 35 | globlock sync.Mutex 36 | treelocks map[[16]byte]*sync.Mutex 37 | openTrees map[[16]byte]*openTree 38 | } 39 | 40 | func newOpenTree(id uuid.UUID) *openTree { 41 | return &openTree{ 42 | id: id, 43 | } 44 | } 45 | 46 | type QuasarConfig struct { 47 | //Measured in the number of datablocks 48 | //So 1000 is 8 MB cache 49 | DatablockCacheSize uint64 50 | 51 | //This enables the grouping of value inserts 52 | //with a commit every Interval millis 53 | //If the number of stored values exceeds 54 | //EarlyTrip 55 | TransactionCoalesceEnable bool 56 | TransactionCoalesceInterval uint64 57 | TransactionCoalesceEarlyTrip uint64 58 | 59 | Params map[string]string 60 | } 61 | 62 | // Return true if there are uncommited results to be written to disk 63 | // Should only be used during shutdown as it hogs the glock 64 | func (q *Quasar) IsPending() bool { 65 | isPend := false 66 | q.globlock.Lock() 67 | for uuid, ot := range q.openTrees { 68 | q.treelocks[uuid].Lock() 69 | if len(ot.store) != 0 { 70 | isPend = true 71 | q.treelocks[uuid].Unlock() 72 | break 73 | } 74 | q.treelocks[uuid].Unlock() 75 | } 76 | q.globlock.Unlock() 77 | return isPend 78 | } 79 | 80 | func NewQuasar(cfg *QuasarConfig) (*Quasar, error) { 81 | bs, err := bstore.NewBlockStore(cfg.Params) 82 | if err != nil { 83 | return nil, err 84 | } 85 | rv := &Quasar{ 86 | cfg: *cfg, 87 | bs: bs, 88 | openTrees: make(map[[16]byte]*openTree, 128), 89 | treelocks: make(map[[16]byte]*sync.Mutex, 128), 90 | } 91 | return rv, nil 92 | } 93 | 94 | func (q *Quasar) getTree(id uuid.UUID) (*openTree, *sync.Mutex) { 95 | mk := bstore.UUIDToMapKey(id) 96 | q.globlock.Lock() 97 | ot, ok := q.openTrees[mk] 98 | if !ok { 99 | ot := newOpenTree(id) 100 | mtx := &sync.Mutex{} 101 | q.openTrees[mk] = ot 102 | q.treelocks[mk] = mtx 103 | q.globlock.Unlock() 104 | return ot, mtx 105 | } 106 | mtx, ok := q.treelocks[mk] 107 | if !ok { 108 | log.Panicf("This should not happen") 109 | } 110 | q.globlock.Unlock() 111 | return ot, mtx 112 | } 113 | 114 | func (t *openTree) commit(q *Quasar) { 115 | if len(t.store) == 0 { 116 | //This might happen with a race in the timeout commit 117 | fmt.Println("no store in commit") 118 | return 119 | } 120 | tr, err := qtree.NewWriteQTree(q.bs, t.id) 121 | if err != nil { 122 | log.Panic(err) 123 | } 124 | if err := 
tr.InsertValues(t.store); err != nil { 125 | log.Error("BAD INSERT: ", err) 126 | } 127 | tr.Commit() 128 | t.store = nil 129 | } 130 | func (q *Quasar) InsertValues(id uuid.UUID, r []qtree.Record) { 131 | defer func() { 132 | if r := recover(); r != nil { 133 | log.Error("BAD INSERT: ", r) 134 | } 135 | }() 136 | tr, mtx := q.getTree(id) 137 | mtx.Lock() 138 | if tr == nil { 139 | log.Panicf("This should not happen") 140 | } 141 | if tr.store == nil { 142 | //Empty store 143 | tr.store = make([]qtree.Record, 0, len(r)*2) 144 | tr.sigEC = make(chan bool, 1) 145 | //Also spawn the coalesce timeout goroutine 146 | go func(abrt chan bool) { 147 | tmt := time.After(time.Duration(q.cfg.TransactionCoalesceInterval) * time.Millisecond) 148 | select { 149 | case <-tmt: 150 | //do coalesce 151 | mtx.Lock() 152 | //In case we early tripped between waiting for lock and getting it, commit will return ok 153 | //log.Debug("Coalesce timeout %v", id.String()) 154 | tr.commit(q) 155 | mtx.Unlock() 156 | case <-abrt: 157 | return 158 | } 159 | }(tr.sigEC) 160 | } 161 | tr.store = append(tr.store, r...) 162 | if uint64(len(tr.store)) >= q.cfg.TransactionCoalesceEarlyTrip { 163 | tr.sigEC <- true 164 | log.Debug("Coalesce early trip %v", id.String()) 165 | tr.commit(q) 166 | } 167 | mtx.Unlock() 168 | } 169 | func (q *Quasar) Flush(id uuid.UUID) error { 170 | tr, mtx := q.getTree(id) 171 | mtx.Lock() 172 | if len(tr.store) != 0 { 173 | tr.sigEC <- true 174 | tr.commit(q) 175 | fmt.Printf("Commit done %+v\n", id) 176 | } else { 177 | fmt.Printf("no store\n") 178 | } 179 | mtx.Unlock() 180 | return nil 181 | } 182 | 183 | //These functions are the API. TODO add all the bounds checking on PW, and sanity on start/end 184 | func (q *Quasar) QueryValues(id uuid.UUID, start int64, end int64, gen uint64) ([]qtree.Record, uint64, error) { 185 | tr, err := qtree.NewReadQTree(q.bs, id, gen) 186 | if err != nil { 187 | return nil, 0, err 188 | } 189 | rv, err := tr.ReadStandardValuesBlock(start, end) 190 | return rv, tr.Generation(), err 191 | } 192 | 193 | func (q *Quasar) QueryValuesStream(id uuid.UUID, start int64, end int64, gen uint64) (chan qtree.Record, chan error, uint64) { 194 | tr, err := qtree.NewReadQTree(q.bs, id, gen) 195 | if err != nil { 196 | return nil, nil, 0 197 | } 198 | recordc := make(chan qtree.Record) 199 | errc := make(chan error) 200 | go tr.ReadStandardValuesCI(recordc, errc, start, end) 201 | return recordc, errc, tr.Generation() 202 | } 203 | 204 | func (q *Quasar) QueryStatisticalValues(id uuid.UUID, start int64, end int64, 205 | gen uint64, pointwidth uint8) ([]qtree.StatRecord, uint64, error) { 206 | //fmt.Printf("QSV0 s=%v e=%v pw=%v\n", start, end, pointwidth) 207 | start &^= ((1 << pointwidth) - 1) 208 | end &^= ((1 << pointwidth) - 1) 209 | end -= 1 210 | tr, err := qtree.NewReadQTree(q.bs, id, gen) 211 | if err != nil { 212 | return nil, 0, err 213 | } 214 | rv, err := tr.QueryStatisticalValuesBlock(start, end, pointwidth) 215 | if err != nil { 216 | return nil, 0, err 217 | } 218 | return rv, tr.Generation(), nil 219 | } 220 | func (q *Quasar) QueryStatisticalValuesStream(id uuid.UUID, start int64, end int64, 221 | gen uint64, pointwidth uint8) (chan qtree.StatRecord, chan error, uint64) { 222 | fmt.Printf("QSV1 s=%v e=%v pw=%v\n", start, end, pointwidth) 223 | start &^= ((1 << pointwidth) - 1) 224 | end &^= ((1 << pointwidth) - 1) 225 | end -= 1 226 | rvv := make(chan qtree.StatRecord, 1024) 227 | rve := make(chan error) 228 | tr, err := qtree.NewReadQTree(q.bs, id, gen) 229 | if 
err != nil { 230 | return nil, nil, 0 231 | } 232 | go tr.QueryStatisticalValues(rvv, rve, start, end, pointwidth) 233 | return rvv, rve, tr.Generation() 234 | } 235 | 236 | func (q *Quasar) QueryWindow(id uuid.UUID, start int64, end int64, 237 | gen uint64, width uint64, depth uint8) (chan qtree.StatRecord, uint64) { 238 | rvv := make(chan qtree.StatRecord, 1024) 239 | tr, err := qtree.NewReadQTree(q.bs, id, gen) 240 | if err != nil { 241 | return nil, 0 242 | } 243 | go tr.QueryWindow(start, end, width, depth, rvv) 244 | return rvv, tr.Generation() 245 | } 246 | 247 | func (q *Quasar) QueryGeneration(id uuid.UUID) (uint64, error) { 248 | sb := q.bs.LoadSuperblock(id, bstore.LatestGeneration) 249 | if sb == nil { 250 | return 0, qtree.ErrNoSuchStream 251 | } 252 | return sb.Gen(), nil 253 | } 254 | 255 | func (q *Quasar) QueryNearestValue(id uuid.UUID, time int64, backwards bool, gen uint64) (qtree.Record, uint64, error) { 256 | tr, err := qtree.NewReadQTree(q.bs, id, gen) 257 | if err != nil { 258 | return qtree.Record{}, 0, err 259 | } 260 | rv, err := tr.FindNearestValue(time, backwards) 261 | return rv, tr.Generation(), err 262 | } 263 | 264 | type ChangedRange struct { 265 | Start int64 266 | End int64 267 | } 268 | 269 | //Resolution is how far down the tree to go when working out which blocks have changed. Higher resolutions are faster 270 | //but will give you back coarser results. 271 | func (q *Quasar) QueryChangedRanges(id uuid.UUID, startgen uint64, endgen uint64, resolution uint8) ([]ChangedRange, uint64, error) { 272 | //0 is a reserved generation, so is 1, which means "before first" 273 | if startgen == 0 { 274 | startgen = 1 275 | } 276 | tr, err := qtree.NewReadQTree(q.bs, id, endgen) 277 | if err != nil { 278 | log.Debug("Error on QCR open tree") 279 | return nil, 0, err 280 | } 281 | rv := make([]ChangedRange, 0, 1024) 282 | rch := tr.FindChangedSince(startgen, resolution) 283 | var lr *ChangedRange = nil 284 | for { 285 | 286 | select { 287 | case cr, ok := <-rch: 288 | if !ok { 289 | //This is the end. 290 | //Do we have an unsaved LR? 
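				//lr holds the most recently coalesced range (adjacent ranges are
				//merged further down when cr.Start == lr.End), so it still needs
				//to be flushed into rv before returning.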
291 | if lr != nil { 292 | rv = append(rv, *lr) 293 | } 294 | return rv, tr.Generation(), nil 295 | } 296 | if !cr.Valid { 297 | log.Panicf("Didn't think this could happen") 298 | } 299 | //Coalesce 300 | if lr != nil && cr.Start == lr.End { 301 | lr.End = cr.End 302 | } else { 303 | if lr != nil { 304 | rv = append(rv, *lr) 305 | } 306 | lr = &ChangedRange{Start: cr.Start, End: cr.End} 307 | } 308 | } 309 | } 310 | return rv, tr.Generation(), nil 311 | } 312 | 313 | func (q *Quasar) DeleteRange(id uuid.UUID, start int64, end int64) error { 314 | tr, mtx := q.getTree(id) 315 | mtx.Lock() 316 | if len(tr.store) != 0 { 317 | tr.sigEC <- true 318 | tr.commit(q) 319 | } 320 | wtr, err := qtree.NewWriteQTree(q.bs, id) 321 | if err != nil { 322 | log.Panic(err) 323 | } 324 | err = wtr.DeleteRange(start, end) 325 | if err != nil { 326 | log.Panic(err) 327 | } 328 | wtr.Commit() 329 | mtx.Unlock() 330 | return nil 331 | } 332 | -------------------------------------------------------------------------------- /internal/bstore/bstore_test.go: -------------------------------------------------------------------------------- 1 | package bstore 2 | 3 | import ( 4 | "github.com/pborman/uuid" 5 | "math/rand" 6 | "reflect" 7 | "strings" 8 | "testing" 9 | "time" 10 | ) 11 | 12 | func mUint64() uint64 { 13 | return uint64(rand.Uint32()) 14 | //return (uint64(rand.Uint32()) << 32) + uint64(rand.Uint32()) 15 | } 16 | func mInt64() int64 { 17 | return int64(mUint64()) 18 | } 19 | func mFloat64() float64 { 20 | return rand.Float64() 21 | } 22 | 23 | /** 24 | * Randomly populate the fields of a struct 25 | */ 26 | func FillBlock(rv interface{}) { 27 | rand.Seed(time.Now().UnixNano()) 28 | t := reflect.ValueOf(rv) 29 | for i := 0; i < t.Elem().NumField(); i++ { 30 | fld := t.Elem().Field(i) 31 | switch fld.Type().Kind() { 32 | case reflect.Array: 33 | for k := 0; k < fld.Len(); k++ { 34 | if fld.Type().Elem().Kind() == reflect.Float64 { 35 | fld.Index(k).SetFloat(mFloat64()) 36 | } else if fld.Type().Elem().Kind() == reflect.Uint64 { 37 | fld.Index(k).SetUint(mUint64()) 38 | } else if fld.Type().Elem().Kind() == reflect.Int64 { 39 | fld.Index(k).SetInt(mInt64()) 40 | } else if fld.Type().Elem().Kind() == reflect.Uint8 { 41 | fld.Index(k).SetUint(mUint64()) 42 | } else { 43 | log.Panic("Unhandled element type: %v", fld.Type().Elem().Kind()) 44 | } 45 | } 46 | case reflect.Uint64: 47 | fld.SetUint(mUint64()) 48 | case reflect.Uint8: 49 | fld.SetUint(mUint64() & 0xFF) 50 | case reflect.Uint16: 51 | fld.SetUint(mUint64() & 0xFFFF) 52 | case reflect.Int64: 53 | fld.SetInt(mInt64()) 54 | case reflect.Int: 55 | fld.SetInt(mInt64()) 56 | default: 57 | log.Panicf("Unrecognized type: %+v", fld.Type().Kind()) 58 | } 59 | } 60 | } 61 | 62 | func MakeAllocatedCoreblock() *Coreblock { 63 | mBS() 64 | db, err := _gen.AllocateCoreblock() 65 | if err != nil { 66 | log.Panic(err) 67 | } 68 | addr := db.Identifier 69 | FillBlock(db) 70 | db.Identifier = addr 71 | return db 72 | } 73 | 74 | func MakeAllocatedVBlock() *Vectorblock { 75 | mBS() 76 | v, err := _gen.AllocateVectorblock() 77 | if err != nil { 78 | log.Panic(err) 79 | } 80 | addr := v.Identifier 81 | FillBlock(v) 82 | v.Len = VSIZE 83 | v.Identifier = addr 84 | return v 85 | } 86 | 87 | func MakeCoreblock() *Coreblock { 88 | db := new(Coreblock) 89 | FillBlock(db) 90 | for i := 0; i < KFACTOR; i++ { 91 | //These have special meaning, so don't test it here 92 | if db.Addr[i] == 0 { 93 | db.Addr[i] = 1 94 | } 95 | } 96 | return db 97 | } 98 | 99 | func MakeVBlock() 
*Vectorblock { 100 | v := new(Vectorblock) 101 | FillBlock(v) 102 | v.Len = VSIZE 103 | return v 104 | } 105 | 106 | /** 107 | * This should work with any object that uses the struct tags to 108 | * mean fields that don't need to match after SERDES 109 | */ 110 | func CompareNoTags(lhs interface{}, rhs interface{}, tags []string) bool { 111 | chk := make(map[string]bool) 112 | for _, s := range tags { 113 | chk[s] = true 114 | } 115 | vlhs := reflect.ValueOf(lhs) 116 | vrhs := reflect.ValueOf(rhs) 117 | if vlhs.Type() != vrhs.Type() { 118 | log.Fatalf("Types differ %v %v", vlhs.Type(), vrhs.Type()) 119 | return false 120 | } 121 | for k := 0; k < vlhs.NumField(); k++ { 122 | tagstring := string(reflect.TypeOf(lhs).Field(k).Tag) 123 | tags := strings.Split(tagstring, ",") 124 | doskip := false 125 | for _, k := range tags { 126 | if chk[k] { 127 | doskip = true 128 | } 129 | } 130 | if doskip { 131 | continue 132 | } 133 | if !reflect.DeepEqual(vlhs.Field(k).Interface(), vrhs.Field(k).Interface()) { 134 | log.Fatalf("Field differs: %v, %v != %v", reflect.TypeOf(lhs).Field(k).Name, 135 | vlhs.Field(k).Interface(), vrhs.Field(k).Interface()) 136 | return false 137 | } 138 | } 139 | return true 140 | } 141 | 142 | var _bs *BlockStore = nil 143 | var _gen *Generation = nil 144 | 145 | func mBS() { 146 | testuuid := uuid.NewRandom() 147 | params := map[string]string{ 148 | "dbpath": "/srv/quasartestdb/", 149 | "mongoserver": "localhost", 150 | "cachesize": "0", 151 | } 152 | nbs, err := NewBlockStore(params) 153 | if err != nil { 154 | log.Panic(err) 155 | } 156 | if _bs == nil { 157 | _bs = nbs 158 | _gen = _bs.ObtainGeneration(testuuid) 159 | } 160 | } 161 | 162 | func TestCoreBlockSERDES(t *testing.T) { 163 | db := MakeCoreblock() 164 | buf := make([]byte, CBSIZE) 165 | db.Serialize(buf) 166 | out := new(Coreblock) 167 | out.Deserialize(buf) 168 | if !CompareNoTags(*db, *out, []string{"implicit"}) { 169 | t.Error("Core block SERDES faled") 170 | } 171 | } 172 | 173 | func TestCoreBlockSERDESAbsFullZero(t *testing.T) { 174 | db := MakeCoreblock() 175 | db.Addr[10] = 0 176 | db.Min[10] = 0 177 | db.Mean[10] = 0 178 | db.Max[10] = 0 179 | db.Count[10] = 0 180 | 181 | db.Addr[11] = 0 182 | db.Min[11] = 0 183 | db.Mean[11] = 0 184 | db.Max[11] = 0 185 | db.Count[11] = 0 186 | db.CGeneration[11] = 0 187 | 188 | db.Addr[54] = 0 189 | db.Min[54] = 0 190 | db.Mean[54] = 0 191 | db.Max[54] = 0 192 | db.Count[54] = 0 193 | 194 | for i := 55; i < KFACTOR; i++ { 195 | db.Addr[i] = 0 196 | db.Min[i] = 0 197 | db.Mean[i] = 0 198 | db.Max[i] = 0 199 | db.Count[i] = 0 200 | db.CGeneration[i] = 0 201 | } 202 | 203 | buf := make([]byte, CBSIZE) 204 | db.Serialize(buf) 205 | out := new(Coreblock) 206 | out.Deserialize(buf) 207 | 208 | if !CompareNoTags(*db, *out, []string{"implicit"}) { 209 | t.Error("Core block SERDES faled") 210 | } 211 | } 212 | 213 | func TestCoreBlockBadDES(t *testing.T) { 214 | db := MakeCoreblock() 215 | buf := make([]byte, CBSIZE) 216 | db.Serialize(buf) 217 | out := new(Coreblock) 218 | out.Deserialize(buf) 219 | if out.GetDatablockType() != Core { 220 | t.FailNow() 221 | } 222 | defer func() { 223 | if r := recover(); r == nil { 224 | //We expected a failure 225 | t.FailNow() 226 | } 227 | }() 228 | vb := new(Vectorblock) 229 | vb.Deserialize(buf) 230 | t.FailNow() 231 | } 232 | func TestVectorBlockBadDES(t *testing.T) { 233 | v := MakeVBlock() 234 | buf := make([]byte, VBSIZE) 235 | v.Serialize(buf) 236 | out := new(Vectorblock) 237 | out.Deserialize(buf) 238 | if 
out.GetDatablockType() != Vector { 239 | t.Fatal("Wrong id on block") 240 | } 241 | defer func() { 242 | if r := recover(); r == nil { 243 | //We expected a failure 244 | t.Fatal("Did not throw exception") 245 | } 246 | }() 247 | cb := new(Coreblock) 248 | cb.Deserialize(buf) 249 | t.FailNow() 250 | } 251 | func TestBufferType(t *testing.T) { 252 | v := MakeVBlock() 253 | buf := make([]byte, VBSIZE) 254 | v.Serialize(buf) 255 | if DatablockGetBufferType(buf) != Vector { 256 | t.Fatal("Expected Vector") 257 | } 258 | c := MakeCoreblock() 259 | buf2 := make([]byte, CBSIZE) 260 | c.Serialize(buf2) 261 | if DatablockGetBufferType(buf2) != Core { 262 | t.Fatal("Expected Core") 263 | } 264 | buf3 := make([]byte, 2) 265 | buf3[0] = byte(5) 266 | if DatablockGetBufferType(buf3) != Bad { 267 | t.Fatal("Expected Bad") 268 | } 269 | } 270 | func TestVBlockSERDES(t *testing.T) { 271 | v := MakeVBlock() 272 | buf := make([]byte, VBSIZE) 273 | v.Serialize(buf) 274 | out := new(Vectorblock) 275 | out.Deserialize(buf) 276 | if !CompareNoTags(*v, *out, []string{"implicit"}) { 277 | t.Error("Vector block SERDES failed") 278 | } 279 | } 280 | 281 | func TestVBlockManSERDES(t *testing.T) { 282 | v := new(Vectorblock) 283 | for i := 0; i < 6; i++ { 284 | v.Time[i] = int64(i * 100000) 285 | v.Value[i] = float64(i * 100000.0) 286 | } 287 | v.Len = 6 288 | buf := make([]byte, VBSIZE) 289 | v.Serialize(buf) 290 | out := new(Vectorblock) 291 | out.Deserialize(buf) 292 | for i := 0; i < 6; i++ { 293 | if v.Value[i] != out.Value[i] { 294 | t.Error("Fail") 295 | } 296 | } 297 | } 298 | 299 | func TestCBlockE2ESERDES(t *testing.T) { 300 | db := MakeAllocatedCoreblock() 301 | for i := 0; i < KFACTOR; i++ { 302 | vb, err := _gen.AllocateVectorblock() 303 | if err != nil { 304 | t.Errorf("Could not allocate VB %v", err) 305 | } 306 | reloc_addr := vb.Identifier 307 | FillBlock(vb) 308 | vb.Len = VSIZE 309 | vb.Identifier = reloc_addr 310 | db.Addr[i] = vb.Identifier 311 | } 312 | cpy := *db 313 | amap, err := _gen.Commit() 314 | if err != nil { 315 | t.Error(err) 316 | } 317 | _bs = nil 318 | _gen = nil 319 | log.Info("reloc address was 0x%016x", cpy.Identifier) 320 | log.Info("cnt0 was %v", cpy.Count[0]) 321 | actual_addr, ok := amap[cpy.Identifier] 322 | if !ok { 323 | t.Errorf("relocation address 0x%016x did not exist in address map", cpy.Identifier) 324 | } 325 | mBS() 326 | out := _bs.ReadDatablock(actual_addr, cpy.Generation, cpy.PointWidth, cpy.StartTime) 327 | cpy.Identifier = actual_addr 328 | for i := 0; i < KFACTOR; i++ { 329 | cpy.Addr[i] = amap[cpy.Addr[i]] 330 | } 331 | if !CompareNoTags(cpy, *(out.(*Coreblock)), []string{}) { 332 | t.Error("E2E C SERDES failed") 333 | } 334 | } 335 | 336 | func TestVBlockE2ESERDES(t *testing.T) { 337 | db := MakeAllocatedVBlock() 338 | cpy := *db 339 | amap, err := _gen.Commit() 340 | if err != nil { 341 | t.Error(err) 342 | } 343 | _bs = nil 344 | _gen = nil 345 | log.Info("reloc address was 0x%016x", cpy.Identifier) 346 | actual_addr, ok := amap[cpy.Identifier] 347 | if !ok { 348 | t.Errorf("relocation address 0x%016x did not exist in address map", cpy.Identifier) 349 | } 350 | mBS() 351 | out := _bs.ReadDatablock(actual_addr, cpy.Generation, cpy.PointWidth, cpy.StartTime) 352 | cpy.Identifier = actual_addr 353 | //cpy.Identifier = actual_addr 354 | if !CompareNoTags(cpy, *(out.(*Vectorblock)), []string{}) { 355 | t.Error("E2E V SERDES failed") 356 | } 357 | } 358 | 359 | func TestVCopyInto(t *testing.T) { 360 | db := MakeVBlock() 361 | out := &Vectorblock{} 362 | 
db.CopyInto(out) 363 | if !CompareNoTags(*db, *out, []string{"metadata"}) { 364 | t.Error("V CopyInto failed") 365 | } 366 | } 367 | 368 | func TestCCopyInto(t *testing.T) { 369 | db := MakeCoreblock() 370 | out := &Coreblock{} 371 | db.CopyInto(out) 372 | if !CompareNoTags(*db, *out, []string{"metadata"}) { 373 | t.Error("C CopyInto failed") 374 | } 375 | } 376 | 377 | /* 378 | func BenchmarkSERDER(b *testing.B) { 379 | dblocks_in := make([]*Coreblock, b.N) 380 | for i := 0; i < b.N; i++ { 381 | dblocks_in[i] = MakeCoreblock() 382 | } 383 | dblocks_out := make([]*Coreblock, b.N) 384 | for i := 0; i < b.N; i++ { 385 | dblocks_out[i] = new(Coreblock) 386 | } 387 | buf := make([]byte, DBSIZE) 388 | b.ResetTimer() 389 | for i := 0; i < b.N; i++ { 390 | dblocks_in[0].Serialize(buf) 391 | dblocks_out[0].Deserialize(buf) 392 | } 393 | } 394 | */ 395 | -------------------------------------------------------------------------------- /qtree/qtree_utils.go: -------------------------------------------------------------------------------- 1 | package qtree 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/pborman/uuid" 7 | "github.com/SoftwareDefinedBuildings/btrdb/internal/bstore" 8 | ) 9 | 10 | const PWFACTOR = bstore.PWFACTOR 11 | const KFACTOR = bstore.KFACTOR 12 | const MICROSECOND = 1000 13 | const MILLISECOND = 1000 * MICROSECOND 14 | const SECOND = 1000 * MILLISECOND 15 | const MINUTE = 60 * SECOND 16 | const HOUR = 60 * MINUTE 17 | const DAY = 24 * HOUR 18 | const ROOTPW = 56 //This makes each bucket at the root ~= 2.2 years 19 | //so the root spans 146.23 years 20 | const ROOTSTART = -1152921504606846976 //This makes the 16th bucket start at 1970 (0) 21 | const MinimumTime = -(16 << 56) 22 | const MaximumTime = (48 << 56) 23 | 24 | type QTree struct { 25 | sb *bstore.Superblock 26 | bs *bstore.BlockStore 27 | gen *bstore.Generation 28 | root *QTreeNode 29 | commited bool 30 | } 31 | 32 | type Record struct { 33 | Time int64 34 | Val float64 35 | } 36 | 37 | type QTreeNode struct { 38 | tr *QTree 39 | vector_block *bstore.Vectorblock 40 | core_block *bstore.Coreblock 41 | isLeaf bool 42 | child_cache [bstore.KFACTOR]*QTreeNode 43 | parent *QTreeNode 44 | isNew bool 45 | } 46 | 47 | type RecordSlice []Record 48 | 49 | type ChangedRange struct { 50 | Valid bool 51 | Start int64 52 | End int64 53 | } 54 | 55 | func (s RecordSlice) Len() int { 56 | return len(s) 57 | } 58 | 59 | func (s RecordSlice) Swap(i, j int) { 60 | s[i], s[j] = s[j], s[i] 61 | } 62 | 63 | func (s RecordSlice) Less(i, j int) bool { 64 | return s[i].Time < s[j].Time 65 | } 66 | 67 | func (tr *QTree) Commit() { 68 | if tr.commited { 69 | log.Panicf("Tree alredy comitted") 70 | } 71 | if tr.gen == nil { 72 | log.Panicf("Commit on non-write-tree") 73 | } 74 | 75 | tr.gen.Commit() 76 | tr.commited = true 77 | tr.gen = nil 78 | 79 | } 80 | 81 | func (n *QTree) FindNearestValue(time int64, backwards bool) (Record, error) { 82 | if n.root == nil { 83 | return Record{}, ErrNoSuchPoint 84 | } 85 | return n.root.FindNearestValue(time, backwards) 86 | } 87 | 88 | func (n *QTree) Generation() uint64 { 89 | if n.gen != nil { 90 | //Return the gen it will have after commit 91 | return n.gen.Number() 92 | } else { 93 | //Return it's current gen 94 | return n.sb.Gen() 95 | } 96 | return n.gen.Number() 97 | } 98 | 99 | func (tr *QTree) GetReferencedAddrsDebug() map[uint64]bool { 100 | refset := make(map[uint64]bool, 1024000) 101 | 102 | rchan := tr.GetAllReferencedVAddrs() 103 | //for i, v := range e_tree. 
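//Drain the channel returned by GetAllReferencedVAddrs into a set. The walk
//goroutine is expected to close rchan once every node has been visited, which
//is what makes ok go false and ends the loop; the log line every 8192
//addresses is just progress output for large trees.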
104 | idx := 0 105 | for { 106 | val, ok := <-rchan 107 | if idx%8192 == 0 { 108 | log.Info("Got referenced addr #%d", idx) 109 | } 110 | idx += 1 111 | if !ok { 112 | break 113 | } 114 | refset[val] = true 115 | } 116 | return refset 117 | } 118 | 119 | func (tr *QTree) LoadNode(addr uint64, impl_Generation uint64, impl_Pointwidth uint8, impl_StartTime int64) (*QTreeNode, error) { 120 | db := tr.bs.ReadDatablock(tr.sb.Uuid(), addr, impl_Generation, impl_Pointwidth, impl_StartTime) 121 | n := &QTreeNode{tr: tr} 122 | switch db.GetDatablockType() { 123 | case bstore.Vector: 124 | n.vector_block = db.(*bstore.Vectorblock) 125 | n.isLeaf = true 126 | case bstore.Core: 127 | n.core_block = db.(*bstore.Coreblock) 128 | n.isLeaf = false 129 | default: 130 | log.Panicf("What kind of type is this? %+v", db.GetDatablockType()) 131 | } 132 | if n.ThisAddr() == 0 { 133 | log.Panicf("Node has zero address") 134 | } 135 | return n, nil 136 | } 137 | 138 | func (tr *QTree) NewCoreNode(startTime int64, pointWidth uint8) (*QTreeNode, error) { 139 | if tr.gen == nil { 140 | return nil, ErrImmutableTree 141 | } 142 | cb, err := tr.gen.AllocateCoreblock() 143 | if err != nil { 144 | return nil, err 145 | } 146 | cb.PointWidth = pointWidth 147 | startTime = ClampTime(startTime, pointWidth) 148 | cb.StartTime = startTime 149 | rv := &QTreeNode{ 150 | core_block: cb, 151 | tr: tr, 152 | isNew: true, 153 | } 154 | return rv, nil 155 | } 156 | 157 | func (tr *QTree) NewVectorNode(startTime int64, pointWidth uint8) (*QTreeNode, error) { 158 | if tr.gen == nil { 159 | return nil, ErrImmutableTree 160 | } 161 | vb, err := tr.gen.AllocateVectorblock() 162 | if err != nil { 163 | return nil, err 164 | } 165 | vb.PointWidth = pointWidth 166 | startTime = ClampTime(startTime, pointWidth) 167 | vb.StartTime = startTime 168 | rv := &QTreeNode{ 169 | vector_block: vb, 170 | tr: tr, 171 | isLeaf: true, 172 | isNew: true, 173 | } 174 | return rv, nil 175 | } 176 | 177 | /** 178 | * Load a quasar tree 179 | */ 180 | func NewReadQTree(bs *bstore.BlockStore, id uuid.UUID, generation uint64) (*QTree, error) { 181 | sb := bs.LoadSuperblock(id, generation) 182 | if sb == nil { 183 | return nil, ErrNoSuchStream 184 | } 185 | rv := &QTree{sb: sb, bs: bs} 186 | if sb.Root() != 0 { 187 | rt, err := rv.LoadNode(sb.Root(), sb.Gen(), ROOTPW, ROOTSTART) 188 | if err != nil { 189 | log.Panicf("%v", err) 190 | return nil, err 191 | } 192 | //log.Debug("The start time for the root is %v",rt.StartTime()) 193 | rv.root = rt 194 | } 195 | return rv, nil 196 | } 197 | 198 | func NewWriteQTree(bs *bstore.BlockStore, id uuid.UUID) (*QTree, error) { 199 | gen := bs.ObtainGeneration(id) 200 | rv := &QTree{ 201 | sb: gen.New_SB, 202 | gen: gen, 203 | bs: bs, 204 | } 205 | 206 | //If there is an existing root node, we need to load it so that it 207 | //has the correct values 208 | if rv.sb.Root() != 0 { 209 | rt, err := rv.LoadNode(rv.sb.Root(), rv.sb.Gen(), ROOTPW, ROOTSTART) 210 | if err != nil { 211 | log.Panicf("%v", err) 212 | return nil, err 213 | } 214 | rv.root = rt 215 | } else { 216 | rt, err := rv.NewCoreNode(ROOTSTART, ROOTPW) 217 | if err != nil { 218 | log.Panicf("%v", err) 219 | return nil, err 220 | } 221 | rv.root = rt 222 | } 223 | 224 | return rv, nil 225 | } 226 | 227 | func (n *QTreeNode) Generation() uint64 { 228 | if n.isLeaf { 229 | return n.vector_block.Generation 230 | } else { 231 | return n.core_block.Generation 232 | } 233 | } 234 | 235 | func (n *QTreeNode) TreePath() string { 236 | rv := "" 237 | if n.isLeaf { 238 | rv 
+= "V" 239 | } else { 240 | rv += "C" 241 | } 242 | dn := n 243 | for { 244 | par := dn.Parent() 245 | if par == nil { 246 | return rv 247 | } 248 | //Try locate the index of this node in the parent 249 | addr := dn.ThisAddr() 250 | found := false 251 | for i := 0; i < bstore.KFACTOR; i++ { 252 | if par.core_block.Addr[i] == addr { 253 | rv = fmt.Sprintf("(%v)[%v].", par.PointWidth(), i) + rv 254 | found = true 255 | break 256 | } 257 | } 258 | if !found { 259 | log.Panicf("Could not find self address in parent") 260 | } 261 | dn = par 262 | } 263 | } 264 | 265 | func (n *QTreeNode) ArbitraryStartTime(idx uint64, pw uint8) int64 { 266 | return n.StartTime() + int64(idx*(1<> n.PointWidth()) 295 | if rv >= bstore.KFACTOR { 296 | rv = bstore.KFACTOR - 1 297 | } 298 | return uint16(rv) 299 | } 300 | 301 | //Unlike core nodes, vectors have infinitely many buckets. This 302 | //function allows you to get a bucket idx for a time and an 303 | //arbitrary point width 304 | func (n *QTreeNode) ClampVBucket(t int64, pw uint8) uint64 { 305 | if !n.isLeaf { 306 | log.Panicf("This is intended for vectors") 307 | } 308 | if t < n.StartTime() { 309 | t = n.StartTime() 310 | } 311 | t -= n.StartTime() 312 | if pw > n.Parent().PointWidth() { 313 | log.Panicf("I can't do this dave") 314 | } 315 | idx := uint64(t) >> pw 316 | maxidx := uint64(n.Parent().WidthTime()) >> pw 317 | if idx >= maxidx { 318 | idx = maxidx - 1 319 | } 320 | return idx 321 | } 322 | 323 | func (n *QTreeNode) clone() (*QTreeNode, error) { 324 | var rv *QTreeNode 325 | var err error 326 | if !n.isLeaf { 327 | rv, err = n.tr.NewCoreNode(n.StartTime(), n.PointWidth()) 328 | if err != nil { 329 | return nil, err 330 | } 331 | n.core_block.CopyInto(rv.core_block) 332 | } else { 333 | rv, err = n.tr.NewVectorNode(n.StartTime(), n.PointWidth()) 334 | if err != nil { 335 | return nil, err 336 | } 337 | n.vector_block.CopyInto(rv.vector_block) 338 | } 339 | return rv, nil 340 | } 341 | 342 | func (n *QTreeNode) EndTime() int64 { 343 | if n.isLeaf { 344 | //We do this because out point width might not be *KFACTOR as we might be 345 | //at the lowest level 346 | return n.StartTime() + (1 << n.Parent().PointWidth()) 347 | } else { 348 | //A core node has multiple buckets 349 | return n.StartTime() + (1< 100 { 214 | total := 0 215 | for _, v:= range gen.vblocks { 216 | total += int(v.Len) 217 | } 218 | log.Critical("Triggered vblock examination: %v blocks, %v points, %v avg", len(gen.vblocks), total, total/len(gen.vblocks)) 219 | }*/ 220 | gen.vblocks = nil 221 | gen.cblocks = nil 222 | 223 | fsb := fake_sblock{ 224 | Uuid: gen.New_SB.uuid.String(), 225 | Gen: gen.New_SB.gen, 226 | Root: gen.New_SB.root, 227 | } 228 | if err := gen.blockstore.db.C("superblocks").Insert(fsb); err != nil { 229 | lg.Panic(err) 230 | } 231 | gen.flushed = true 232 | gen.blockstore.glock.RLock() 233 | //log.Printf("bs is %v, wlocks is %v", gen.blockstore, gen.blockstore._wlocks) 234 | gen.blockstore._wlocks[UUIDToMapKey(*gen.Uuid())].Unlock() 235 | gen.blockstore.glock.RUnlock() 236 | return address_map, nil 237 | } 238 | 239 | func (bs *BlockStore) datablockBarrier(fi int) { 240 | //Gonuts group says that I don't need to call Sync() 241 | 242 | //Block until all datablocks have finished writing 243 | /*bs.blockmtx[fi].Lock() 244 | err := bs.dbf[fi].Sync() 245 | if err != nil { 246 | log.Panic(err) 247 | } 248 | bs.blockmtx[fi].Unlock()*/ 249 | //bs.ses.Fsync(false) 250 | } 251 | 252 | func (bs *BlockStore) allocateBlock() uint64 { 253 | relocation_address := <-bs.alloc 
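//The address handed out here is a relocation placeholder rather than a final
//on-disk address: when the generation commits, Commit() returns a map from
//these placeholders to the real addresses, which callers use to rewrite child
//pointers (see the E2E SERDES tests). Roughly:
//  gen := bs.ObtainGeneration(id)
//  vb, _ := gen.AllocateVectorblock() // vb.Identifier is a placeholder
//  amap, _ := gen.Commit()            // amap[vb.Identifier] is the real address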
254 | return relocation_address 255 | } 256 | 257 | /** 258 | * The real function is supposed to allocate an address for the data 259 | * block, reserving it on disk, and then give back the data block that 260 | * can be filled in 261 | * This stub makes up an address, and mongo pretends its real 262 | */ 263 | func (gen *Generation) AllocateCoreblock() (*Coreblock, error) { 264 | cblock := &Coreblock{} 265 | cblock.Identifier = gen.blockstore.allocateBlock() 266 | cblock.Generation = gen.Number() 267 | gen.cblocks = append(gen.cblocks, cblock) 268 | return cblock, nil 269 | } 270 | 271 | func (gen *Generation) AllocateVectorblock() (*Vectorblock, error) { 272 | vblock := &Vectorblock{} 273 | vblock.Identifier = gen.blockstore.allocateBlock() 274 | vblock.Generation = gen.Number() 275 | gen.vblocks = append(gen.vblocks, vblock) 276 | return vblock, nil 277 | } 278 | 279 | func (bs *BlockStore) FreeCoreblock(cb **Coreblock) { 280 | *cb = nil 281 | } 282 | 283 | func (bs *BlockStore) FreeVectorblock(vb **Vectorblock) { 284 | *vb = nil 285 | } 286 | 287 | func (bs *BlockStore) DEBUG_DELETE_UUID(id uuid.UUID) { 288 | lg.Info("DEBUG removing uuid '%v' from database", id.String()) 289 | _, err := bs.db.C("superblocks").RemoveAll(bson.M{"uuid": id.String()}) 290 | if err != nil && err != mgo.ErrNotFound { 291 | lg.Panic(err) 292 | } 293 | if err == mgo.ErrNotFound { 294 | lg.Info("Quey did not find supeblock to delete") 295 | } else { 296 | lg.Info("err was nik") 297 | } 298 | //bs.datablockBarrier() 299 | } 300 | 301 | func (bs *BlockStore) ReadDatablock(uuid uuid.UUID, addr uint64, impl_Generation uint64, impl_Pointwidth uint8, impl_StartTime int64) Datablock { 302 | //Try hit the cache first 303 | db := bs.cacheGet(addr) 304 | if db != nil { 305 | return db 306 | } 307 | syncbuf := block_buf_pool.Get().([]byte) 308 | trimbuf := bs.store.Read([]byte(uuid), addr, syncbuf) 309 | switch DatablockGetBufferType(trimbuf) { 310 | case Core: 311 | rv := &Coreblock{} 312 | rv.Deserialize(trimbuf) 313 | block_buf_pool.Put(syncbuf) 314 | rv.Identifier = addr 315 | rv.Generation = impl_Generation 316 | rv.PointWidth = impl_Pointwidth 317 | rv.StartTime = impl_StartTime 318 | bs.cachePut(addr, rv) 319 | return rv 320 | case Vector: 321 | rv := &Vectorblock{} 322 | rv.Deserialize(trimbuf) 323 | block_buf_pool.Put(syncbuf) 324 | rv.Identifier = addr 325 | rv.Generation = impl_Generation 326 | rv.PointWidth = impl_Pointwidth 327 | rv.StartTime = impl_StartTime 328 | bs.cachePut(addr, rv) 329 | return rv 330 | } 331 | lg.Panic("Strange datablock type") 332 | return nil 333 | } 334 | 335 | type fake_sblock struct { 336 | Uuid string 337 | Gen uint64 338 | Root uint64 339 | Unlinked bool 340 | } 341 | 342 | func (bs *BlockStore) LoadSuperblock(id uuid.UUID, generation uint64) *Superblock { 343 | var sb = fake_sblock{} 344 | if generation == LatestGeneration { 345 | //log.Info("loading superblock uuid=%v (lgen)", id.String()) 346 | qry := bs.db.C("superblocks").Find(bson.M{"uuid": id.String()}) 347 | if err := qry.Sort("-gen").One(&sb); err != nil { 348 | if err == mgo.ErrNotFound { 349 | lg.Info("sb notfound!") 350 | return nil 351 | } else { 352 | lg.Panic(err) 353 | } 354 | } 355 | } else { 356 | qry := bs.db.C("superblocks").Find(bson.M{"uuid": id.String(), "gen": generation}) 357 | if err := qry.One(&sb); err != nil { 358 | if err == mgo.ErrNotFound { 359 | return nil 360 | } else { 361 | lg.Panic(err) 362 | } 363 | } 364 | } 365 | rv := Superblock{ 366 | uuid: id, 367 | gen: sb.Gen, 368 | root: sb.Root, 369 
| unlinked: sb.Unlinked, 370 | } 371 | return &rv 372 | } 373 | 374 | func CreateDatabase(params map[string]string) { 375 | ses, err := mgo.Dial(params["mongoserver"]) 376 | if err != nil { 377 | lg.Critical("Could not connect to mongo database", err) 378 | os.Exit(1) 379 | } 380 | db := ses.DB(params["collection"]) 381 | idx := mgo.Index{ 382 | Key: []string{"uuid", "-gen"}, 383 | Unique: true, 384 | DropDups: true, 385 | Background: true, 386 | Sparse: false, 387 | } 388 | db.C("superblocks").EnsureIndex(idx) 389 | switch params["provider"] { 390 | case "file": 391 | if err := os.MkdirAll(params["dbpath"], 0755); err != nil { 392 | lg.Panic(err) 393 | } 394 | fp := new(fileprovider.FileStorageProvider) 395 | err := fp.CreateDatabase(params) 396 | if err != nil { 397 | lg.Critical("Error on create: %v", err) 398 | os.Exit(1) 399 | } 400 | case "ceph": 401 | cp := new(cephprovider.CephStorageProvider) 402 | err := cp.CreateDatabase(params) 403 | if err != nil { 404 | lg.Critical("Error on create: %v", err) 405 | os.Exit(1) 406 | } 407 | } 408 | } 409 | -------------------------------------------------------------------------------- /quasar_test.go: -------------------------------------------------------------------------------- 1 | package btrdb 2 | 3 | import ( 4 | "fmt" 5 | _ "log" 6 | "math/rand" 7 | "testing" 8 | "time" 9 | 10 | "github.com/pborman/uuid" 11 | "github.com/SoftwareDefinedBuildings/btrdb/qtree" 12 | ) 13 | 14 | const MICROSECOND = 1000 15 | const MILLISECOND = 1000 * MICROSECOND 16 | const SECOND = 1000 * MILLISECOND 17 | const MINUTE = 60 * SECOND 18 | const HOUR = 60 * MINUTE 19 | const DAY = 24 * HOUR 20 | 21 | /* 22 | func TestMultInsert(t *testing.T) { 23 | testuuid := uuid.NewRandom() 24 | cfg := &DefaultQuasarConfig 25 | cfg.BlockPath = "/srv/quasartestdb" 26 | q, err := NewQuasar(cfg) 27 | if err != nil { 28 | log.Panic(err) 29 | } 30 | vals := []qtree.Record{{10, 10}, {20, 20}} 31 | q.InsertValues(testuuid, vals) 32 | q.InsertValues(testuuid, vals) 33 | } 34 | */ 35 | func init() { 36 | sd := time.Now().Unix() 37 | fmt.Printf(">>>> USING %v AS SEED <<<<<", sd) 38 | rand.Seed(sd) 39 | } 40 | 41 | /* 42 | var _bs *bstore.BlockStore = nil 43 | 44 | func mBS() { 45 | if _bs == nil { 46 | nbs, err := bstore.NewBlockStore("localhost", 0, "/srv/quasartestdb/") 47 | if err != nil { 48 | log.Panic(err) 49 | } 50 | _bs = nbs 51 | } 52 | } 53 | func GenBrk(avg uint64, spread uint64) chan uint64 { 54 | rv := make(chan uint64) 55 | go func() { 56 | for { 57 | num := int64(avg) 58 | num -= int64(spread / 2) 59 | num += rand.Int63n(int64(spread)) 60 | rv <- uint64(num) 61 | } 62 | }() 63 | return rv 64 | } 65 | func GenData(s int64, e int64, avgTimeBetweenSamples uint64, 66 | spread uint64, dat func(int64) float64) []qtree.Record { 67 | if avgTimeBetweenSamples == 0 { 68 | panic("lolwut") 69 | } 70 | if e <= s { 71 | panic("s<=e") 72 | } 73 | log.Printf("e %v s %v avt %v", s, e, avgTimeBetweenSamples) 74 | p3 := uint64((e-s))/avgTimeBetweenSamples + 100 75 | log.Printf("p3: ", p3) 76 | rv := make([]qtree.Record, 0, p3) 77 | r := qtree.Record{} 78 | for t := s; t < e; { 79 | r.Time = t 80 | r.Val = dat(t) 81 | rv = append(rv, r) 82 | nt := t + int64(avgTimeBetweenSamples) 83 | if spread != 0 { 84 | nt -= int64(spread / 2) 85 | nt += rand.Int63n(int64(spread)) 86 | } 87 | if nt > t { 88 | t = nt 89 | } 90 | } 91 | return rv 92 | } 93 | func MakeWTree() (*qtree.QTree, uuid.UUID) { 94 | id := uuid.NewRandom() 95 | mBS() 96 | tr, err := qtree.NewWriteQTree(_bs, id) 97 | if err != 
nil { 98 | log.Panic(err) 99 | } 100 | return tr, id 101 | } 102 | */ 103 | func CompareData(lhs []qtree.Record, rhs []qtree.Record) { 104 | if len(lhs) != len(rhs) { 105 | log.Panicf("lhs != rhs len %d vs %d\n", len(lhs), len(rhs)) 106 | } 107 | for i, v := range lhs { 108 | if rhs[i] != v { 109 | log.Panic("data differs") 110 | } 111 | } 112 | } 113 | 114 | /* 115 | func LoadWTree(id uuid.UUID) *qtree.QTree { 116 | mBS() 117 | tr, err := qtree.NewWriteQTree(_bs, id) 118 | if err != nil { 119 | log.Panic(err) 120 | } 121 | return tr 122 | } 123 | 124 | //This flushes, for now 125 | func TestInsertFlush(t *testing.T) { 126 | gs := int64(23) * 365 * DAY 127 | ge := int64(25) * 365 * DAY 128 | freq := uint64(100 * MINUTE) 129 | varn := uint64(10 * MINUTE) 130 | tdat := GenData(gs, ge, freq, varn, 131 | func(_ int64) float64 { return rand.Float64() }) 132 | log.Printf("generated %v records", len(tdat)) 133 | 134 | cfg := &DefaultQuasarConfig 135 | cfg.BlockPath = "/srv/quasartestdb" 136 | q, err := NewQuasar(cfg) 137 | if err != nil { 138 | log.Panic(err) 139 | } 140 | 141 | id := uuid.NewRandom() 142 | log.Printf("Generating uuid=%s", id) 143 | brk := GenBrk(100, 50) 144 | idx := 0 145 | for idx < len(tdat) { 146 | time.Sleep(100 * time.Millisecond) 147 | ln := int(<-brk) 148 | end := idx + ln 149 | if end > len(tdat) { 150 | end = len(tdat) 151 | } 152 | q.InsertValues(id, tdat[idx:end]) 153 | q.Flush(id) 154 | idx += ln 155 | } 156 | 157 | q.Flush(id) 158 | 159 | dat, gen, err := q.QueryValues(id, gs, ge, LatestGeneration) 160 | if err != nil { 161 | log.Panic(err) 162 | } 163 | log.Printf("Test gen was: %v", gen) 164 | CompareData(dat, tdat) 165 | 166 | } 167 | */ 168 | func TestArbWindow(t *testing.T) { 169 | Params := map[string]string{ 170 | "mongoserver": "localhost", 171 | "provider": "file", 172 | "cachesize": "16000", 173 | "collection": "testdb", 174 | "dbpath": "/srv/testqdb/", 175 | } 176 | cfg := QuasarConfig{ 177 | DatablockCacheSize: uint64(0), 178 | TransactionCoalesceEnable: true, 179 | TransactionCoalesceInterval: uint64(5000), 180 | TransactionCoalesceEarlyTrip: uint64(16000), 181 | Params: Params, 182 | } 183 | q, err := NewQuasar(&cfg) 184 | if err != nil { 185 | log.Panicf("error: ", err) 186 | } 187 | startt := 0 188 | deltat := 1000000000 189 | tnum := 50000 190 | tdat := make([]qtree.Record, tnum) 191 | id := uuid.NewRandom() 192 | for i := 0; i < tnum; i++ { 193 | tdat[i].Time = int64(startt) + int64(deltat*i) 194 | tdat[i].Val = float64(i) 195 | } 196 | q.InsertValues(id, tdat) 197 | for i := 0; i < tnum; i++ { 198 | tdat[i].Time = int64(startt) + int64(deltat*i) + int64(tnum*2*deltat) 199 | tdat[i].Val = float64(i) 200 | } 201 | q.InsertValues(id, tdat) 202 | q.Flush(id) 203 | time.Sleep(2 * time.Second) 204 | log.Info("Stream: %+v\n", id) 205 | var rstart int64 = int64(startt) - int64(4000*deltat) 206 | var rend int64 = int64(startt + deltat*250000 + 5000000000) 207 | rvalc, _ := q.QueryWindow(id, rstart, rend, LatestGeneration, uint64(deltat)*700, 0) 208 | for { 209 | v, ok := <-rvalc 210 | log.Info("reading: %+v", v) 211 | if !ok { 212 | panic("eof") 213 | } 214 | /*exp := float64(v.Time+v.Time+int64(deltat)) / float64(deltat) / 2.0 215 | if math.Abs(v.Mean-exp) > 0.00001 { 216 | log.Panicf("got bad %+v\n expected mean: ", v, exp) 217 | }*/ 218 | } 219 | } 220 | 221 | /* 222 | func TestUnlinkBlocks(t *testing.T) { 223 | 224 | gs := int64(24) * 365 * DAY 225 | ge := int64(25) * 365 * DAY 226 | freq := uint64(300 * MINUTE) 227 | varn := uint64(10 * MINUTE) 228 | 
tdat := GenData(gs, ge, freq, varn, 229 | func(_ int64) float64 { return rand.Float64() }) 230 | log.Printf("generated %v records", len(tdat)) 231 | 232 | cfg := &DefaultQuasarConfig 233 | cfg.BlockPath = "/srv/quasartestdb" 234 | q, err := NewQuasar(cfg) 235 | if err != nil { 236 | log.Panic(err) 237 | } 238 | 239 | { 240 | alloced, free, strange, leaked := q.bs.InspectBlocks() 241 | log.Printf("BEFORE SUMMARY:") 242 | log.Printf("ALLOCED: %d", alloced) 243 | log.Printf("FREE : %d", free) 244 | log.Printf("STRANGE: %d", strange) 245 | log.Printf("LEAKED : %d", leaked) 246 | log.Printf("USAGE : %.2f %%\n", float64(alloced)/float64(alloced+free)*100) 247 | } 248 | id := uuid.NewRandom() 249 | log.Printf("Generating uuid=%s", id) 250 | brk := GenBrk(100, 50) 251 | idx := 0 252 | for idx < len(tdat) { 253 | time.Sleep(1 * time.Second) 254 | ln := int(<-brk) 255 | end := idx + ln 256 | if end > len(tdat) { 257 | end = len(tdat) 258 | } 259 | q.InsertValues(id, tdat[idx:end]) 260 | idx += ln 261 | } 262 | //Allow for coalescence 263 | time.Sleep(10 * time.Second) 264 | 265 | { 266 | alloced, free, strange, leaked := q.bs.InspectBlocks() 267 | log.Printf("AFTER SUMMARY:") 268 | log.Printf("ALLOCED: %d", alloced) 269 | log.Printf("FREE : %d", free) 270 | log.Printf("STRANGE: %d", strange) 271 | log.Printf("LEAKED : %d", leaked) 272 | log.Printf("USAGE : %.2f %%\n", float64(alloced)/float64(alloced+free)*100) 273 | } 274 | { 275 | dat, gen, err := q.QueryValues(id, gs, ge, LatestGeneration) 276 | if err != nil { 277 | log.Panic(err) 278 | } 279 | log.Printf("Test gen was: %v", gen) 280 | CompareData(dat, tdat) 281 | err = q.UnlinkBlocks([]uuid.UUID{id}, []uint64{0}, []uint64{gen - 1}) 282 | if err != nil { 283 | log.Panic(err) 284 | } 285 | } 286 | 287 | { 288 | dat, gen, err := q.QueryValues(id, gs, ge, LatestGeneration) 289 | if err != nil { 290 | log.Panic(err) 291 | } 292 | log.Printf("Test gen was: %v", gen) 293 | CompareData(dat, tdat) 294 | } 295 | 296 | { 297 | alloced, free, strange, leaked := q.bs.InspectBlocks() 298 | log.Printf("AFTER2 SUMMARY:") 299 | log.Printf("ALLOCED: %d", alloced) 300 | log.Printf("FREE : %d", free) 301 | log.Printf("STRANGE: %d", strange) 302 | log.Printf("LEAKED : %d", leaked) 303 | log.Printf("USAGE : %.2f %%\n", float64(alloced)/float64(alloced+free)*100) 304 | } 305 | } 306 | func TestCompleteDelete(t *testing.T) { 307 | gs := int64(24) * 365 * DAY 308 | ge := int64(25) * 365 * DAY 309 | freq := uint64(300 * MINUTE) 310 | varn := uint64(10 * MINUTE) 311 | tdat := GenData(gs, ge, freq, varn, 312 | func(_ int64) float64 { return rand.Float64() }) 313 | log.Printf("generated %v records", len(tdat)) 314 | id := uuid.NewRandom() 315 | cfg := &DefaultQuasarConfig 316 | cfg.BlockPath = "/srv/quasartestdb" 317 | q, err := NewQuasar(cfg) 318 | if err != nil { 319 | log.Panic(err) 320 | } 321 | { 322 | q.InsertValues(id, tdat) 323 | q.Flush(id) 324 | } 325 | { 326 | dat, _, err := q.QueryValues(id, gs, ge, LatestGeneration) 327 | if err != nil { 328 | log.Panic(err) 329 | } 330 | CompareData(dat, tdat) 331 | } 332 | { 333 | q.DeleteRange(id, gs, ge+1) 334 | dat, _, err := q.QueryValues(id, gs, ge, LatestGeneration) 335 | if err != nil { 336 | log.Panic(err) 337 | } 338 | if len(dat) != 0 { 339 | t.Log("dat length wrong") 340 | t.Fail() 341 | } 342 | } 343 | { 344 | q.InsertValues(id, tdat) 345 | q.Flush(id) 346 | } 347 | { 348 | dat, _, err := q.QueryValues(id, gs, ge, LatestGeneration) 349 | if err != nil { 350 | log.Panic(err) 351 | } 352 | CompareData(dat, tdat) 
353 | } 354 | 355 | } 356 | func TestUnlinkBlocks2(t *testing.T) { 357 | 358 | gs := int64(24) * 365 * DAY 359 | ge := int64(25) * 365 * DAY 360 | freq := uint64(300 * MINUTE) 361 | varn := uint64(10 * MINUTE) 362 | tdat := GenData(gs, ge, freq, varn, 363 | func(_ int64) float64 { return rand.Float64() }) 364 | log.Printf("generated %v records", len(tdat)) 365 | 366 | cfg := &DefaultQuasarConfig 367 | cfg.BlockPath = "/srv/quasartestdb" 368 | q, err := NewQuasar(cfg) 369 | if err != nil { 370 | log.Panic(err) 371 | } 372 | 373 | { 374 | alloced, free, strange, leaked := q.bs.InspectBlocks() 375 | log.Printf("BEFORE SUMMARY:") 376 | log.Printf("ALLOCED: %d", alloced) 377 | log.Printf("FREE : %d", free) 378 | log.Printf("STRANGE: %d", strange) 379 | log.Printf("LEAKED : %d", leaked) 380 | log.Printf("USAGE : %.2f %%\n", float64(alloced)/float64(alloced+free)*100) 381 | } 382 | id := uuid.NewRandom() 383 | log.Printf("Generating uuid=%s", id) 384 | brk := GenBrk(100, 50) 385 | idx := 0 386 | for idx < len(tdat) { 387 | time.Sleep(1 * time.Second) 388 | ln := int(<-brk) 389 | end := idx + ln 390 | if end > len(tdat) { 391 | end = len(tdat) 392 | } 393 | q.InsertValues(id, tdat[idx:end]) 394 | idx += ln 395 | } 396 | //Allow for coalescence 397 | time.Sleep(10 * time.Second) 398 | { 399 | alloced, free, strange, leaked := q.bs.InspectBlocks() 400 | log.Printf("BEFORE DELETE:") 401 | log.Printf("ALLOCED: %d", alloced) 402 | log.Printf("FREE : %d", free) 403 | log.Printf("STRANGE: %d", strange) 404 | log.Printf("LEAKED : %d", leaked) 405 | log.Printf("USAGE : %.2f %%\n", float64(alloced)/float64(alloced+free)*100) 406 | } 407 | { 408 | err := q.DeleteRange(id, tdat[1].Time, ge) 409 | if err != nil { 410 | t.Error(err) 411 | } 412 | } 413 | { 414 | q.InsertValues(id, []qtree.Record{{0, 100}}) 415 | q.Flush(id) 416 | } 417 | { 418 | alloced, free, strange, leaked := q.bs.InspectBlocks() 419 | log.Printf("AFTER DELETE:") 420 | log.Printf("ALLOCED: %d", alloced) 421 | log.Printf("FREE : %d", free) 422 | log.Printf("STRANGE: %d", strange) 423 | log.Printf("LEAKED : %d", leaked) 424 | log.Printf("USAGE : %.2f %%\n", float64(alloced)/float64(alloced+free)*100) 425 | } 426 | { 427 | _, gen, err := q.QueryValues(id, gs, ge, LatestGeneration) 428 | if err != nil { 429 | log.Panic(err) 430 | } 431 | err = q.UnlinkBlocks([]uuid.UUID{id}, []uint64{0}, []uint64{gen}) 432 | if err != nil { 433 | log.Panic(err) 434 | } 435 | } 436 | 437 | { 438 | alloced, free, strange, leaked := q.bs.InspectBlocks() 439 | log.Printf("AFTER FREE:") 440 | log.Printf("ALLOCED: %d", alloced) 441 | log.Printf("FREE : %d", free) 442 | log.Printf("STRANGE: %d", strange) 443 | log.Printf("LEAKED : %d", leaked) 444 | log.Printf("USAGE : %.2f %%\n", float64(alloced)/float64(alloced+free)*100) 445 | } 446 | } 447 | */ 448 | -------------------------------------------------------------------------------- /qtree/qtree2_test.go: -------------------------------------------------------------------------------- 1 | package qtree 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "math/rand" 7 | "testing" 8 | "time" 9 | 10 | "github.com/SoftwareDefinedBuildings/btrdb/internal/bstore" 11 | ) 12 | 13 | func init() { 14 | sd := time.Now().Unix() 15 | fmt.Printf(">>>> USING %v AS SEED <<<<<", sd) 16 | //rand.Seed(1417417715) 17 | rand.Seed(sd) 18 | } 19 | func GenBrk(avg uint64, spread uint64) chan uint64 { 20 | rv := make(chan uint64) 21 | go func() { 22 | for { 23 | num := int64(avg) 24 | num -= int64(spread / 2) 25 | num += rand.Int63n(int64(spread)) 26 | 
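//Each value sent is avg +/- spread/2 (uniform), so with e.g. GenBrk(100, 50)
//the tests below get insert batches of 75..124 records; the channel never
//closes, callers simply stop reading once they run out of data.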
rv <- uint64(num) 27 | } 28 | }() 29 | return rv 30 | } 31 | 32 | //TODO PW test at range with no data 33 | func TestQT2_PW2(t *testing.T) { 34 | log.Printf("Inserting data 0-4096") 35 | te := int64(4096) 36 | tdat := GenData(0, 4096, 1, 0, func(_ int64) float64 { return rand.Float64() }) 37 | if int64(len(tdat)) != te { 38 | log.Panic("GenDat messed up a bit") 39 | } 40 | tr, uuid := MakeWTree() 41 | tr.InsertValues(tdat) 42 | tr.Commit() 43 | var err error 44 | tr, err = NewReadQTree(_bs, uuid, bstore.LatestGeneration) 45 | if err != nil { 46 | t.Error(err) 47 | } 48 | 49 | moddat := make([]StatRecord, len(tdat)) 50 | for i, v := range tdat { 51 | moddat[i] = StatRecord{ 52 | Time: v.Time, 53 | Count: 1, 54 | Min: v.Val, 55 | Mean: v.Val, 56 | Max: v.Val, 57 | } 58 | } 59 | expected_qty := 4096 60 | for pwi := uint8(0); pwi < 63; pwi++ { 61 | qrydat, err := tr.QueryStatisticalValuesBlock(-(16 << 56), 48<<56, pwi) 62 | if err != nil { 63 | log.Panic(err) 64 | } 65 | //log.Printf("for pwi %v, we got len %v",pwi, len(qrydat)) 66 | if len(qrydat) != expected_qty { 67 | log.Printf("qdat: %v", qrydat) 68 | log.Printf("expected %v, got %v", expected_qty, len(qrydat)) 69 | t.FailNow() 70 | } 71 | if expected_qty != 1 { 72 | expected_qty >>= 1 73 | } 74 | } 75 | } 76 | func TestQT2_PW(t *testing.T) { 77 | log.Printf("Inserting data 0-4096") 78 | te := int64(4096) 79 | tdat := GenData(0, 4096, 1, 0, func(_ int64) float64 { return rand.Float64() }) 80 | if int64(len(tdat)) != te { 81 | log.Panic("GenDat messed up a bit") 82 | } 83 | tr, uuid := MakeWTree() 84 | err := tr.InsertValues(tdat) 85 | if err != nil { 86 | t.Error(err) 87 | } 88 | tr.Commit() 89 | tr, err = NewReadQTree(_bs, uuid, bstore.LatestGeneration) 90 | if err != nil { 91 | t.Error(err) 92 | } 93 | 94 | moddat := make([]StatRecord, len(tdat)) 95 | for i, v := range tdat { 96 | moddat[i] = StatRecord{ 97 | Time: v.Time, 98 | Count: 1, 99 | Min: v.Val, 100 | Mean: v.Val, 101 | Max: v.Val, 102 | } 103 | } 104 | for pwi := uint8(0); pwi < 12; pwi++ { 105 | qrydat, err := tr.QueryStatisticalValuesBlock(0, te, pwi) 106 | if err != nil { 107 | log.Panic(err) 108 | } 109 | if int64(len(qrydat)) != te>>pwi { 110 | t.Log("len of qrydat mismatch %v vs %v", len(qrydat), te>>pwi) 111 | log.Printf("qry dat %+v", qrydat) 112 | t.FailNow() 113 | } else { 114 | t.Log("LEN MATCH %v", len(qrydat)) 115 | } 116 | min := func(a float64, b float64) float64 { 117 | if a < b { 118 | return a 119 | } 120 | return b 121 | } 122 | max := func(a float64, b float64) float64 { 123 | if a > b { 124 | return a 125 | } 126 | return b 127 | } 128 | moddat2 := make([]StatRecord, len(moddat)/2) 129 | for i := 0; i < len(moddat)/2; i++ { 130 | nmean := moddat[2*i].Mean*float64(moddat[2*i].Count) + 131 | moddat[2*i+1].Mean*float64(moddat[2*i+1].Count) 132 | nmean /= float64(moddat[2*i].Count + moddat[2*i+1].Count) 133 | 134 | moddat2[i] = StatRecord{ 135 | Time: moddat[2*i].Time, 136 | Count: moddat[2*i].Count + moddat[2*i+1].Count, 137 | Min: min(moddat[2*i].Min, moddat[2*i+1].Min), 138 | Mean: nmean, 139 | Max: max(moddat[2*i].Max, moddat[2*i+1].Max), 140 | } 141 | } 142 | } 143 | } 144 | func TestQT2_A(t *testing.T) { 145 | gs := int64(20+rand.Intn(10)) * 365 * DAY 146 | ge := int64(30+rand.Intn(10)) * 365 * DAY 147 | freq := uint64(rand.Intn(10)+1) * HOUR 148 | varn := uint64(30 * MINUTE) 149 | tdat := GenData(gs, ge, freq, varn, 150 | func(_ int64) float64 { return rand.Float64() }) 151 | log.Printf("generated %v records", len(tdat)) 152 | tr, uuid := MakeWTree() 
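//MakeWTree presumably opens a fresh write tree against the shared test
//blockstore. A tree can only be committed once, so the loop below re-opens it
//with LoadWTree for every batch of inserts; each Commit freezes one
//generation. Roughly:
//  tr, id := MakeWTree(); tr.InsertValues(batch); tr.Commit()
//  tr = LoadWTree(id);    tr.InsertValues(next);  tr.Commit()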
153 | log.Printf("geneated tree %v", tr.gen.Uuid().String()) 154 | tr.Commit() 155 | 156 | idx := uint64(0) 157 | brks := GenBrk(100, 50) 158 | loops := GenBrk(4, 4) 159 | for idx < uint64(len(tdat)) { 160 | tr := LoadWTree(uuid) 161 | loop := <-loops 162 | for i := uint64(0); i < loop; i++ { 163 | brk := <-brks 164 | if idx+brk >= uint64(len(tdat)) { 165 | brk = uint64(len(tdat)) - idx 166 | } 167 | if brk == 0 { 168 | continue 169 | } 170 | tr.InsertValues(tdat[idx : idx+brk]) 171 | idx += brk 172 | } 173 | tr.Commit() 174 | } 175 | 176 | rtr, err := NewReadQTree(_bs, uuid, bstore.LatestGeneration) 177 | if err != nil { 178 | log.Panic(err) 179 | } 180 | rval, err := rtr.ReadStandardValuesBlock(gs, ge+int64(2*varn)) 181 | if err != nil { 182 | log.Panic(err) 183 | } 184 | log.Printf("wrote %v, read %v", len(tdat), len(rval)) 185 | CompareData(tdat, rval) 186 | } 187 | 188 | func TestQT2_Superdense(t *testing.T) { 189 | tdat := make([]Record, 10000) 190 | for i := 0; i < 10000; i++ { 191 | tdat[i] = Record{Time: 5, Val: i} 192 | } 193 | tr, uuid := MakeWTree() 194 | log.Printf("geneated tree %v", tr.gen.Uuid().String()) 195 | tr.Commit() 196 | 197 | idx := uint64(0) 198 | brks := GenBrk(100, 50) 199 | loops := GenBrk(4, 4) 200 | for idx < uint64(len(tdat)) { 201 | tr := LoadWTree(uuid) 202 | loop := <-loops 203 | for i := uint64(0); i < loop; i++ { 204 | brk := <-brks 205 | if idx+brk >= uint64(len(tdat)) { 206 | brk = uint64(len(tdat)) - idx 207 | } 208 | if brk == 0 { 209 | continue 210 | } 211 | tr.InsertValues(tdat[idx : idx+brk]) 212 | idx += brk 213 | } 214 | tr.Commit() 215 | } 216 | 217 | rtr, err := NewReadQTree(_bs, uuid, bstore.LatestGeneration) 218 | if err != nil { 219 | log.Panic(err) 220 | } 221 | rval, err := rtr.ReadStandardValuesBlock(gs, ge+int64(2*varn)) 222 | if err != nil { 223 | log.Panic(err) 224 | } 225 | log.Printf("wrote %v, read %v", len(tdat), len(rval)) 226 | CompareData(tdat, rval) 227 | } 228 | 229 | func TestQT2_Nearest(t *testing.T) { 230 | vals := []Record{ 231 | {int64(1 << 56), 1}, 232 | {int64(2 << 56), 2}, 233 | {int64(3 << 56), 3}, 234 | } 235 | tr, uuid := MakeWTree() 236 | err := tr.InsertValues(vals) 237 | if err != nil { 238 | t.Error(err) 239 | } 240 | tr.Commit() 241 | rtr, err := NewReadQTree(_bs, uuid, bstore.LatestGeneration) 242 | if err != nil { 243 | log.Panic(err) 244 | } 245 | tparams := []struct { 246 | time int64 247 | backwards bool 248 | expectOk bool 249 | val float64 250 | }{ 251 | {(2 << 56) + 1, true, true, 2}, 252 | {(2 << 56), true, true, 1}, 253 | {(2 << 56), false, true, 2}, 254 | {(2 << 56) + 1, false, true, 3}, 255 | {0, false, true, 1}, 256 | {4 << 56, true, true, 3}, 257 | {0, true, false, -1}, 258 | {4 << 56, false, false, -1}, 259 | } 260 | for i, v := range tparams { 261 | rv, err := rtr.FindNearestValue(v.time, v.backwards) 262 | if v.expectOk { 263 | if err != nil || rv.Val != v.val { 264 | t.Fatal("subtest [%v] = %+v", i, v) 265 | } 266 | } else { 267 | if err != ErrNoSuchPoint { 268 | t.Fatal("subtest [%v] = %+v", i, v) 269 | } 270 | } 271 | } 272 | } 273 | 274 | func TestQT2_DEL(t *testing.T) { 275 | gs := int64(20+rand.Intn(10)) * 365 * DAY 276 | ge := int64(30+rand.Intn(10)) * 365 * DAY 277 | freq := uint64(rand.Intn(10)+1) * HOUR 278 | varn := uint64(30 * MINUTE) 279 | tdat := GenData(gs, ge, freq, varn, 280 | func(_ int64) float64 { return rand.Float64() }) 281 | log.Printf("generated %v records", len(tdat)) 282 | tr, uuid := MakeWTree() 283 | log.Printf("geneated tree %v", tr.gen.Uuid().String()) 284 | 
tr.Commit() 285 | 286 | idx := uint64(0) 287 | brks := GenBrk(100, 50) 288 | loops := GenBrk(4, 4) 289 | for idx < uint64(len(tdat)) { 290 | tr := LoadWTree(uuid) 291 | loop := <-loops 292 | for i := uint64(0); i < loop; i++ { 293 | brk := <-brks 294 | if idx+brk >= uint64(len(tdat)) { 295 | brk = uint64(len(tdat)) - idx 296 | } 297 | if brk == 0 { 298 | continue 299 | } 300 | tr.InsertValues(tdat[idx : idx+brk]) 301 | idx += brk 302 | } 303 | tr.Commit() 304 | } 305 | 306 | rtr, err := NewReadQTree(_bs, uuid, bstore.LatestGeneration) 307 | if err != nil { 308 | log.Panic(err) 309 | } 310 | rval, err := rtr.ReadStandardValuesBlock(gs, ge+int64(2*varn)) 311 | if err != nil { 312 | log.Panic(err) 313 | } 314 | log.Printf("wrote %v, read %v", len(tdat), len(rval)) 315 | CompareData(tdat, rval) 316 | 317 | dtr, err := NewWriteQTree(_bs, uuid) 318 | dtr.DeleteRange(tdat[1].Time, tdat[len(tdat)-2].Time+1) 319 | dtr.Commit() 320 | { 321 | rtr, err := NewReadQTree(_bs, uuid, bstore.LatestGeneration) 322 | if err != nil { 323 | log.Panic(err) 324 | } 325 | rval, err := rtr.ReadStandardValuesBlock(gs, ge+int64(2*varn)) 326 | if err != nil { 327 | log.Panic(err) 328 | } 329 | 330 | if len(rval) != 2 { 331 | t.Log("Mismatch in expected length") 332 | t.Fail() 333 | } 334 | } 335 | { 336 | rtr, err := NewReadQTree(_bs, uuid, bstore.LatestGeneration) 337 | if err != nil { 338 | log.Panic(err) 339 | } 340 | rch := rtr.GetAllReferencedVAddrs() 341 | refd := make([]uint64, 0, 10) 342 | for v := range rch { 343 | log.Printf("Referenced: 0x%016x", v) 344 | refd = append(refd, v) 345 | } 346 | /* 347 | if len(refd) != 5 { 348 | t.Log("Referencing != 5 nodes (%v)", len(refd)) 349 | t.Fail() 350 | }*/ 351 | } 352 | } 353 | 354 | func TestQT2_CRNG(t *testing.T) { 355 | gs := int64(20+rand.Intn(10)) * 365 * DAY 356 | ge := int64(30+rand.Intn(10)) * 365 * DAY 357 | freq := uint64(rand.Intn(10)+1) * HOUR 358 | varn := uint64(30 * MINUTE) 359 | tdat := GenData(gs, ge, freq, varn, 360 | func(_ int64) float64 { return rand.Float64() }) 361 | log.Printf("generated %v records", len(tdat)) 362 | tr, uuid := MakeWTree() 363 | log.Printf("geneated tree %v", tr.gen.Uuid().String()) 364 | tr.Commit() 365 | 366 | idx := uint64(0) 367 | brks := GenBrk(100, 50) 368 | loops := GenBrk(4, 4) 369 | for idx < uint64(len(tdat)) { 370 | tr := LoadWTree(uuid) 371 | loop := <-loops 372 | for i := uint64(0); i < loop; i++ { 373 | brk := <-brks 374 | if idx+brk >= uint64(len(tdat)) { 375 | brk = uint64(len(tdat)) - idx 376 | } 377 | if brk == 0 { 378 | continue 379 | } 380 | tr.InsertValues(tdat[idx : idx+brk]) 381 | idx += brk 382 | } 383 | tr.Commit() 384 | } 385 | 386 | rtr, err := NewReadQTree(_bs, uuid, bstore.LatestGeneration) 387 | if err != nil { 388 | log.Panic(err) 389 | } 390 | rval, err := rtr.ReadStandardValuesBlock(gs, ge+int64(2*varn)) 391 | if err != nil { 392 | log.Panic(err) 393 | } 394 | initial_gen := rtr.Generation() 395 | log.Printf("wrote %v, read %v", len(tdat), len(rval)) 396 | CompareData(tdat, rval) 397 | 398 | dtr, err := NewWriteQTree(_bs, uuid) 399 | dtr.DeleteRange(tdat[0].Time, tdat[5].Time) 400 | dtr.Commit() 401 | { 402 | rtr, err := NewReadQTree(_bs, uuid, bstore.LatestGeneration) 403 | if err != nil { 404 | log.Panic(err) 405 | } 406 | rval, err := rtr.ReadStandardValuesBlock(gs, ge+int64(2*varn)) 407 | if err != nil { 408 | log.Panic(err) 409 | } 410 | if len(rval) != len(tdat)-5 { 411 | t.Log("Mismatch in expected length %v %v %v", len(rval), len(tdat)-5, len(tdat)) 412 | t.Fail() 413 | } 414 | 
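//The changed-range query below compares the generation recorded before the
//delete (initial_gen) with the tree's current generation and reports the time
//ranges that differ; the second argument is presumably a resolution hint, with
//0 asking for the finest ranges the tree can resolve. The deltas printed
//afterwards show how much wider the reported range is than what was actually
//deleted.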
log.Printf("gen was, gen is: %v / %v", initial_gen, rtr.Generation()) 415 | log.Printf("========== STARTING CHANGED RANGE INVOCATION ==============") 416 | changed_ranges := rtr.FindChangedSinceSlice(initial_gen, 0) 417 | log.Printf("Changed ranges: %+v", changed_ranges) 418 | s, e, ds, de := tdat[0].Time, tdat[5].Time, changed_ranges[0].Start-tdat[0].Time, changed_ranges[0].End-tdat[5].Time 419 | dsm := float64(ds) / (1E9 * 60) 420 | dem := float64(de) / (1E9 * 60) 421 | log.Printf("We deleted from %v to %v \n(delta %v %v) (delta min %.3f %.3f)", s, e, ds, de, dsm, dem) 422 | rtr.root.PrintCounts(0) 423 | } 424 | 425 | { 426 | dtr, err := NewWriteQTree(_bs, uuid) 427 | dtr.InsertValues([]Record{{ge - 1000, 100}}) 428 | dtr.Commit() 429 | rtr, err := NewReadQTree(_bs, uuid, bstore.LatestGeneration) 430 | if err != nil { 431 | log.Panic(err) 432 | } 433 | rval, err := rtr.ReadStandardValuesBlock(gs, ge+int64(2*varn)) 434 | if err != nil { 435 | log.Panic(err) 436 | } 437 | if len(rval) != len(tdat)-4 { 438 | t.Log("Mismatch in expected length %v %v %v", len(rval), len(tdat)-5, len(tdat)) 439 | t.Fail() 440 | } 441 | log.Printf("gen was, gen is: %v / %v", initial_gen, rtr.Generation()) 442 | log.Printf("========== STARTING CHANGED RANGE INVOCATION ==============") 443 | changed_ranges := rtr.FindChangedSinceSlice(initial_gen, 0) 444 | log.Printf("Changed ranges: %+v", changed_ranges) 445 | s, e, ds, de := tdat[0].Time, tdat[5].Time, changed_ranges[0].Start-tdat[0].Time, changed_ranges[0].End-tdat[5].Time 446 | dsm := float64(ds) / (1E9 * 60) 447 | dem := float64(de) / (1E9 * 60) 448 | log.Printf("We deleted from %v to %v \n(delta %v %v) (delta min %.3f %.3f)", s, e, ds, de, dsm, dem) 449 | rtr.root.PrintCounts(0) 450 | } 451 | } 452 | -------------------------------------------------------------------------------- /cpinterface/cpinterface.go: -------------------------------------------------------------------------------- 1 | package cpinterface 2 | 3 | import ( 4 | "net" 5 | "os" 6 | "os/signal" 7 | "sync" 8 | 9 | "github.com/pborman/uuid" 10 | "github.com/SoftwareDefinedBuildings/btrdb" 11 | "github.com/SoftwareDefinedBuildings/btrdb/qtree" 12 | capn "github.com/glycerine/go-capnproto" 13 | "github.com/op/go-logging" 14 | ) 15 | 16 | var log *logging.Logger 17 | 18 | func init() { 19 | log = logging.MustGetLogger("log") 20 | } 21 | 22 | type CPInterface struct { 23 | isShuttingDown bool 24 | } 25 | 26 | func ServeCPNP(q *btrdb.Quasar, ntype string, laddr string) *CPInterface { 27 | rv := &CPInterface{} 28 | go func() { 29 | sigchan := make(chan os.Signal, 1) 30 | signal.Notify(sigchan, os.Interrupt) 31 | _ = <-sigchan 32 | rv.isShuttingDown = true 33 | }() 34 | l, err := net.Listen(ntype, laddr) 35 | if err != nil { 36 | log.Panic(err) 37 | } 38 | defer l.Close() 39 | for !rv.isShuttingDown { 40 | conn, err := l.Accept() 41 | if err != nil { 42 | log.Panic(err) 43 | } 44 | go func(c net.Conn) { 45 | rv.dispatchCommands(q, c) 46 | }(conn) 47 | } 48 | return rv 49 | } 50 | 51 | func (c *CPInterface) Shutdown() { 52 | c.isShuttingDown = true 53 | } 54 | 55 | func (c *CPInterface) dispatchCommands(q *btrdb.Quasar, conn net.Conn) { 56 | //This governs the stream 57 | rmtx := sync.Mutex{} 58 | wmtx := sync.Mutex{} 59 | log.Info("cpnp connection") 60 | for !c.isShuttingDown { 61 | rmtx.Lock() 62 | seg, err := capn.ReadFromStream(conn, nil) 63 | if err != nil { 64 | log.Warning("ERR (%v) :: %v", conn.RemoteAddr(), err) 65 | conn.Close() 66 | break 67 | } 68 | rmtx.Unlock() 69 | go func() { 70 | 
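//Each request is handled in its own goroutine: frames are read off the
//connection under rmtx, responses are serialized under wmtx, and every
//response echoes the request's EchoTag so the client can match replies to
//outstanding requests. seg is shadowed below so this goroutine keeps its own
//reference to the segment it was started for.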
seg := seg 71 | req := ReadRootRequest(seg) 72 | mkresp := func() (Response, *capn.Segment) { 73 | rvseg := capn.NewBuffer(nil) 74 | resp := NewRootResponse(rvseg) 75 | resp.SetEchoTag(req.EchoTag()) 76 | return resp, rvseg 77 | } 78 | sendresp := func(seg *capn.Segment) { 79 | wmtx.Lock() 80 | seg.WriteTo(conn) 81 | wmtx.Unlock() 82 | } 83 | switch req.Which() { 84 | case REQUEST_QUERYSTANDARDVALUES: 85 | //log.Info("QSV\n") 86 | st := req.QueryStandardValues().StartTime() 87 | et := req.QueryStandardValues().EndTime() 88 | uuid := uuid.UUID(req.QueryStandardValues().Uuid()) 89 | ver := req.QueryStandardValues().Version() 90 | //log.Info("[REQ=QsV] st=%v, et=%v, uuid=%v, gen=%v", st, et, uuid, ver) 91 | if ver == 0 { 92 | ver = btrdb.LatestGeneration 93 | } 94 | recordc, errorc, gen := q.QueryValuesStream(uuid, st, et, ver) 95 | if recordc == nil { 96 | log.Warning("RESPONDING ERR: %v", err) 97 | resp, rvseg := mkresp() 98 | resp.SetStatusCode(STATUSCODE_INTERNALERROR) 99 | resp.SetFinal(true) 100 | sendresp(rvseg) 101 | return 102 | } else { 103 | bufarr := make([]qtree.Record, 0, 4096) 104 | for { 105 | resp, rvseg := mkresp() 106 | fail := false 107 | fin := false 108 | for { 109 | select { 110 | case _, ok := <-errorc: 111 | if ok { 112 | fin = true 113 | fail = true 114 | goto donestandard 115 | } 116 | case r, ok := <-recordc: 117 | if !ok { 118 | fin = true 119 | goto donestandard 120 | } 121 | bufarr = append(bufarr, r) 122 | if len(bufarr) == cap(bufarr) { 123 | goto donestandard 124 | } 125 | } 126 | } 127 | donestandard: 128 | if fail { 129 | resp.SetStatusCode(STATUSCODE_INTERNALERROR) 130 | resp.SetFinal(true) 131 | //consume channels 132 | go func() { 133 | for _ = range recordc { 134 | } 135 | }() 136 | go func() { 137 | for _ = range errorc { 138 | } 139 | }() 140 | sendresp(rvseg) 141 | return 142 | } 143 | records := NewRecords(rvseg) 144 | rl := NewRecordList(rvseg, len(bufarr)) 145 | rla := rl.ToArray() 146 | for i, v := range bufarr { 147 | rla[i].SetTime(v.Time) 148 | rla[i].SetValue(v.Val) 149 | } 150 | records.SetVersion(gen) 151 | records.SetValues(rl) 152 | resp.SetRecords(records) 153 | resp.SetStatusCode(STATUSCODE_OK) 154 | if fin { 155 | resp.SetFinal(true) 156 | } 157 | sendresp(rvseg) 158 | bufarr = bufarr[:0] 159 | if fin { 160 | return 161 | } 162 | } 163 | } 164 | case REQUEST_QUERYWINDOWVALUES: 165 | st := req.QueryWindowValues().StartTime() 166 | et := req.QueryWindowValues().EndTime() 167 | id := uuid.UUID(req.QueryWindowValues().Uuid()) 168 | width := req.QueryWindowValues().Width() 169 | ver := req.QueryWindowValues().Version() 170 | depth := req.QueryWindowValues().Depth() 171 | if ver == 0 { 172 | ver = btrdb.LatestGeneration 173 | } 174 | recordc, gen := q.QueryWindow(id, st, et, ver, width, depth) 175 | if recordc == nil { 176 | log.Warning("RESPONDING ERR: %v", err) 177 | resp, rvseg := mkresp() 178 | resp.SetStatusCode(STATUSCODE_INTERNALERROR) 179 | resp.SetFinal(true) 180 | sendresp(rvseg) 181 | return 182 | } else { 183 | bufarr := make([]qtree.StatRecord, 0, 4096) 184 | for { 185 | resp, rvseg := mkresp() 186 | fail := false 187 | fin := false 188 | for { 189 | select { 190 | case r, ok := <-recordc: 191 | if !ok { 192 | fin = true 193 | goto donewindow 194 | } 195 | bufarr = append(bufarr, r) 196 | if len(bufarr) == cap(bufarr) { 197 | goto donewindow 198 | } 199 | } 200 | } 201 | donewindow: 202 | if fail { 203 | resp.SetStatusCode(STATUSCODE_INTERNALERROR) 204 | resp.SetFinal(true) 205 | //consume channels 206 | go func() { 207 | for _ 
= range recordc { 208 | } 209 | }() 210 | sendresp(rvseg) 211 | return 212 | } 213 | records := NewStatisticalRecords(rvseg) 214 | rl := NewStatisticalRecordList(rvseg, len(bufarr)) 215 | rla := rl.ToArray() 216 | for i, v := range bufarr { 217 | rla[i].SetTime(v.Time) 218 | rla[i].SetCount(v.Count) 219 | rla[i].SetMin(v.Min) 220 | rla[i].SetMean(v.Mean) 221 | rla[i].SetMax(v.Max) 222 | } 223 | records.SetVersion(gen) 224 | records.SetValues(rl) 225 | resp.SetStatisticalRecords(records) 226 | resp.SetStatusCode(STATUSCODE_OK) 227 | if fin { 228 | resp.SetFinal(true) 229 | } 230 | sendresp(rvseg) 231 | bufarr = bufarr[:0] 232 | if fin { 233 | return 234 | } 235 | } 236 | } 237 | case REQUEST_QUERYSTATISTICALVALUES: 238 | st := req.QueryStatisticalValues().StartTime() 239 | et := req.QueryStatisticalValues().EndTime() 240 | uuid := uuid.UUID(req.QueryStatisticalValues().Uuid()) 241 | pw := req.QueryStatisticalValues().PointWidth() 242 | ver := req.QueryStatisticalValues().Version() 243 | if ver == 0 { 244 | ver = btrdb.LatestGeneration 245 | } 246 | recordc, errorc, gen := q.QueryStatisticalValuesStream(uuid, st, et, ver, pw) 247 | if recordc == nil { 248 | log.Warning("RESPONDING ERR: %v", err) 249 | resp, rvseg := mkresp() 250 | resp.SetStatusCode(STATUSCODE_INTERNALERROR) 251 | resp.SetFinal(true) 252 | sendresp(rvseg) 253 | return 254 | } else { 255 | bufarr := make([]qtree.StatRecord, 0, 4096) 256 | for { 257 | resp, rvseg := mkresp() 258 | fail := false 259 | fin := false 260 | for { 261 | select { 262 | case _, ok := <-errorc: 263 | if ok { 264 | fin = true 265 | fail = true 266 | goto donestat 267 | } 268 | case r, ok := <-recordc: 269 | if !ok { 270 | fin = true 271 | goto donestat 272 | } 273 | bufarr = append(bufarr, r) 274 | if len(bufarr) == cap(bufarr) { 275 | goto donestat 276 | } 277 | } 278 | } 279 | donestat: 280 | if fail { 281 | resp.SetStatusCode(STATUSCODE_INTERNALERROR) 282 | resp.SetFinal(true) 283 | //consume channels 284 | go func() { 285 | for _ = range recordc { 286 | } 287 | }() 288 | go func() { 289 | for _ = range errorc { 290 | } 291 | }() 292 | sendresp(rvseg) 293 | return 294 | } 295 | records := NewStatisticalRecords(rvseg) 296 | rl := NewStatisticalRecordList(rvseg, len(bufarr)) 297 | rla := rl.ToArray() 298 | for i, v := range bufarr { 299 | rla[i].SetTime(v.Time) 300 | rla[i].SetCount(v.Count) 301 | rla[i].SetMin(v.Min) 302 | rla[i].SetMean(v.Mean) 303 | rla[i].SetMax(v.Max) 304 | } 305 | records.SetVersion(gen) 306 | records.SetValues(rl) 307 | resp.SetStatisticalRecords(records) 308 | resp.SetStatusCode(STATUSCODE_OK) 309 | if fin { 310 | resp.SetFinal(true) 311 | } 312 | sendresp(rvseg) 313 | bufarr = bufarr[:0] 314 | if fin { 315 | return 316 | } 317 | } 318 | } 319 | case REQUEST_QUERYVERSION: 320 | //ul := req. 321 | ul := req.QueryVersion().Uuids() 322 | ull := ul.ToArray() 323 | resp, rvseg := mkresp() 324 | rvers := NewVersions(rvseg) 325 | vlist := rvseg.NewUInt64List(len(ull)) 326 | ulist := rvseg.NewDataList(len(ull)) 327 | for i, v := range ull { 328 | ver, err := q.QueryGeneration(uuid.UUID(v)) 329 | if err != nil { 330 | resp.SetStatusCode(STATUSCODE_INTERNALERROR) 331 | resp.SetFinal(true) 332 | sendresp(rvseg) 333 | return 334 | } 335 | //I'm not sure that the array that sits behind the uuid slice will stick around 336 | //so I'm copying it. 
337 | uuid := make([]byte, 16) 338 | copy(uuid, v) 339 | vlist.Set(i, ver) 340 | ulist.Set(i, uuid) 341 | } 342 | resp.SetStatusCode(STATUSCODE_OK) 343 | rvers.SetUuids(ulist) 344 | rvers.SetVersions(vlist) 345 | resp.SetVersionList(rvers) 346 | resp.SetFinal(true) 347 | sendresp(rvseg) 348 | case REQUEST_QUERYNEARESTVALUE: 349 | resp, rvseg := mkresp() 350 | t := req.QueryNearestValue().Time() 351 | id := uuid.UUID(req.QueryNearestValue().Uuid()) 352 | ver := req.QueryNearestValue().Version() 353 | if ver == 0 { 354 | ver = btrdb.LatestGeneration 355 | } 356 | back := req.QueryNearestValue().Backward() 357 | rv, gen, err := q.QueryNearestValue(id, t, back, ver) 358 | switch err { 359 | case nil: 360 | resp.SetStatusCode(STATUSCODE_OK) 361 | records := NewRecords(rvseg) 362 | rl := NewRecordList(rvseg, 1) 363 | rla := rl.ToArray() 364 | rla[0].SetTime(rv.Time) 365 | rla[0].SetValue(rv.Val) 366 | records.SetVersion(gen) 367 | records.SetValues(rl) 368 | resp.SetRecords(records) 369 | case qtree.ErrNoSuchPoint: 370 | resp.SetStatusCode(STATUSCODE_NOSUCHPOINT) 371 | default: 372 | resp.SetStatusCode(STATUSCODE_INTERNALERROR) 373 | } 374 | resp.SetFinal(true) 375 | sendresp(rvseg) 376 | case REQUEST_QUERYCHANGEDRANGES: 377 | resp, rvseg := mkresp() 378 | id := uuid.UUID(req.QueryChangedRanges().Uuid()) 379 | sgen := req.QueryChangedRanges().FromGeneration() 380 | egen := req.QueryChangedRanges().ToGeneration() 381 | if egen == 0 { 382 | egen = btrdb.LatestGeneration 383 | } 384 | resolution := req.QueryChangedRanges().Resolution() 385 | rv, ver, err := q.QueryChangedRanges(id, sgen, egen, resolution) 386 | switch err { 387 | case nil: 388 | resp.SetStatusCode(STATUSCODE_OK) 389 | ranges := NewRanges(rvseg) 390 | ranges.SetVersion(ver) 391 | crl := NewChangedRangeList(rvseg, len(rv)) 392 | crla := crl.ToArray() 393 | for i := 0; i < len(rv); i++ { 394 | crla[i].SetStartTime(rv[i].Start) 395 | crla[i].SetEndTime(rv[i].End) 396 | } 397 | ranges.SetValues(crl) 398 | resp.SetChangedRngList(ranges) 399 | default: 400 | log.Critical("qcr error: ", err) 401 | resp.SetStatusCode(STATUSCODE_INTERNALERROR) 402 | } 403 | resp.SetFinal(true) 404 | sendresp(rvseg) 405 | 406 | case REQUEST_INSERTVALUES: 407 | resp, rvseg := mkresp() 408 | uuid := uuid.UUID(req.InsertValues().Uuid()) 409 | rl := req.InsertValues().Values() 410 | rla := rl.ToArray() 411 | if len(rla) != 0 { 412 | qtr := make([]qtree.Record, len(rla)) 413 | for i, v := range rla { 414 | qtr[i] = qtree.Record{Time: v.Time(), Val: v.Value()} 415 | } 416 | q.InsertValues(uuid, qtr) 417 | } 418 | if req.InsertValues().Sync() { 419 | q.Flush(uuid) 420 | } 421 | resp.SetFinal(true) 422 | resp.SetStatusCode(STATUSCODE_OK) 423 | sendresp(rvseg) 424 | case REQUEST_DELETEVALUES: 425 | resp, rvseg := mkresp() 426 | id := uuid.UUID(req.DeleteValues().Uuid()) 427 | stime := req.DeleteValues().StartTime() 428 | etime := req.DeleteValues().EndTime() 429 | err := q.DeleteRange(id, stime, etime) 430 | switch err { 431 | case nil: 432 | resp.SetStatusCode(STATUSCODE_OK) 433 | default: 434 | resp.SetStatusCode(STATUSCODE_INTERNALERROR) 435 | } 436 | resp.SetFinal(true) 437 | sendresp(rvseg) 438 | default: 439 | log.Critical("weird segment") 440 | } 441 | }() 442 | } 443 | } 444 | 445 | /* 446 | func EncodeMsg() *bytes.Buffer { 447 | rv := bytes.Buffer{} 448 | seg := capn.NewBuffer(nil) 449 | cmd := NewRootRequest(seg) 450 | 451 | qsv := NewCmdQueryStandardValues(seg) 452 | cmd.SetEchoTag(500) 453 | qsv.SetStartTime(0x5a5a) 454 | qsv.SetEndTime(0xf7f7) 455 | 
cmd.SetQueryStandardValues(qsv) 456 | seg.WriteTo(&rv) 457 | return &rv 458 | } 459 | 460 | func DecodeMsg(b *bytes.Buffer) { 461 | seg, err := capn.ReadFromStream(b, nil) 462 | if err != nil { 463 | log.Panic(err) 464 | } 465 | cmd := ReadRootRequest(seg) 466 | switch cmd.Which() { 467 | case REQUEST_QUERYSTANDARDVALUES: 468 | ca := cmd.QueryStandardValues() 469 | default: 470 | log.Critical("wtf") 471 | } 472 | } 473 | */ 474 | -------------------------------------------------------------------------------- /internal/cephprovider/cephprovider.go: -------------------------------------------------------------------------------- 1 | package cephprovider 2 | 3 | // #cgo LDFLAGS: -lrados 4 | // #include "cephprovider.h" 5 | // #include 6 | import "C" 7 | 8 | import ( 9 | "strconv" 10 | "sync" 11 | "unsafe" 12 | 13 | "github.com/SoftwareDefinedBuildings/btrdb/internal/bprovider" 14 | "github.com/op/go-logging" 15 | ) 16 | 17 | var log *logging.Logger 18 | 19 | func init() { 20 | log = logging.MustGetLogger("log") 21 | } 22 | 23 | const NUM_RHANDLES = 200 24 | 25 | //We know we won't get any addresses here, because this is the relocation base as well 26 | const METADATA_BASE = 0xFF00000000000000 27 | 28 | //4096 blocks per addr lock 29 | const ADDR_LOCK_SIZE = 0x1000000000 30 | const ADDR_OBJ_SIZE = 0x0001000000 31 | 32 | //Just over the DBSIZE 33 | const MAX_EXPECTED_OBJECT_SIZE = 20485 34 | 35 | //The number of RADOS blocks to cache (up to 16MB each, probably only 1.6MB each) 36 | const RADOS_CACHE_SIZE = NUM_RHANDLES * 2 37 | 38 | const OFFSET_MASK = 0xFFFFFF 39 | const R_CHUNKSIZE = 1 << 20 40 | 41 | //This is how many uuid/address pairs we will keep to facilitate appending to segments 42 | //instead of creating new ones. 43 | const WORTH_CACHING = OFFSET_MASK - MAX_EXPECTED_OBJECT_SIZE 44 | const SEGCACHE_SIZE = 1024 45 | 46 | // 1MB for write cache, I doubt we will ever hit this tbh 47 | const WCACHE_SIZE = 1 << 20 48 | 49 | func UUIDSliceToArr(id []byte) [16]byte { 50 | rv := [16]byte{} 51 | copy(rv[:], id) 52 | return rv 53 | } 54 | 55 | type CephSegment struct { 56 | h C.phandle_t 57 | sp *CephStorageProvider 58 | ptr uint64 59 | naddr uint64 60 | base uint64 //Not the same as the provider's base 61 | warrs [][]byte 62 | uid [16]byte 63 | wcache []byte 64 | wcache_base uint64 65 | } 66 | 67 | type chunkreqindex struct { 68 | UUID [16]byte 69 | Addr uint64 70 | } 71 | 72 | type CephStorageProvider struct { 73 | rh []C.phandle_t 74 | rhidx chan int 75 | rhidx_ret chan int 76 | rh_avail []bool 77 | ptr uint64 78 | alloc chan uint64 79 | segaddrcache map[[16]byte]uint64 80 | segcachelock sync.Mutex 81 | 82 | chunklock sync.Mutex 83 | chunkgate map[chunkreqindex][]chan []byte 84 | 85 | rcache *CephCache 86 | } 87 | 88 | //Returns the address of the first free word in the segment when it was locked 89 | func (seg *CephSegment) BaseAddress() uint64 { 90 | return seg.base 91 | } 92 | 93 | //Unlocks the segment for the StorageProvider to give to other consumers 94 | //Implies a flush 95 | func (seg *CephSegment) Unlock() { 96 | seg.flushWrite() 97 | _, err := C.handle_close(seg.h) 98 | if err != nil { 99 | log.Panic("CGO ERROR: %v", err) 100 | } 101 | seg.warrs = nil 102 | if (seg.naddr & OFFSET_MASK) < WORTH_CACHING { 103 | seg.sp.segcachelock.Lock() 104 | seg.sp.pruneSegCache() 105 | seg.sp.segaddrcache[seg.uid] = seg.naddr 106 | seg.sp.segcachelock.Unlock() 107 | } 108 | 109 | } 110 | 111 | func (seg *CephSegment) flushWrite() { 112 | if len(seg.wcache) == 0 { 113 | return 114 | } 115 | 
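//The whole write cache is handed to librados in a single cgo call at the
//cached base address, and any read-cache chunks overlapping the written range
//are invalidated immediately below so later reads do not see stale data.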
C.handle_write(seg.h, (*C.uint8_t)(unsafe.Pointer(&seg.uid[0])), C.uint64_t(seg.wcache_base), 116 | (*C.char)(unsafe.Pointer(&seg.wcache[0])), C.int(len(seg.wcache)), 0) 117 | 118 | for i := 0; i < len(seg.wcache); i += R_CHUNKSIZE { 119 | seg.sp.rcache.cacheInvalidate((uint64(i) + seg.wcache_base) & R_ADDRMASK) 120 | } 121 | //The C code does not finish immediately, so we need to keep a reference to the old 122 | //wcache array until the segment is unlocked 123 | seg.warrs = append(seg.warrs, seg.wcache) 124 | seg.wcache = make([]byte, 0, WCACHE_SIZE) 125 | seg.wcache_base = seg.naddr 126 | 127 | } 128 | 129 | //Writes a slice to the segment, returns immediately 130 | //Returns nil if op is OK, otherwise ErrNoSpace or ErrInvalidArgument 131 | //It is up to the implementer to work out how to report no space immediately 132 | //The uint64 is the address to be used for the next write 133 | func (seg *CephSegment) Write(uuid []byte, address uint64, data []byte) (uint64, error) { 134 | //We don't put written blocks into the cache, because those will be 135 | //in the dblock cache much higher up. 136 | if address != seg.naddr { 137 | log.Panic("Non-sequential write") 138 | } 139 | 140 | if len(seg.wcache)+len(data)+2 > cap(seg.wcache) { 141 | seg.flushWrite() 142 | } 143 | 144 | base := len(seg.wcache) 145 | seg.wcache = seg.wcache[:base+2] 146 | seg.wcache[base] = byte(len(data)) 147 | seg.wcache[base+1] = byte(len(data) >> 8) 148 | seg.wcache = append(seg.wcache, data...) 149 | 150 | naddr := address + uint64(len(data)+2) 151 | 152 | //OLD NOTE: 153 | //Note that it is ok for an object to "go past the end of the allocation". Naddr could be one byte before 154 | //the end of the allocation for example. This is not a problem as we never address anything except the 155 | //start of an object. This is why we do not add the object max size here 156 | //NEW NOTE: 157 | //We cannot go past the end of the allocation anymore because it would break the read cache 158 | if ((naddr + MAX_EXPECTED_OBJECT_SIZE + 2) >> 24) != (address >> 24) { 159 | //We are gonna need a new object addr 160 | naddr = <-seg.sp.alloc 161 | seg.naddr = naddr 162 | seg.flushWrite() 163 | return naddr, nil 164 | } 165 | seg.naddr = naddr 166 | 167 | return naddr, nil 168 | } 169 | 170 | //Block until all writes are complete. Note this does not imply a flush of the underlying files. 171 | func (seg *CephSegment) Flush() { 172 | //Not sure we need to do stuff here, we can do it in unlock 173 | } 174 | 175 | //Must be called with the cache lock held 176 | func (sp *CephStorageProvider) pruneSegCache() { 177 | //This is extremely rare, so its best to handle it simply 178 | //If we drop the cache, we will get one shortsized object per stream, 179 | //and it won't necessarily be _very_ short. 
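//segaddrcache maps a stream uuid to the next free address inside a partially
//filled object (see Unlock/LockSegment), which lets the next writer for that
//stream append instead of taking a brand new allocation. Rather than evicting
//entry by entry, the map is simply replaced once it reaches SEGCACHE_SIZE.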
180 | if len(sp.segaddrcache) >= SEGCACHE_SIZE { 181 | sp.segaddrcache = make(map[[16]byte]uint64, SEGCACHE_SIZE) 182 | } 183 | } 184 | 185 | func (sp *CephStorageProvider) provideReadHandles() { 186 | for { 187 | //Read all returned read handles 188 | ldretfi: 189 | for { 190 | select { 191 | case fi := <-sp.rhidx_ret: 192 | sp.rh_avail[fi] = true 193 | default: 194 | break ldretfi 195 | } 196 | } 197 | 198 | found := false 199 | for i := 0; i < NUM_RHANDLES; i++ { 200 | if sp.rh_avail[i] { 201 | sp.rhidx <- i 202 | sp.rh_avail[i] = false 203 | found = true 204 | } 205 | } 206 | //If we didn't find one, do a blocking read 207 | if !found { 208 | idx := <-sp.rhidx_ret 209 | sp.rh_avail[idx] = true 210 | } 211 | } 212 | } 213 | 214 | func (sp *CephStorageProvider) provideAllocs() { 215 | base := sp.ptr 216 | for { 217 | sp.alloc <- sp.ptr 218 | sp.ptr += ADDR_OBJ_SIZE 219 | if sp.ptr >= base+ADDR_LOCK_SIZE { 220 | sp.ptr = sp.obtainBaseAddress() 221 | base = sp.ptr 222 | } 223 | } 224 | } 225 | 226 | func (sp *CephStorageProvider) obtainBaseAddress() uint64 { 227 | h, err := C.handle_create() 228 | if err != nil { 229 | log.Panic("CGO ERROR: %v", err) 230 | } 231 | addr, err := C.handle_obtainrange(h) 232 | if err != nil { 233 | log.Panic("CGO ERROR: %v", err) 234 | } 235 | return uint64(addr) 236 | } 237 | 238 | //Called at startup of a normal run 239 | func (sp *CephStorageProvider) Initialize(opts map[string]string) { 240 | //Allocate caches 241 | sp.rcache = &CephCache{} 242 | cachesz, _ := strconv.Atoi(opts["cephrcache"]) 243 | if cachesz < 40 { 244 | cachesz = 40 //one per read handle: 40MB 245 | } 246 | sp.rcache.initCache(uint64(cachesz)) 247 | 248 | cephconf := C.CString(opts["cephconf"]) 249 | cephpool := C.CString(opts["cephpool"]) 250 | _, err := C.initialize_provider(cephconf, cephpool) 251 | if err != nil { 252 | log.Panic("CGO ERROR: %v", err) 253 | } 254 | C.free(unsafe.Pointer(cephconf)) 255 | C.free(unsafe.Pointer(cephpool)) 256 | 257 | sp.rh = make([]C.phandle_t, NUM_RHANDLES) 258 | sp.rh_avail = make([]bool, NUM_RHANDLES) 259 | sp.rhidx = make(chan int, NUM_RHANDLES+1) 260 | sp.rhidx_ret = make(chan int, NUM_RHANDLES+1) 261 | sp.alloc = make(chan uint64, 128) 262 | sp.segaddrcache = make(map[[16]byte]uint64, SEGCACHE_SIZE) 263 | sp.chunkgate = make(map[chunkreqindex][]chan []byte) 264 | 265 | for i := 0; i < NUM_RHANDLES; i++ { 266 | sp.rh_avail[i] = true 267 | h, err := C.handle_create() 268 | if err != nil { 269 | log.Panic("CGO ERROR: %v", err) 270 | } 271 | sp.rh[i] = h 272 | } 273 | 274 | //Obtain base address 275 | sp.ptr = sp.obtainBaseAddress() 276 | if sp.ptr == 0 { 277 | log.Panic("Could not read allocator! 
DB not created properly?") 278 | } 279 | log.Info("Base address obtained as 0x%016x", sp.ptr) 280 | 281 | //Start serving read handles 282 | go sp.provideReadHandles() 283 | 284 | //Start providing address allocations 285 | go sp.provideAllocs() 286 | 287 | } 288 | 289 | //Called to create the database for the first time 290 | func (sp *CephStorageProvider) CreateDatabase(opts map[string]string) error { 291 | cephconf := C.CString(opts["cephconf"]) 292 | cephpool := C.CString(opts["cephpool"]) 293 | _, err := C.initialize_provider(cephconf, cephpool) 294 | if err != nil { 295 | log.Panic("CGO ERROR: %v", err) 296 | } 297 | C.free(unsafe.Pointer(cephconf)) 298 | C.free(unsafe.Pointer(cephpool)) 299 | h, err := C.handle_create() 300 | if err != nil { 301 | log.Panic("CGO ERROR: %v", err) 302 | } 303 | C.handle_init_allocator(h) 304 | _, err = C.handle_close(h) 305 | if err != nil { 306 | log.Panic("CGO ERROR: %v", err) 307 | } 308 | return nil 309 | } 310 | 311 | // Lock a segment, or block until a segment can be locked 312 | // Returns a Segment struct 313 | // Implicit unchecked assumption: you cannot lock more than one segment 314 | // for a given uuid (without unlocking them in between). It will break 315 | // segcache 316 | func (sp *CephStorageProvider) LockSegment(uuid []byte) bprovider.Segment { 317 | rv := new(CephSegment) 318 | rv.sp = sp 319 | h, err := C.handle_create() 320 | if err != nil { 321 | log.Panic("CGO ERROR: %v", err) 322 | } 323 | rv.h = h 324 | rv.ptr = <-sp.alloc 325 | rv.uid = UUIDSliceToArr(uuid) 326 | rv.wcache = make([]byte, 0, WCACHE_SIZE) 327 | sp.segcachelock.Lock() 328 | cached_ptr, ok := sp.segaddrcache[rv.uid] 329 | if ok { 330 | delete(sp.segaddrcache, rv.uid) 331 | } 332 | sp.segcachelock.Unlock() 333 | //ok = false 334 | if ok { 335 | rv.base = cached_ptr 336 | rv.naddr = rv.base 337 | } else { 338 | rv.base = rv.ptr 339 | rv.naddr = rv.base 340 | } 341 | rv.wcache_base = rv.naddr 342 | //Although I don't know this for sure, I am concerned that when we pass the write array pointer to C 343 | //the Go GC may free it before C is done. 
I prevent this by pinning all the written arrays, which get 344 | //deref'd after the segment is unlocked 345 | rv.warrs = make([][]byte, 0, 64) 346 | return rv 347 | } 348 | 349 | func (sp *CephStorageProvider) rawObtainChunk(uuid []byte, address uint64) []byte { 350 | chunk := sp.rcache.cacheGet(address) 351 | if chunk == nil { 352 | chunk = sp.rcache.getBlank() 353 | rhidx := <-sp.rhidx 354 | rc, err := C.handle_read(sp.rh[rhidx], (*C.uint8_t)(unsafe.Pointer(&uuid[0])), C.uint64_t(address), (*C.char)(unsafe.Pointer(&chunk[0])), R_CHUNKSIZE) 355 | if err != nil { 356 | log.Panic("CGO ERROR: %v", err) 357 | } 358 | chunk = chunk[0:rc] 359 | sp.rhidx_ret <- rhidx 360 | sp.rcache.cachePut(address, chunk) 361 | } 362 | return chunk 363 | } 364 | 365 | func (sp *CephStorageProvider) obtainChunk(uuid []byte, address uint64) []byte { 366 | chunk := sp.rcache.cacheGet(address) 367 | if chunk != nil { 368 | return chunk 369 | } 370 | index := chunkreqindex{UUID: UUIDSliceToArr(uuid), Addr: address} 371 | rvc := make(chan []byte, 1) 372 | sp.chunklock.Lock() 373 | slc, ok := sp.chunkgate[index] 374 | if ok { 375 | sp.chunkgate[index] = append(slc, rvc) 376 | sp.chunklock.Unlock() 377 | } else { 378 | sp.chunkgate[index] = []chan []byte{rvc} 379 | sp.chunklock.Unlock() 380 | go func() { 381 | bslice := sp.rawObtainChunk(uuid, address) 382 | sp.chunklock.Lock() 383 | slc, ok := sp.chunkgate[index] 384 | if !ok { 385 | panic("inconsistency!!") 386 | } 387 | for _, chn := range slc { 388 | chn <- bslice 389 | } 390 | delete(sp.chunkgate, index) 391 | sp.chunklock.Unlock() 392 | }() 393 | } 394 | rv := <-rvc 395 | return rv 396 | } 397 | 398 | // Read the blob into the given buffer: direct read 399 | /* 400 | func (sp *CephStorageProvider) Read(uuid []byte, address uint64, buffer []byte) []byte { 401 | 402 | //Get a read handle 403 | rhidx := <-sp.rhidx 404 | if len(buffer) < MAX_EXPECTED_OBJECT_SIZE { 405 | log.Panic("That doesn't seem safe") 406 | } 407 | rc, err := C.handle_read(sp.rh[rhidx], (*C.uint8_t)(unsafe.Pointer(&uuid[0])), C.uint64_t(address), (*C.char)(unsafe.Pointer(&buffer[0])), MAX_EXPECTED_OBJECT_SIZE) 408 | if err != nil { 409 | log.Panic("CGO ERROR: %v", err) 410 | } 411 | sp.rhidx_ret <- rhidx 412 | ln := int(buffer[0]) + (int(buffer[1]) << 8) 413 | if int(rc) < ln+2 { 414 | //TODO this can happen, it is better to just go back a few superblocks 415 | log.Panic("Short read") 416 | } 417 | return buffer[2 : ln+2] 418 | }*/ 419 | 420 | // Read the blob into the given buffer 421 | func (sp *CephStorageProvider) Read(uuid []byte, address uint64, buffer []byte) []byte { 422 | //Get the first chunk for this object: 423 | chunk1 := sp.obtainChunk(uuid, address&R_ADDRMASK)[address&R_OFFSETMASK:] 424 | var chunk2 []byte 425 | var ln int 426 | 427 | if len(chunk1) < 2 { 428 | //not even long enough for the prefix, must be one byte in the first chunk, one in teh second 429 | chunk2 = sp.obtainChunk(uuid, (address+R_CHUNKSIZE)&R_ADDRMASK) 430 | ln = int(chunk1[0]) + (int(chunk2[0]) << 8) 431 | chunk2 = chunk2[1:] 432 | chunk1 = chunk1[1:] 433 | } else { 434 | ln = int(chunk1[0]) + (int(chunk1[1]) << 8) 435 | chunk1 = chunk1[2:] 436 | } 437 | 438 | if (ln) > MAX_EXPECTED_OBJECT_SIZE { 439 | log.Panic("WTUF: ", ln) 440 | } 441 | 442 | copied := 0 443 | if len(chunk1) > 0 { 444 | //We need some bytes from chunk1 445 | end := ln 446 | if len(chunk1) < ln { 447 | end = len(chunk1) 448 | } 449 | copied = copy(buffer, chunk1[:end]) 450 | } 451 | if copied < ln { 452 | //We need some bytes from chunk2 
453 | if chunk2 == nil { 454 | chunk2 = sp.obtainChunk(uuid, (address+R_CHUNKSIZE)&R_ADDRMASK) 455 | } 456 | copy(buffer[copied:], chunk2[:ln-copied]) 457 | 458 | } 459 | if ln < 2 { 460 | log.Panic("This is unexpected") 461 | } 462 | return buffer[:ln] 463 | 464 | } 465 | -------------------------------------------------------------------------------- /internal/bstore/blocktypes.go: -------------------------------------------------------------------------------- 1 | package bstore 2 | 3 | import ( 4 | "math" 5 | 6 | "github.com/pborman/uuid" 7 | ) 8 | 9 | type Superblock struct { 10 | uuid uuid.UUID 11 | gen uint64 12 | root uint64 13 | unlinked bool 14 | } 15 | 16 | func (s *Superblock) Gen() uint64 { 17 | return s.gen 18 | } 19 | 20 | func (s *Superblock) Root() uint64 { 21 | return s.root 22 | } 23 | 24 | func (s *Superblock) Uuid() uuid.UUID { 25 | return s.uuid 26 | } 27 | 28 | func (s *Superblock) Unlinked() bool { 29 | return s.unlinked 30 | } 31 | 32 | func NewSuperblock(id uuid.UUID) *Superblock { 33 | return &Superblock{ 34 | uuid: id, 35 | gen: 1, 36 | root: 0, 37 | } 38 | } 39 | 40 | func (s *Superblock) Clone() *Superblock { 41 | return &Superblock{ 42 | uuid: s.uuid, 43 | gen: s.gen, 44 | root: s.root, 45 | } 46 | } 47 | 48 | type BlockType uint64 49 | 50 | const ( 51 | Vector BlockType = 1 52 | Core BlockType = 2 53 | Bad BlockType = 255 54 | ) 55 | 56 | const FlagsMask uint8 = 3 57 | 58 | type Datablock interface { 59 | GetDatablockType() BlockType 60 | } 61 | 62 | // The leaf datablock type. The tags allow unit tests 63 | // to work out if clone / serdes are working properly 64 | // metadata is not copied when a node is cloned 65 | // implicit is not serialised 66 | type Vectorblock struct { 67 | 68 | //Metadata, not copied on clone 69 | Identifier uint64 "metadata,implicit" 70 | Generation uint64 "metadata,implicit" 71 | 72 | //Payload, copied on clone 73 | Len uint16 74 | PointWidth uint8 "implicit" 75 | StartTime int64 "implicit" 76 | Time [VSIZE]int64 77 | Value [VSIZE]float64 78 | } 79 | 80 | type Coreblock struct { 81 | 82 | //Metadata, not copied 83 | Identifier uint64 "metadata,implicit" 84 | Generation uint64 "metadata,implicit" 85 | 86 | //Payload, copied 87 | PointWidth uint8 "implicit" 88 | StartTime int64 "implicit" 89 | Addr [KFACTOR]uint64 90 | Count [KFACTOR]uint64 91 | Min [KFACTOR]float64 92 | Mean [KFACTOR]float64 93 | Max [KFACTOR]float64 94 | CGeneration [KFACTOR]uint64 95 | } 96 | 97 | func (*Vectorblock) GetDatablockType() BlockType { 98 | return Vector 99 | } 100 | 101 | func (*Coreblock) GetDatablockType() BlockType { 102 | return Core 103 | } 104 | 105 | //Copy a core block, only copying the payload, not the metadata 106 | func (src *Coreblock) CopyInto(dst *Coreblock) { 107 | dst.PointWidth = src.PointWidth 108 | dst.StartTime = src.StartTime 109 | dst.Addr = src.Addr 110 | //dst.Time = src.Time 111 | dst.Count = src.Count 112 | dst.Min = src.Min 113 | dst.Mean = src.Mean 114 | dst.Max = src.Max 115 | dst.CGeneration = src.CGeneration 116 | } 117 | 118 | func (src *Vectorblock) CopyInto(dst *Vectorblock) { 119 | dst.PointWidth = src.PointWidth 120 | dst.StartTime = src.StartTime 121 | dst.Len = src.Len 122 | dst.Time = src.Time 123 | dst.Value = src.Value 124 | } 125 | 126 | func DatablockGetBufferType(buf []byte) BlockType { 127 | switch BlockType(buf[0]) { 128 | case Vector: 129 | return Vector 130 | case Core: 131 | return Core 132 | } 133 | return Bad 134 | } 135 | 136 | // The current algorithm is as follows: 137 | // entry 0: absolute time 
and value 138 | // entry 1: delta time and value since 0 139 | // entry 2: delta since delta 1 140 | // entry 3: delta from average delta (1+2) 141 | // enrty 4+ delta from average delta (n-1, n-2, n-3) 142 | 143 | func (v *Vectorblock) Serialize(dst []byte) []byte { 144 | idx := 3 145 | dst[0] = byte(Vector) 146 | dst[1] = byte(v.Len) 147 | dst[2] = byte(v.Len >> 8) 148 | 149 | if v.Len == 0 { 150 | return dst[:idx] 151 | } 152 | //First values are written in full 153 | e, m := decompose(v.Value[0]) 154 | idx += writeUnsignedHuff(dst[idx:], m) 155 | idx += writeUnsignedHuff(dst[idx:], uint64(e)) 156 | 157 | //So we are taking a gamble here: I think I will never have negative times. If I do, 158 | //this will use 9 bytes for every time. But I won't. 159 | t := v.Time[0] 160 | idx += writeUnsignedHuff(dst[idx:], uint64(t)) 161 | if v.Len == 1 { 162 | return dst[:idx] 163 | } 164 | 165 | const delta_depth = 3 166 | hist_deltas_t := make([]int64, delta_depth) 167 | hist_deltas_e := make([]int64, delta_depth) 168 | hist_deltas_m := make([]int64, delta_depth) 169 | delta_idx := 0 170 | num_deltas := 0 171 | 172 | em1 := int64(e) 173 | mm1 := int64(m) 174 | tm1 := t 175 | for i := 1; i < int(v.Len); i++ { 176 | var deltas int 177 | if num_deltas > delta_depth { 178 | deltas = delta_depth 179 | } else { 180 | deltas = num_deltas 181 | } 182 | var e, m int64 183 | tmpe, tmpm := decompose(v.Value[i]) 184 | e = int64(tmpe) 185 | m = int64(tmpm) 186 | t := v.Time[i] 187 | 188 | //Calculate the delta for this record 189 | dt := t - tm1 190 | de := e - em1 191 | dm := m - mm1 192 | 193 | //Calculate average deltas 194 | var dt_total int64 = 0 195 | var dm_total int64 = 0 196 | var de_total int64 = 0 197 | for d := 0; d < deltas; d++ { 198 | dt_total += hist_deltas_t[d] 199 | dm_total += hist_deltas_m[d] 200 | de_total += hist_deltas_e[d] 201 | } 202 | var adt, ade, adm int64 = 0, 0, 0 203 | if deltas != 0 { 204 | adt = dt_total / int64(deltas) 205 | ade = de_total / int64(deltas) 206 | adm = dm_total / int64(deltas) 207 | } 208 | //Calculate the delta delta 209 | ddt := dt - adt 210 | dde := de - ade 211 | ddm := dm - adm 212 | 213 | //Add in the delta for this record 214 | hist_deltas_t[delta_idx] = dt 215 | hist_deltas_e[delta_idx] = de 216 | hist_deltas_m[delta_idx] = dm 217 | delta_idx++ 218 | if delta_idx == delta_depth { 219 | delta_idx = 0 220 | } 221 | num_deltas++ 222 | 223 | //Encode dde nz and ddt nz into ddm 224 | ddm <<= 2 225 | if dde != 0 { 226 | ddm |= 2 227 | } 228 | if ddt != 0 { 229 | ddm |= 1 230 | } 231 | 232 | //Write it out 233 | idx += writeSignedHuff(dst[idx:], ddm) 234 | if dde != 0 { 235 | idx += writeSignedHuff(dst[idx:], dde) 236 | } 237 | if ddt != 0 { 238 | idx += writeSignedHuff(dst[idx:], ddt) 239 | } 240 | 241 | em1 = e 242 | tm1 = t 243 | mm1 = m 244 | } 245 | return dst[:idx] 246 | } 247 | 248 | func (v *Vectorblock) Deserialize(src []byte) { 249 | blocktype := src[0] 250 | if BlockType(blocktype) != Vector { 251 | lg.Panicf("This is not a vector block") 252 | } 253 | 254 | v.Len = uint16(src[1]) + (uint16(src[2]) << 8) 255 | length := int(v.Len) 256 | idx := 3 257 | 258 | m, l, _ := readUnsignedHuff(src[idx:]) 259 | idx += l 260 | e, l, _ := readUnsignedHuff(src[idx:]) 261 | idx += l 262 | t, l, _ := readUnsignedHuff(src[idx:]) 263 | idx += l 264 | v.Time[0] = int64(t) 265 | v.Value[0] = recompose(uint16(e), uint64(m)) 266 | 267 | //Keep delta history 268 | const delta_depth = 3 269 | hist_deltas_t := make([]int64, delta_depth) 270 | hist_deltas_e := make([]int64, 
delta_depth) 271 | hist_deltas_m := make([]int64, delta_depth) 272 | delta_idx := 0 273 | num_deltas := 0 274 | 275 | mm1 := int64(m) 276 | em1 := int64(e) 277 | tm1 := int64(t) 278 | for i := 1; i < length; i++ { 279 | //How many deltas do we have 280 | var deltas int 281 | if num_deltas > delta_depth { 282 | deltas = delta_depth 283 | } else { 284 | deltas = num_deltas 285 | } 286 | 287 | //Calculate average deltas 288 | var dt_total int64 = 0 289 | var dm_total int64 = 0 290 | var de_total int64 = 0 291 | for d := 0; d < deltas; d++ { 292 | dt_total += hist_deltas_t[d] 293 | dm_total += hist_deltas_m[d] 294 | de_total += hist_deltas_e[d] 295 | } 296 | var adt, ade, adm int64 = 0, 0, 0 297 | if deltas != 0 { 298 | adt = dt_total / int64(deltas) 299 | ade = de_total / int64(deltas) 300 | adm = dm_total / int64(deltas) 301 | } 302 | //Read the dd's 303 | ddm, l, _ := readSignedHuff(src[idx:]) 304 | idx += l 305 | var dde, ddt int64 = 0, 0 306 | if ddm&2 != 0 { 307 | //log.Warning("re") 308 | dde, l, _ = readSignedHuff(src[idx:]) 309 | idx += l 310 | } 311 | if ddm&1 != 0 { 312 | //log.Warning("rt") 313 | ddt, l, _ = readSignedHuff(src[idx:]) 314 | idx += l 315 | } 316 | ddm >>= 2 317 | //Convert dd's to d's 318 | dm := ddm + adm 319 | dt := ddt + adt 320 | de := dde + ade 321 | 322 | //Save the deltas in the history 323 | hist_deltas_t[delta_idx] = dt 324 | hist_deltas_m[delta_idx] = dm 325 | hist_deltas_e[delta_idx] = de 326 | delta_idx++ 327 | if delta_idx == delta_depth { 328 | delta_idx = 0 329 | } 330 | num_deltas++ 331 | 332 | //Save values 333 | e := em1 + de 334 | m := mm1 + dm 335 | v.Time[i] = tm1 + dt 336 | v.Value[i] = recompose(uint16(e), uint64(m)) 337 | em1 += de 338 | mm1 += dm 339 | tm1 += dt 340 | } 341 | } 342 | 343 | func (c *Coreblock) Serialize(dst []byte) []byte { 344 | /* 345 | Addr delta-delta / abszero 346 | Count delta +isnz(cgen) 347 | CGeneration delta-delta 348 | Mean delta-delta (mantissa contains isnz(e)) 349 | Min delta-delta (mantissa contains isnz(e)) 350 | Max delta-delta (mantissa contains isnz(e)) 351 | 352 | TL;DR the code is the documentation MWAHAHAHA 353 | */ 354 | 355 | idx := 1 356 | dst[0] = byte(Core) 357 | 358 | const delta_depth = 3 359 | 360 | deltadeltarizer := func(maxdepth int) func(value int64) int64 { 361 | hist_delta := make([]int64, maxdepth) 362 | var depth int = 0 363 | insidx := 0 364 | var last_value int64 365 | dd := func(value int64) int64 { 366 | var total_dt int64 = 0 367 | for i := 0; i < depth; i++ { 368 | total_dt += hist_delta[i] 369 | } 370 | var avg_dt int64 = 0 371 | if depth > 0 { 372 | avg_dt = total_dt / int64(depth) 373 | } 374 | curdelta := value - last_value 375 | last_value = value 376 | ddelta := curdelta - avg_dt 377 | hist_delta[insidx] = curdelta 378 | insidx = (insidx + 1) % maxdepth 379 | depth += 1 380 | if depth > maxdepth { 381 | depth = maxdepth 382 | } 383 | return ddelta 384 | } 385 | return dd 386 | } 387 | dd_addr := deltadeltarizer(delta_depth) 388 | dd_cgen := deltadeltarizer(delta_depth) 389 | dd_count := deltadeltarizer(delta_depth) 390 | dd_mean_m := deltadeltarizer(delta_depth) 391 | dd_mean_e := deltadeltarizer(delta_depth) 392 | dd_min_m := deltadeltarizer(delta_depth) 393 | dd_min_e := deltadeltarizer(delta_depth) 394 | dd_max_m := deltadeltarizer(delta_depth) 395 | dd_max_e := deltadeltarizer(delta_depth) 396 | 397 | //Look for bottomable idx 398 | bottomidx := -1 399 | for i := KFACTOR - 1; i >= 0; i-- { 400 | if c.Addr[i] == 0 && c.CGeneration[i] == 0 { 401 | bottomidx = i 402 | } else { 
403 | break 404 | } 405 | } 406 | for i := 0; i < KFACTOR; i++ { 407 | if i == bottomidx { 408 | idx += writeFullZero(dst[idx:]) 409 | break 410 | } 411 | if c.Addr[i] == 0 { 412 | idx += writeAbsZero(dst[idx:]) 413 | idx += writeSignedHuff(dst[idx:], dd_cgen(int64(c.CGeneration[i]))) 414 | } else { 415 | idx += writeSignedHuff(dst[idx:], dd_addr(int64(c.Addr[i]))) 416 | 417 | min_e, min_m := decompose(c.Min[i]) 418 | min_m_dd := dd_min_m(int64(min_m)) 419 | min_e_dd := dd_min_e(int64(min_e)) 420 | min_m_dd <<= 1 421 | if min_e_dd != 0 { 422 | min_m_dd |= 1 423 | } 424 | 425 | mean_e, mean_m := decompose(c.Mean[i]) 426 | mean_m_dd := dd_mean_m(int64(mean_m)) 427 | mean_e_dd := dd_mean_e(int64(mean_e)) 428 | mean_m_dd <<= 1 429 | if mean_e_dd != 0 { 430 | mean_m_dd |= 1 431 | } 432 | 433 | max_e, max_m := decompose(c.Max[i]) 434 | max_m_dd := dd_max_m(int64(max_m)) 435 | max_e_dd := dd_max_e(int64(max_e)) 436 | max_m_dd <<= 1 437 | if max_e_dd != 0 { 438 | max_m_dd |= 1 439 | } 440 | 441 | cgen_dd := dd_cgen(int64(c.CGeneration[i])) 442 | 443 | cnt := dd_count(int64(c.Count[i])) 444 | cnt <<= 1 445 | if cgen_dd != 0 { 446 | cnt |= 1 447 | } 448 | idx += writeSignedHuff(dst[idx:], cnt) 449 | if cgen_dd != 0 { 450 | idx += writeSignedHuff(dst[idx:], cgen_dd) 451 | } 452 | idx += writeSignedHuff(dst[idx:], min_m_dd) 453 | if min_e_dd != 0 { 454 | idx += writeSignedHuff(dst[idx:], min_e_dd) 455 | } 456 | idx += writeSignedHuff(dst[idx:], mean_m_dd) 457 | if mean_e_dd != 0 { 458 | idx += writeSignedHuff(dst[idx:], mean_e_dd) 459 | } 460 | idx += writeSignedHuff(dst[idx:], max_m_dd) 461 | if max_e_dd != 0 { 462 | idx += writeSignedHuff(dst[idx:], max_e_dd) 463 | } 464 | } 465 | //log.Warning("Finished SER %v, idx is %v", i, idx) 466 | } 467 | return dst[:idx] 468 | } 469 | 470 | func (c *Coreblock) Deserialize(src []byte) { 471 | //check 0 for id 472 | if src[0] != byte(Core) { 473 | lg.Panic("This is not a core block") 474 | } 475 | idx := 1 476 | dedeltadeltarizer := func(maxdepth int) func(dd int64) int64 { 477 | hist_delta := make([]int64, maxdepth) 478 | depth := 0 479 | insidx := 0 480 | var last_value int64 = 0 481 | decode := func(dd int64) int64 { 482 | var total_dt int64 = 0 483 | for i := 0; i < depth; i++ { 484 | total_dt += hist_delta[i] 485 | } 486 | var avg_dt int64 = 0 487 | if depth > 0 { 488 | avg_dt = total_dt / int64(depth) 489 | } 490 | curdelta := avg_dt + dd 491 | curvalue := last_value + curdelta 492 | last_value = curvalue 493 | hist_delta[insidx] = curdelta 494 | insidx = (insidx + 1) % maxdepth 495 | depth += 1 496 | if depth > maxdepth { 497 | depth = maxdepth 498 | } 499 | return last_value 500 | } 501 | return decode 502 | } 503 | 504 | const delta_depth = 3 505 | dd_addr := dedeltadeltarizer(delta_depth) 506 | dd_cgen := dedeltadeltarizer(delta_depth) 507 | dd_count := dedeltadeltarizer(delta_depth) 508 | dd_mean_m := dedeltadeltarizer(delta_depth) 509 | dd_mean_e := dedeltadeltarizer(delta_depth) 510 | dd_min_m := dedeltadeltarizer(delta_depth) 511 | dd_min_e := dedeltadeltarizer(delta_depth) 512 | dd_max_m := dedeltadeltarizer(delta_depth) 513 | dd_max_e := dedeltadeltarizer(delta_depth) 514 | 515 | i := 0 516 | for ; i < KFACTOR; i++ { 517 | 518 | //Get addr 519 | addr_dd, used, bottom := readSignedHuff(src[idx:]) 520 | idx += used 521 | if bottom == ABSZERO { 522 | c.Addr[i] = 0 523 | c.Count[i] = 0 524 | //min/mean/max are undefined 525 | //Still have to decode cgen 526 | cgen_dd, used, _ := readSignedHuff(src[idx:]) 527 | idx += used 528 | cgen := 
uint64(dd_cgen(cgen_dd)) 529 | c.CGeneration[i] = cgen 530 | } else if bottom == FULLZERO { 531 | break 532 | } else { 533 | //Real value 534 | c.Addr[i] = uint64(dd_addr(addr_dd)) 535 | 536 | cnt_dd, used, _ := readSignedHuff(src[idx:]) 537 | idx += used 538 | 539 | var cgen_dd int64 = 0 540 | if cnt_dd&1 != 0 { 541 | cgen_dd, used, _ = readSignedHuff(src[idx:]) 542 | idx += used 543 | } 544 | cnt_dd >>= 1 545 | c.CGeneration[i] = uint64(dd_cgen(cgen_dd)) 546 | c.Count[i] = uint64(dd_count(cnt_dd)) 547 | 548 | min_m_dd, used, _ := readSignedHuff(src[idx:]) 549 | idx += used 550 | var min_e_dd int64 551 | if min_m_dd&1 != 0 { 552 | min_e_dd, used, _ = readSignedHuff(src[idx:]) 553 | idx += used 554 | } else { 555 | min_e_dd = 0 556 | } 557 | min_m_dd >>= 1 558 | c.Min[i] = recompose(uint16(dd_min_e(min_e_dd)), uint64(dd_min_m(min_m_dd))) 559 | 560 | mean_m_dd, used, _ := readSignedHuff(src[idx:]) 561 | idx += used 562 | var mean_e_dd int64 563 | if mean_m_dd&1 != 0 { 564 | mean_e_dd, used, _ = readSignedHuff(src[idx:]) 565 | idx += used 566 | } else { 567 | mean_e_dd = 0 568 | } 569 | mean_m_dd >>= 1 570 | c.Mean[i] = recompose(uint16(dd_mean_e(mean_e_dd)), uint64(dd_mean_m(mean_m_dd))) 571 | 572 | max_m_dd, used, _ := readSignedHuff(src[idx:]) 573 | idx += used 574 | var max_e_dd int64 575 | if max_m_dd&1 != 0 { 576 | max_e_dd, used, _ = readSignedHuff(src[idx:]) 577 | idx += used 578 | } else { 579 | max_e_dd = 0 580 | } 581 | max_m_dd >>= 1 582 | c.Max[i] = recompose(uint16(dd_max_e(max_e_dd)), uint64(dd_max_m(max_m_dd))) 583 | } 584 | //log.Warning("Finishing deser idx %v, idx is %v",i, idx) 585 | } 586 | 587 | //Clear out from a FULLZERO 588 | for ; i < KFACTOR; i++ { 589 | c.Addr[i] = 0 590 | c.Count[i] = 0 591 | c.CGeneration[i] = 0 592 | 593 | } 594 | } 595 | 596 | //These functions allow us to read/write the packed numbers in the datablocks 597 | //These are huffman encoded in big endian 598 | // 0xxx xxxx 7 0x00 599 | // 10xx xxxx +1 14 0x80 600 | // 1100 xxxx +2 20 0xC0 601 | // 1101 xxxx +3 28 0xD0 602 | // 1110 xxxx +4 36 0xE0 603 | // 1111 00xx +5 42 0xF0 604 | // 1111 01xx +6 50 0xF4 605 | // 1111 10xx +7 58 0xF8 606 | // 1111 1100 +8 64 0xFC 607 | // 1111 1101 +0 ABSZERO (special symbol) 0xFD 608 | // 1111 1110 +0 FULLZERO (special symbol) 0xFE 609 | const VALUE = 0 610 | const ABSZERO = 1 611 | const FULLZERO = 2 612 | 613 | func writeUnsignedHuff(dst []byte, val uint64) int { 614 | //log.Warning("wuh called dstlen %v",len(dst)) 615 | i := 0 616 | var do_rest func(n uint8) 617 | do_rest = func(n uint8) { 618 | if n == 0 { 619 | return 620 | } 621 | dst[i] = byte((val >> ((n - 1) * 8)) & 0xFF) 622 | i++ 623 | do_rest(n - 1) 624 | } 625 | if val < (1 << 7) { 626 | dst[i] = byte(val) 627 | i++ 628 | } else if val < (1 << 14) { 629 | dst[i] = byte(0x80 | val>>8) 630 | i++ 631 | do_rest(1) 632 | } else if val < (1 << 20) { 633 | dst[i] = byte(0xC0 | val>>16) 634 | i++ 635 | do_rest(2) 636 | } else if val < (1 << 28) { 637 | dst[i] = byte(0xD0 | val>>24) 638 | i++ 639 | do_rest(3) 640 | } else if val < (1 << 36) { 641 | dst[i] = byte(0xE0 | val>>32) 642 | i++ 643 | do_rest(4) 644 | } else if val < (1 << 42) { 645 | dst[i] = byte(0xF0 | val>>40) 646 | i++ 647 | do_rest(5) 648 | } else if val < (1 << 50) { 649 | dst[i] = byte(0xF4 | val>>48) 650 | i++ 651 | do_rest(6) 652 | } else if val < (1 << 58) { 653 | dst[i] = byte(0xF8 | val>>56) 654 | i++ 655 | do_rest(7) 656 | } else { 657 | dst[i] = 0xFC 658 | i++ 659 | do_rest(8) 660 | } 661 | return i 662 | } 663 | func 
writeAbsZero(dst []byte) int { 664 | dst[0] = 0xFD 665 | return 1 666 | } 667 | func writeFullZero(dst []byte) int { 668 | dst[0] = 0xFE 669 | return 1 670 | } 671 | func writeSignedHuff(dst []byte, val int64) int { 672 | if val < 0 { 673 | return writeUnsignedHuff(dst, (uint64(-val)<<1 | 1)) 674 | } else { 675 | return writeUnsignedHuff(dst, uint64(val)<<1) 676 | } 677 | } 678 | func readUnsignedHuff(src []byte) (uint64, int, int) { 679 | var rv uint64 680 | i := 1 681 | var do_rest func(n uint8) 682 | do_rest = func(n uint8) { 683 | if n == 0 { 684 | return 685 | } 686 | rv <<= 8 687 | rv |= uint64(src[i]) 688 | i++ 689 | do_rest(n - 1) 690 | } 691 | if src[0] > 0xFE { 692 | lg.Panicf("This huffman symbol is reserved: +v", src[0]) 693 | } else if src[0] == 0xFD { 694 | return 0, 1, ABSZERO 695 | } else if src[0] == 0xFE { 696 | return 0, 1, FULLZERO 697 | } else if src[0] == 0xFC { 698 | do_rest(8) 699 | } else if src[0] >= 0xF8 { 700 | rv = uint64(src[0] & 0x03) 701 | do_rest(7) 702 | } else if src[0] >= 0xF4 { 703 | rv = uint64(src[0] & 0x03) 704 | do_rest(6) 705 | } else if src[0] >= 0xF0 { 706 | rv = uint64(src[0] & 0x03) 707 | do_rest(5) 708 | } else if src[0] >= 0xE0 { 709 | rv = uint64(src[0] & 0x0F) 710 | do_rest(4) 711 | } else if src[0] >= 0xD0 { 712 | rv = uint64(src[0] & 0x0F) 713 | do_rest(3) 714 | } else if src[0] >= 0xC0 { 715 | rv = uint64(src[0] & 0x0F) 716 | do_rest(2) 717 | } else if src[0] >= 0x80 { 718 | rv = uint64(src[0] & 0x3F) 719 | do_rest(1) 720 | } else { 721 | rv = uint64(src[0] & 0x7F) 722 | } 723 | return rv, i, VALUE 724 | } 725 | func readSignedHuff(src []byte) (int64, int, int) { 726 | v, l, bv := readUnsignedHuff(src) 727 | if bv != VALUE { 728 | return 0, 1, bv 729 | } 730 | s := v & 1 731 | v >>= 1 732 | if s == 1 { 733 | return -int64(v), l, VALUE 734 | } 735 | return int64(v), l, VALUE 736 | } 737 | 738 | //This composes a float into a weird representation that was empirically determined to be 739 | //ideal for compression of Quasar streams. 740 | //First we split out the sign, exponent and mantissa from the float 741 | //Then we reverse the bytes in the mantissa (bits are better but slower) 742 | //Then we left shift it and stick the sign bit as the LSB 743 | //The result is the (unsigned) exponent and the mantissa-sortof-thingy 744 | func decompose(val float64) (e uint16, m uint64) { 745 | iv := math.Float64bits(val) 746 | s := iv >> 63 747 | exp := (iv >> 52) & 2047 748 | iv = iv & ((1 << 52) - 1) 749 | //Take the bottom 7 bytes and reverse them. Top byte is left zero 750 | // . . . . . . 751 | m = ((iv&0x00000000000000FF)<<(6*8) | 752 | (iv&0x000000000000FF00)<<(4*8) | 753 | (iv&0x0000000000FF0000)<<(2*8) | 754 | (iv & 0x00000000FF000000) | 755 | (iv&0x000000FF00000000)>>(2*8) | 756 | (iv&0x0000FF0000000000)>>(4*8) | 757 | (iv&0x00FF000000000000)>>(6*8)) 758 | e = (uint16(exp) << 1) | uint16(s) 759 | return 760 | } 761 | 762 | func recompose(e uint16, m uint64) float64 { 763 | s := e & 1 764 | e >>= 1 765 | iv := ((m&0x00000000000000FF)<<(6*8) | 766 | (m&0x000000000000FF00)<<(4*8) | 767 | (m&0x0000000000FF0000)<<(2*8) | 768 | (m & 0x00000000FF000000) | 769 | (m&0x000000FF00000000)>>(2*8) | 770 | (m&0x0000FF0000000000)>>(4*8) | 771 | (m&0x00FF000000000000)>>(6*8)) 772 | iv |= uint64(e) << 52 773 | iv |= uint64(s) << 63 774 | return math.Float64frombits(iv) 775 | } 776 | --------------------------------------------------------------------------------
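The variable-length integer encoding (writeSignedHuff/readSignedHuff) and the float split (decompose/recompose) defined at the end of blocktypes.go are both intended to be lossless round trips. The sketch below is illustrative only: it assumes it sits next to blocktypes.go in package bstore so it can reach the unexported helpers, and the test name and sample values are hypothetical, not part of the repository.

package bstore

import "testing"

// Sketch only: round-trip checks for the helpers defined in blocktypes.go.
func TestCodecRoundTripSketch(t *testing.T) {
	// decompose/recompose split a float64 into (sign|exponent, byte-reversed
	// mantissa) and back; the byte reversal is its own inverse, so the
	// transform is lossless.
	e, m := decompose(3.14159)
	if recompose(e, m) != 3.14159 {
		t.Fatal("decompose/recompose should be lossless")
	}

	// writeSignedHuff stores the sign in the LSB and emits 1-9 bytes,
	// so a 9-byte buffer covers the worst case.
	buf := make([]byte, 9)
	n := writeSignedHuff(buf, -12345)
	v, used, kind := readSignedHuff(buf)
	if kind != VALUE || used != n || v != -12345 {
		t.Fatalf("varint round trip failed: v=%d used=%d kind=%d", v, used, kind)
	}
}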
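The comment above Vectorblock.Serialize ("entry 0: absolute time and value ... entry 4+: delta from average delta") and the deltadeltarizer closure in Coreblock.Serialize use the same idea: store each value as its difference from the average of the last few deltas. A minimal standalone sketch of that idea follows; the function name and the sliding-window bookkeeping are illustrative and simplified relative to the ring buffer the real code uses.

// Sketch only: encode a series as "delta from the average of up to the
// last 3 deltas". The first entry is stored absolutely, like entry 0 in
// Vectorblock.Serialize.
func deltaFromAvgDeltaEncode(vals []int64) []int64 {
	const depth = 3
	hist := make([]int64, 0, depth) // most recent deltas
	out := make([]int64, 0, len(vals))
	var last int64
	for i, v := range vals {
		if i == 0 {
			out = append(out, v)
			last = v
			continue
		}
		var avg int64
		if len(hist) > 0 {
			var sum int64
			for _, d := range hist {
				sum += d
			}
			avg = sum / int64(len(hist))
		}
		d := v - last
		out = append(out, d-avg) // small residual when the series changes smoothly
		if len(hist) == depth {
			hist = hist[1:]
		}
		hist = append(hist, d)
		last = v
	}
	return out
}

Decoding runs the same window in reverse: average the recovered deltas, add the stored residual to get the current delta, then add that to the previous value. That is what the dedeltadeltarizer closure in Coreblock.Deserialize and the main loop of Vectorblock.Deserialize do.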