├── .gitignore ├── Gopkg.lock ├── Gopkg.toml ├── Makefile ├── README.md ├── protos ├── .gen │ ├── rpc.pb.go │ └── rpc.pb.gw.go ├── google │ └── api │ │ ├── annotations.proto │ │ └── http.proto └── rpc.proto ├── resources ├── fdb_index.png ├── fdb_storage.png ├── fdb_time.png ├── m3db_storage.png └── m3db_time.png └── src ├── cmd ├── bench │ └── main.go └── server │ └── main.go ├── encoding ├── common.go ├── decoder.go ├── encoder.go ├── merge.go ├── merge_test.go ├── multi_decoder.go ├── multi_decoder_test.go ├── ostream.go └── round_trip_test.go └── layer ├── dircompress ├── layer.go └── layer_test.go ├── raw └── layer.go ├── rawblock ├── buffer.go ├── buffer_test.go ├── commitlog.go ├── commitlog_test.go ├── common_test.go └── layer.go ├── server └── server.go └── types.go /.gitignore: -------------------------------------------------------------------------------- 1 | /vendor 2 | /main 3 | blog.md 4 | -------------------------------------------------------------------------------- /Gopkg.lock: -------------------------------------------------------------------------------- 1 | # This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. 2 | 3 | 4 | [[projects]] 5 | digest = "1:b39cf81d5f440b9c0757a25058432d33af867e5201109bf53621356d9dab4b73" 6 | name = "github.com/apache/thrift" 7 | packages = ["lib/go/thrift"] 8 | pruneopts = "UT" 9 | revision = "384647d290e2e4a55a14b1b7ef1b7e66293a2c33" 10 | version = "v0.12.0" 11 | 12 | [[projects]] 13 | digest = "1:010ac780f94633dc91ef721a395c5768ace4456b4833dea5ad7c026802ff143e" 14 | name = "github.com/apple/foundationdb" 15 | packages = [ 16 | "bindings/go/src/fdb", 17 | "bindings/go/src/fdb/tuple", 18 | ] 19 | pruneopts = "UT" 20 | revision = "a6c09645bf9ec2f2c5a00839bcf17a37cec87fdb" 21 | version = "6.1.9" 22 | 23 | [[projects]] 24 | digest = "1:d6afaeed1502aa28e80a4ed0981d570ad91b2579193404256ce672ed0a609e0d" 25 | name = "github.com/beorn7/perks" 26 | packages = ["quantile"] 27 | pruneopts = "UT" 28 | revision = "4b2b341e8d7715fae06375aa633dbb6e91b3fb46" 29 | version = "v1.0.0" 30 | 31 | [[projects]] 32 | digest = "1:998cf998358a303ac2430c386ba3fd3398477d6013153d3c6e11432765cc9ae6" 33 | name = "github.com/cespare/xxhash" 34 | packages = ["."] 35 | pruneopts = "UT" 36 | revision = "3b82fb7d186719faeedd0c2864f868c74fbf79a1" 37 | version = "v2.0.0" 38 | 39 | [[projects]] 40 | digest = "1:ffe9824d294da03b391f44e1ae8281281b4afc1bdaa9588c9097785e3af10cec" 41 | name = "github.com/davecgh/go-spew" 42 | packages = ["spew"] 43 | pruneopts = "UT" 44 | revision = "8991bc29aa16c548c550c7ff78260e27b9ab7c73" 45 | version = "v1.1.1" 46 | 47 | [[projects]] 48 | digest = "1:48092bf6632f55839850666c33469f546f6d45fdbd59a66759ec12e84d853dc2" 49 | name = "github.com/gogo/protobuf" 50 | packages = ["proto"] 51 | pruneopts = "UT" 52 | revision = "ba06b47c162d49f2af050fb4c75bcbc86a159d5c" 53 | version = "v1.2.1" 54 | 55 | [[projects]] 56 | digest = "1:be408f349cae090a7c17a279633d6e62b00068e64af66a582cae0983de8890ea" 57 | name = "github.com/golang/mock" 58 | packages = ["gomock"] 59 | pruneopts = "UT" 60 | revision = "9fa652df1129bef0e734c9cf9bf6dbae9ef3b9fa" 61 | version = "1.3.1" 62 | 63 | [[projects]] 64 | digest = "1:7033673f364203b103c09f683c83ec2881d3cd22740ccf21e73dd90145a4a3ec" 65 | name = "github.com/golang/protobuf" 66 | packages = [ 67 | "jsonpb", 68 | "proto", 69 | "protoc-gen-go/descriptor", 70 | "protoc-gen-go/plugin", 71 | "ptypes/any", 72 | "ptypes/duration", 73 | "ptypes/empty", 74 | "ptypes/struct", 75 | 
"ptypes/timestamp", 76 | "ptypes/wrappers", 77 | ] 78 | pruneopts = "UT" 79 | revision = "b5d812f8a3706043e23a9cd5babf2e5423744d30" 80 | version = "v1.3.1" 81 | 82 | [[projects]] 83 | digest = "1:73bea52c5e1bbd7d80166e9255183b6fd41cbae463f98a2bd32b7f58f7438975" 84 | name = "github.com/jhump/protoreflect" 85 | packages = [ 86 | "desc", 87 | "desc/internal", 88 | "desc/protoparse", 89 | "dynamic", 90 | "internal", 91 | ] 92 | pruneopts = "UT" 93 | revision = "92269e4a44a442365a8824f1e184b8ddbca3ec7a" 94 | version = "v1.4.1" 95 | 96 | [[projects]] 97 | digest = "1:f7b7cc4601639f90815fb2fe02b2775278a83b468d7dbbd800d861405de40d74" 98 | name = "github.com/m3db/m3" 99 | packages = [ 100 | "src/cluster/client", 101 | "src/cluster/generated/proto/metadatapb", 102 | "src/cluster/generated/proto/placementpb", 103 | "src/cluster/kv", 104 | "src/cluster/kv/util/runtime", 105 | "src/cluster/placement", 106 | "src/cluster/placement/algo", 107 | "src/cluster/placement/selector", 108 | "src/cluster/placement/service", 109 | "src/cluster/placement/storage", 110 | "src/cluster/services", 111 | "src/cluster/services/leader/campaign", 112 | "src/cluster/shard", 113 | "src/dbnode/encoding", 114 | "src/dbnode/encoding/m3tsz", 115 | "src/dbnode/generated/proto/namespace", 116 | "src/dbnode/namespace", 117 | "src/dbnode/retention", 118 | "src/dbnode/ts", 119 | "src/dbnode/x/xio", 120 | "src/dbnode/x/xpool", 121 | "src/metrics/metric/id", 122 | "src/x/checked", 123 | "src/x/clock", 124 | "src/x/close", 125 | "src/x/context", 126 | "src/x/errors", 127 | "src/x/ident", 128 | "src/x/instrument", 129 | "src/x/opentracing", 130 | "src/x/pool", 131 | "src/x/process", 132 | "src/x/resource", 133 | "src/x/serialize", 134 | "src/x/time", 135 | "src/x/watch", 136 | ] 137 | pruneopts = "UT" 138 | revision = "c9820911fadc15f64c3af2ee8a07a8cb429d5374" 139 | version = "v0.9.6" 140 | 141 | [[projects]] 142 | digest = "1:b8c7482d3a298ea2a4fc2cec50d04770180f2e5799170578c40500ae01f103e0" 143 | name = "github.com/m3db/prometheus_client_golang" 144 | packages = [ 145 | "prometheus", 146 | "prometheus/promhttp", 147 | ] 148 | pruneopts = "UT" 149 | revision = "8ae269d24972b8695572fa6b2e3718b5ea82d6b4" 150 | version = "v0.8.1" 151 | 152 | [[projects]] 153 | digest = "1:9cf4ac6c9a81579807b1cd6a394f22795aacaf25bd2ad4b8b7dd3f829aa206c9" 154 | name = "github.com/m3db/prometheus_client_model" 155 | packages = ["go"] 156 | pruneopts = "UT" 157 | revision = "d3fff8420252ef63bffb96f689d1a85096c97321" 158 | version = "v0.1.0" 159 | 160 | [[projects]] 161 | digest = "1:a4021f830c7bb25416a7acd0521e80aa7b05586ac9ef03c4ca58c016146ed12f" 162 | name = "github.com/m3db/prometheus_common" 163 | packages = [ 164 | "expfmt", 165 | "internal/bitbucket.org/ww/goautoneg", 166 | "model", 167 | ] 168 | pruneopts = "UT" 169 | revision = "d550673fc477123acb69017380567e8fafc765fc" 170 | version = "v0.1.0" 171 | 172 | [[projects]] 173 | digest = "1:195a65e93248ff74ff5959b2c9d8a19f47b6506284a7c826692b1352e9ad9f92" 174 | name = "github.com/m3db/prometheus_procfs" 175 | packages = ["."] 176 | pruneopts = "UT" 177 | revision = "1878d9fbb537119d24b21ca07effd591627cd160" 178 | version = "v0.8.1" 179 | 180 | [[projects]] 181 | digest = "1:ff5ebae34cfbf047d505ee150de27e60570e8c394b3b8fdbb720ff6ac71985fc" 182 | name = "github.com/matttproud/golang_protobuf_extensions" 183 | packages = ["pbutil"] 184 | pruneopts = "UT" 185 | revision = "c12348ce28de40eed0136aa2b644d0ee0650e56c" 186 | version = "v1.0.1" 187 | 188 | [[projects]] 189 | digest = 
"1:11e62d6050198055e6cd87ed57e5d8c669e84f839c16e16f192374d913d1a70d" 190 | name = "github.com/opentracing/opentracing-go" 191 | packages = [ 192 | ".", 193 | "ext", 194 | "log", 195 | ] 196 | pruneopts = "UT" 197 | revision = "659c90643e714681897ec2521c60567dd21da733" 198 | version = "v1.1.0" 199 | 200 | [[projects]] 201 | digest = "1:cf31692c14422fa27c83a05292eb5cbe0fb2775972e8f1f8446a71549bd8980b" 202 | name = "github.com/pkg/errors" 203 | packages = ["."] 204 | pruneopts = "UT" 205 | revision = "ba968bfe8b2f7e042a574c888954fccecfa385b4" 206 | version = "v0.8.1" 207 | 208 | [[projects]] 209 | digest = "1:0028cb19b2e4c3112225cd871870f2d9cf49b9b4276531f03438a88e94be86fe" 210 | name = "github.com/pmezard/go-difflib" 211 | packages = ["difflib"] 212 | pruneopts = "UT" 213 | revision = "792786c7400a136282c1664665ae0a8db921c6c2" 214 | version = "v1.0.0" 215 | 216 | [[projects]] 217 | digest = "1:5da8ce674952566deae4dbc23d07c85caafc6cfa815b0b3e03e41979cedb8750" 218 | name = "github.com/stretchr/testify" 219 | packages = [ 220 | "assert", 221 | "require", 222 | ] 223 | pruneopts = "UT" 224 | revision = "ffdc059bfe9ce6a4e144ba849dbedead332c6053" 225 | version = "v1.3.0" 226 | 227 | [[projects]] 228 | digest = "1:5604990ce6c053672bf1c4666c867c65e53ec9bbe51327e7471f73974258bcf0" 229 | name = "github.com/uber-go/tally" 230 | packages = [ 231 | ".", 232 | "m3", 233 | "m3/customtransports", 234 | "m3/thrift", 235 | "m3/thriftudp", 236 | "multi", 237 | "prometheus", 238 | ] 239 | pruneopts = "UT" 240 | revision = "24c699f78afd17db5aac42f83c1c5cad70254294" 241 | version = "v3.3.10" 242 | 243 | [[projects]] 244 | digest = "1:57e707ba5fcbab4913a1c81e640ebb9f05f6327dcf88ab3b0e16dba3b8bb31fb" 245 | name = "github.com/uber/jaeger-client-go" 246 | packages = [ 247 | ".", 248 | "config", 249 | "internal/baggage", 250 | "internal/baggage/remote", 251 | "internal/spanlog", 252 | "internal/throttler", 253 | "internal/throttler/remote", 254 | "log", 255 | "log/zap", 256 | "rpcmetrics", 257 | "thrift", 258 | "thrift-gen/agent", 259 | "thrift-gen/baggage", 260 | "thrift-gen/jaeger", 261 | "thrift-gen/sampling", 262 | "thrift-gen/zipkincore", 263 | "transport", 264 | "utils", 265 | ] 266 | pruneopts = "UT" 267 | revision = "2f47546e3facd43297739439600bcf43f44cce5d" 268 | version = "v2.16.0" 269 | 270 | [[projects]] 271 | digest = "1:034f3a72349013b835bc829136f88204a2c0115df4b8d4b94b6ed4f0e1f4a9db" 272 | name = "github.com/uber/jaeger-lib" 273 | packages = [ 274 | "metrics", 275 | "metrics/tally", 276 | ] 277 | pruneopts = "UT" 278 | revision = "0e30338a695636fe5bcf7301e8030ce8dd2a8530" 279 | version = "v2.0.0" 280 | 281 | [[projects]] 282 | digest = "1:a5158647b553c61877aa9ae74f4015000294e47981e6b8b07525edcbb0747c81" 283 | name = "go.uber.org/atomic" 284 | packages = ["."] 285 | pruneopts = "UT" 286 | revision = "df976f2515e274675050de7b3f42545de80594fd" 287 | version = "v1.4.0" 288 | 289 | [[projects]] 290 | digest = "1:60bf2a5e347af463c42ed31a493d817f8a72f102543060ed992754e689805d1a" 291 | name = "go.uber.org/multierr" 292 | packages = ["."] 293 | pruneopts = "UT" 294 | revision = "3c4937480c32f4c13a875a1829af76c98ca3d40a" 295 | version = "v1.1.0" 296 | 297 | [[projects]] 298 | digest = "1:676160e6a4722b08e0e26b11521d575c2cb2b6f0c679e1ee6178c5d8dee51e5e" 299 | name = "go.uber.org/zap" 300 | packages = [ 301 | ".", 302 | "buffer", 303 | "internal/bufferpool", 304 | "internal/color", 305 | "internal/exit", 306 | "zapcore", 307 | ] 308 | pruneopts = "UT" 309 | revision = "27376062155ad36be76b0f12cf1572a221d3a48c" 310 
| version = "v1.10.0" 311 | 312 | [[projects]] 313 | branch = "master" 314 | digest = "1:9f915ece988ec60eb54677e0dcc77fd53a7f42a496d984c351416ffcfd16b8f7" 315 | name = "google.golang.org/genproto" 316 | packages = [ 317 | "protobuf/api", 318 | "protobuf/field_mask", 319 | "protobuf/ptype", 320 | "protobuf/source_context", 321 | ] 322 | pruneopts = "UT" 323 | revision = "eb0b1bdb6ae60fcfc41b8d907b50dfb346112301" 324 | 325 | [solve-meta] 326 | analyzer-name = "dep" 327 | analyzer-version = 1 328 | input-imports = [ 329 | "github.com/apple/foundationdb/bindings/go/src/fdb", 330 | "github.com/apple/foundationdb/bindings/go/src/fdb/tuple", 331 | "github.com/m3db/m3/src/dbnode/encoding", 332 | "github.com/m3db/m3/src/dbnode/encoding/m3tsz", 333 | "github.com/m3db/m3/src/x/time", 334 | "github.com/stretchr/testify/require", 335 | ] 336 | solver-name = "gps-cdcl" 337 | solver-version = 1 338 | -------------------------------------------------------------------------------- /Gopkg.toml: -------------------------------------------------------------------------------- 1 | # Gopkg.toml example 2 | # 3 | # Refer to https://golang.github.io/dep/docs/Gopkg.toml.html 4 | # for detailed Gopkg.toml documentation. 5 | # 6 | # required = ["github.com/user/thing/cmd/thing"] 7 | # ignored = ["github.com/user/project/pkgX", "bitbucket.org/user/project/pkgA/pkgY"] 8 | # 9 | # [[constraint]] 10 | # name = "github.com/user/project" 11 | # version = "1.0.0" 12 | # 13 | # [[constraint]] 14 | # name = "github.com/user/project2" 15 | # branch = "dev" 16 | # source = "github.com/myfork/project2" 17 | # 18 | # [[override]] 19 | # name = "github.com/x/y" 20 | # version = "2.4.0" 21 | # 22 | # [prune] 23 | # non-go = false 24 | # go-tests = true 25 | # unused-packages = true 26 | 27 | 28 | [[constraint]] 29 | name = "github.com/apple/foundationdb" 30 | version = "6.1.8" 31 | 32 | [[constraint]] 33 | name = "github.com/m3db/m3" 34 | version = "0.9.6" 35 | 36 | [[constraint]] 37 | name = "github.com/stretchr/testify" 38 | version = "1.3.0" 39 | 40 | [[constraint]] 41 | name = "google.golang.org/grpc" 42 | 43 | [prune] 44 | go-tests = true 45 | unused-packages = true 46 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | gen-proto: install-go-proto-plugin 2 | protoc --proto_path=./protos --go_out=plugins=grpc:./protos/.gen/ ./protos/rpc.proto 3 | protoc --proto_path=./protos --grpc-gateway_out=logtostderr=true:./protos/.gen/ ./protos/rpc.proto 4 | 5 | install-go-proto-plugin: 6 | go get -u github.com/golang/protobuf/protoc-gen-go 7 | go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway 8 | go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger 9 | 10 | bench: 11 | go run ./src/cmd/bench/main.go -numSeries 10000 -batchSize 1 -numWorkers 1 -duration 30s -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Time Series and FoundationDB: Millions of writes/s and 10x compression in under 2,000 lines of Go 2 | 3 | ## Disclaimer 4 | 5 | I want to preface everything you’re about to read with the disclaimer that I built DiamondDB purely as a PoC to measure performance of different architectures for storing time series data in FoundationDB. It is in no way production ready. In fact, the code is littered with TODOs, cut corners, and missing features. 
The only thing DiamondDB is useful for in its current form is demonstrating how a performant time series database **could** be built on top of FDB and reminding me that I should go outside more often. If you want a distributed database with the functionality described in this blog post, you should just use [M3DB](https://github.com/m3db/m3) itself. 6 | 7 | ## Target Audience 8 | 9 | This blog post is targeted at engineers who either work on large-scale distributed systems or are curious about them. 10 | 11 | Throughout this post we’ll look at the problem of storing high volume time series data to illustrate how FoundationDB's excellent performance characteristics and strong consistency guarantees can be utilized to build reliable and performant distributed systems. 12 | 13 | In this case we're going to build a distributed time series database (modeled after [M3DB](https://github.com/m3db/m3)) that can handle millions of writes/s (with best in class compression!) on my 2018 MacBook Pro in less than 2,000 lines of Go code. 14 | 15 | ## High Volume Time Series Data 16 | 17 | At $DAYJOB I spend most of my time developing an open-source distributed time series database called [M3DB](https://github.com/m3db/m3). So naturally my first instinct was to see if I could replicate an M3DB-like system using FDB. 18 | 19 | Time series means different things to different people. In this case, I want to focus on the type of time series storage engine that could efficiently power an [OLTP](https://en.wikipedia.org/wiki/Online_transaction_processing) system (strong consistency and immediately read your writes) or a monitoring / observability workload as opposed to a time series database designed for [OLAP](https://en.wikipedia.org/wiki/Online_analytical_processing) workloads. 20 | 21 | Primarily, our system should support the following two APIs: 22 | 23 | ```golang 24 | type Value struct { 25 | Timestamp int64 26 | Value float64 27 | } 28 | 29 | Write(seriesID string, value Value) 30 | 31 | Read(seriesID string) ([]Value) 32 | ``` 33 | 34 | Note that M3DB has support for several other important features, such as custom types and inverted indexing, but let's put that aside for a moment. 35 | 36 | At this point you may be wondering: why do we even need a fancy distributed system in the first place? Can’t we easily solve this problem using PostgreSQL with a simple table schema? An example being: 37 | 38 | ```sql 39 | CREATE TABLE timeseries ( 40 | series_id TEXT, 41 | timestamp integer, 42 | value double precision, 43 | PRIMARY KEY(series_id, timestamp) 44 | ); 45 | ``` 46 | 47 | This implementation would work for some small use-cases, but M3DB has three properties that the Postgres implementation does not: 48 | 49 | 1. Horizontal scalability (as additional machines are added the throughput of the system should increase in a roughly linear fashion) 50 | 2. High write throughput (millions of writes/s) 51 | 3. Efficient compression 52 | 53 | It’s possible for Postgres to partially address the compression requirement in a variety of ways. [This gist](https://gist.github.com/richardartoul/23b66ea6924f28fc6ec8dfcd06901302) is an example that demonstrates how a stored procedure can be used to perform time series compression, but the compression will never be as good as a custom-designed algorithm, such as [Gorilla](https://www.vldb.org/pvldb/vol8/p1816-teller.pdf). In addition, the Postgres implementation will never achieve horizontal scalability or high write throughput without application layer sharding.
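To make the comparison concrete, here's roughly what the `Write` and `Read` APIs could look like on top of that naive schema using Go's standard `database/sql` package. This is a hypothetical sketch rather than code from this repository; the `lib/pq` driver import is an assumption, and the `timeseries` table is the one from the schema above:

```golang
package main

import (
	"database/sql"

	_ "github.com/lib/pq" // assumed Postgres driver
)

type Value struct {
	Timestamp int64
	Value     float64
}

type pgStore struct {
	db *sql.DB
}

// Write inserts a single datapoint as its own row. Every datapoint costs a
// full row plus a primary-key index entry, which is why compression and
// write throughput suffer at scale.
func (s *pgStore) Write(seriesID string, value Value) error {
	_, err := s.db.Exec(
		`INSERT INTO timeseries (series_id, timestamp, value) VALUES ($1, $2, $3)`,
		seriesID, value.Timestamp, value.Value,
	)
	return err
}

// Read returns all datapoints for a series in timestamp order.
func (s *pgStore) Read(seriesID string) ([]Value, error) {
	rows, err := s.db.Query(
		`SELECT timestamp, value FROM timeseries WHERE series_id = $1 ORDER BY timestamp`,
		seriesID,
	)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var values []Value
	for rows.Next() {
		var v Value
		if err := rows.Scan(&v.Timestamp, &v.Value); err != nil {
			return nil, err
		}
		values = append(values, v)
	}
	return values, rows.Err()
}
```

This is perfectly serviceable for small workloads; the rest of the post is about what happens when the three properties listed above actually matter.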
54 | 55 | If you're wondering why you would ever need to store so much data that you wouldn't be able to fit it all in a single large Postgres instance, consider the monitoring / observability use-case. Imagine you have a fleet of `50,000` servers and you want to monitor `100` different metrics (free disk space, CPU utilization, etc.) about each one at `10s` intervals. This would generate `5,000,000` unique time series and `500,000` data points per second, and you're still not even tracking any application-level metrics! 56 | 57 | Implementing the `Write` and `Read` interfaces, while also achieving the three properties listed above, is the crux of what M3DB and other distributed time series databases in this space seek to accomplish. 58 | 59 | ## A Software Foundation for Distributed Systems 60 | 61 | I first started paying attention to FoundationDB (FDB) when I listened to a [podcast](https://www.dataengineeringpodcast.com/foundationdb-distributed-systems-episode-80/) during which [Ryan Worl](https://twitter.com/ryanworl) explained how FDB can be used as an extremely powerful primitive for building distributed systems. This piqued my interest because distributed systems engineers are **severely** lacking in good primitives. 62 | 63 | But why does FDB make for such a good primitive? To answer that question, we first need to understand the data model of FoundationDB. FoundationDB is a distributed system that provides the following semantics: 64 | 65 | 1. Key/Value storage where keys and values can be arbitrary byte arrays. 66 | 2. Keys are "sorted" lexicographically such that reading and truncating large sorted ranges is efficient. 67 | 3. Automatic detection and redistribution of hot keys (this one is particularly notable and I’m not aware of many other systems that handle this gracefully). 68 | 4. Fully ACID transactions at the highest level of isolation (`strict serializability`) across arbitrary key/value pairs. 69 | 70 | This is basically the holy grail of primitives for building distributed systems. For example, the architecture of almost every "distributed SQL" database on the market right now boils down to some (admittedly really hairy) logic for dealing with SQL and transactions wrapped around a distributed key/value store: 71 | 72 | - [Exhibit A](https://pingcap.com/docs/v3.0/architecture/) 73 | - [Exhibit B](https://github.com/cockroachdb/cockroach/blob/master/docs/design.md) 74 | 75 | While there are other systems out there that offer semantics similar to FoundationDB's, FDB is notable for the fact that it was designed from the ground up with the idea of building other distributed systems on top of it, and this decision permeates the entire system, from its architecture and documentation to its performance characteristics and APIs. 76 | 77 | On top of that, it’s impossible to spend any amount of time with FDB and not come away with a deep appreciation for the level of careful consideration and engineering that went into it. 78 | 79 | The path to distributed systems hell is paved with good ideas ruined by mediocre and poorly tested implementations, and there is nothing mediocre or poorly tested about FoundationDB.
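To make that programming model concrete, here's a minimal sketch of what those guarantees look like through FDB's official Go bindings. The key names (`series/count`, `series/last-updated`) and the `bumpCounter` helper are made up for illustration; the rest is the standard API:

```golang
package main

import (
	"time"

	"github.com/apple/foundationdb/bindings/go/src/fdb"
)

// bumpCounter stands in for whatever application logic would be applied to
// the value that was just read (parse it, increment it, re-encode it, etc.).
func bumpCounter(prev []byte) []byte { return append(prev, '+') }

func main() {
	fdb.MustAPIVersion(610)
	db := fdb.MustOpenDefault()

	// A read-modify-write that touches multiple keys. Either every mutation
	// in the function commits or none of them do, and the read is strictly
	// serializable with respect to every other transaction in the cluster.
	// The bindings retry the function automatically on conflicts.
	_, err := db.Transact(func(tr fdb.Transaction) (interface{}, error) {
		current, err := tr.Get(fdb.Key("series/count")).Get()
		if err != nil {
			return nil, err
		}
		tr.Set(fdb.Key("series/count"), bumpCounter(current))
		tr.Set(fdb.Key("series/last-updated"), []byte(time.Now().Format(time.RFC3339)))
		return nil, nil
	})
	if err != nil {
		panic(err)
	}

	// Because keys are kept sorted, reading everything under a prefix is a
	// single efficient range read.
	_, err = db.ReadTransact(func(tr fdb.ReadTransaction) (interface{}, error) {
		pr, err := fdb.PrefixRange([]byte("series/"))
		if err != nil {
			return nil, err
		}
		return tr.GetRange(pr, fdb.RangeOptions{}).GetSliceWithError()
	})
	if err != nil {
		panic(err)
	}
}
```

That's essentially the whole mental model needed for the rest of this post: sorted keys, arbitrary byte values, and transactions that behave the way single-node transactions do.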
80 | 81 | Of course I don't want to spend this entire blog post gushing about how amazing FoundationDB is (although it really is quite good), so if you want to learn more about it, here are some resources to get started: 82 | 83 | - [(Video) Technical Overview of FoundationDB](https://www.youtube.com/watch?v=EMwhsGsxfPU) 84 | 85 | - [The Docs](https://apple.github.io/foundationdb/#documentation) 86 | 87 | Now that we have established some much-needed context, let’s switch gears and actually build something! 88 | 89 | ## The Design and Implementation of DiamondDB 90 | 91 | The question I wanted to explore was this: Could I build a system with the same API, compression, and performance characteristics as M3DB, but as a thin layer on top of FDB instead of a custom distributed system written from the ground up with its own storage engine (as M3DB is)? 92 | 93 | The most naive approach to storing time series data in FDB looks something like this: 94 | 95 | ```golang 96 | db.Transact(func(tr fdb.Transaction) (interface{}, error) { 97 | for _, w := range writes { 98 | key := fdb.Key(tuple.Tuple{w.ID, w.Timestamp.UnixNano()}.Pack()) 99 | tr.Set(key, tuple.Tuple{w.Value}.Pack()) 100 | } 101 | return nil, nil 102 | }) 103 | ``` 104 | 105 | Each datapoint is stored as an individual record in FDB where the key is a tuple in the form `(seriesID, timestamp)` and the value is a tuple in the form `(value)`. 106 | 107 | FDB keys are sorted so we can "efficiently" query for all the values for a given series by issuing a prefix query for all keys that begin with the specified time series ID. 108 | 109 | This design has several issues: 110 | 111 | 1. Compression is terrible because the time series ID is repeated for each record. This could be addressed by assigning each time series a unique integer ID so that each time series ID would only be stored once and all the datapoint entries would reference the integer. This is equivalent to a foreign key relationship in traditional relational databases and is easy to implement because of FDB’s strong transactional semantics; however, compression would still be poor compared to modern time series databases, as we'd still have to store the timestamp (8 bytes) and value (8 bytes) in their entirety, plus an additional 8 bytes for the time series ID "pointer" (assuming we used an unsigned 64-bit integer). 112 | 113 | 2. Write throughput is terrible because every write to FDB is a real transaction. Benchmarking on my laptop indicated that getting more than a few thousand writes per second per storage node on commodity hardware using the `ssd` engine would be difficult. We could use the `memory` engine, which is much faster while still being durable; however, that requires the entire working set of the database to fit in the memory of all the storage nodes, which is a constraint I didn’t want to impose on this project since RAM is much more expensive than disk. 114 | 115 | I didn’t expect this design to work, but it's always good to benchmark the simple approach first so you can measure exactly how much of an improvement you’re getting with the more complex solution and weigh the benefits of complexity vs. performance. 116 | 117 | The next design I attempted was to perform [Gorilla Compression](https://www.vldb.org/pvldb/vol8/p1816-teller.pdf) on the time series data. This turns out to be tricky because Gorilla compression is usually performed in memory since it involves writing individual bits at a time.
Despite this obstacle I was able to implement a prototype where each write was performed by loading the current state of a Gorilla encoder out of FDB, encoding the new value into the (now in-memory) encoder, and then finally writing the state of the encoder back to FDB. 118 | 119 | Here is a simplified version of the primary FDB transaction for this implementation: 120 | 121 | ```golang 122 | _, err := l.db.Transact(func(tr fdb.Transaction) (interface{}, error) { 123 | metadataKey := newTimeseriesMetadataKeyFromID(write.seriesID) 124 | metadata, err := tr.Get(metadataKey).Get() 125 | if err != nil { 126 | return nil, err 127 | } 128 | 129 | var ( 130 | metaValue timeSeriesMetadata 131 | dataAppend []byte 132 | enc = encoding.NewEncoder() 133 | ) 134 | 135 | if len(metadataBytes) == 0 { 136 | // Never written. 137 | enc := encoding.NewEncoder() 138 | if err := enc.Encode(write.Timestamp, write.Value); err != nil { 139 | return nil, err 140 | } 141 | 142 | metaValue = timeSeriesMetadata{ 143 | State: enc.State(), 144 | } 145 | 146 | b := enc.Bytes() 147 | if len(b) > 1 { 148 | dataAppend = enc.Bytes()[:len(b)-1] 149 | } 150 | } else { 151 | if err := json.Unmarshal(metadataBytes, &metaValue); err != nil { 152 | return nil, err 153 | } 154 | 155 | // Has been written before, restore encoder state. 156 | if err := enc.Restore(metaValue.State); err != nil { 157 | return nil, err 158 | } 159 | 160 | if err := enc.Encode(write.Timestamp, write.Value); err != nil { 161 | return nil, err 162 | } 163 | 164 | // Ensure new state gets persisted. 165 | var ( 166 | newState = enc.State() 167 | b = enc.Bytes() 168 | ) 169 | if len(b) == 0 { 170 | return nil, errors.New("encoder bytes was length zero") 171 | } 172 | if len(b) == 1 { 173 | // The existing last byte was modified without adding any additional bytes. The last 174 | // byte is always tracked by the state so there is nothing to append here. 175 | } 176 | if len(b) > 1 { 177 | // The last byte will be kept track of by the state, but any bytes preceding it are 178 | // new "complete" bytes which should be appended to the compressed stream. 179 | dataAppend = b[:len(b)-1] 180 | } 181 | metaValue.LastByte = b[len(b)-1] 182 | metaValue.State = newState 183 | } 184 | 185 | newMetadataBytes, err := json.Marshal(&metaValue) 186 | if err != nil { 187 | return nil, err 188 | } 189 | 190 | tr.Set(metadataKey, newMetadataBytes) 191 | dataKey := newTimeseriesDataKeyFromID(write.ID) 192 | tr.AppendIfFits(dataKey, dataAppend) 193 | 194 | return nil, nil 195 | }) 196 | ``` 197 | 198 | Note that Gorilla compression operates at the bit (not byte) level so some care had to be taken to manage the last byte of the compressed stream (which could be partial). 199 | 200 | This implementation provided compression levels as good as any modern time series database, but still suffered from terrible write throughput. I couldn't get more than ~5000 writes per second on my laptop using this implementation which makes a lot of sense considering that even though sometimes I was only adding a bit or two to a compressed stream, FDB still had to read/write an entire page of data to add those two additional bits of information since it uses a modified version of SQLite as its storage engine. 201 | 202 | The conclusion I came to was that in order to achieve high write throughput I'd have to implement a semi-stateful system in front of FDB. 
I'd been trying to avoid doing this since it's much more complicated than implementing a simple stateless layer, but in the words of Spiderman: "With great scale comes great complexity". 203 | 204 | What do I mean by "semi-stateful"? Let’s start by looking at the architecture of a truly stateful system, such as M3DB. 205 | 206 | M3DB's storage system behaves similar to a [Log Structured Merge Tree](https://en.wikipedia.org/wiki/Log-structured_merge-tree), except instead of compacting based on levels, compaction is based on time. 207 | 208 | ![](./resources/m3db_storage.png) 209 | 210 | To put that into concrete terms, as M3DB accepts writes they're immediately written to the commit log for durability. This ensures that all acknowledged writes can always be recovered in the case of a crash or failure. At the same time, incoming writes are also buffered in memory where they are actively compressed using Gorilla encoding. 211 | 212 | At regular intervals, data that has been compressed in memory is flushed to disk as immutable files (merging with any existing files if they already exist) where each set of files contains all of the data for all of the values during a fixed "block" period. For example, if the blocksize is configured to 2 hours then a set of files would contain all values with timestamps between 12pm and 2pm. 213 | 214 | This architecture allows M3DB to achieve high levels of compression AND also achieve high write throughput (since writes only need to be buffered in memory and written to a commitlog file before being acknowledged). The only caveat is that if an M3DB node fails (for whatever reason) when it then starts back up it will first need to read the commitlog in its entirety and rebuild the pre-failure in-memory state before it can begin serving reads. This can take some time. 215 | 216 | Another way to understand M3DB’s architecture is that at any point in time an acknowledged write must live in either an immutable fileset **or** a mutable encoder **and** a commit log file. 217 | 218 | ![](./resources/m3db_time.png) 219 | 220 | I decided that if I was going to achieve similar levels of performance as M3DB that I would need to replicate the architecture as well. 221 | 222 | ![](./resources/fdb_storage.png) 223 | 224 | Notice that the architecture looks very similar to M3DB's, except instead of using the filesystem we use FDB. This is why I refer to the architecture as "semi-stateful". It's stateful in the sense that it needs to hold some state in memory and if a node fails or reboots it will have to "bootstrap" that state from the commit logs just like M3DB does. 225 | 226 | However, since the commit logs and compressed data blocks are stored in FDB we don't have to worry about storage state. This is important because it greatly simplifies operational concerns. For example, imagine we wanted to run our database on Kubernetes. Accomplishing this with a completely stateful system like M3DB requires using [StatefulSets](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/), [Persistent Volumes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/), and also writing an [operator](https://github.com/m3db/m3db-operator) to manage it all. With this implementation where the storage is backed by FDB, running this on Kubernetes would be much more straightforward. 
As long as we had some way to assign each instance of our database a unique identifier, Kubernetes would be free to move the instances around since each instance could simply bootstrap itself from FDB after being moved. 227 | 228 | Of course, all of this relies upon the fact that you're able to maintain and operate an FDB cluster, but that's the point of building on top of FDB. Once you've figured out how to set up and operate FDB clusters, you can build all of your other distributed systems on top of it and let FDB handle the most complicated portions of the distributed systems so you can focus on the portions that are unique to the problem you're trying to solve. 229 | 230 | Let’s examine the implementation of this architecture in more detail, starting with the commit logs. If you want to read the code yourself, you can find it [here](https://github.com/richardartoul/tsdb-layer/blob/master/src/layer/rawblock/commitlog.go). The general idea with the commitlog is that we need to batch many writes together and encode them into a binary format that can be decoded in a linear fashion quickly. The commitlog format does **not** need to support random reads in an efficient manner. 231 | 232 | Implementing this is rather straightforward: we just need to gather writes together and then send them in large batches to FDB. Most of the code linked above concerns itself with making sure we can do this in a performant manner, as well as concurrent signaling (since we can't acknowledge writes back to the caller until a commitlog "chunk" containing all of their writes has been persisted in FDB). 233 | 234 | The other requirement of the commit log chunks is that we need to be able to: 235 | 236 | 1. Fetch all undeleted commitlog chunks from FDB (required to "bootstrap" an instance after a restart/failure/reschedule) 237 | 2. Delete all commitlog chunks before a provided chunk (the reason for this will become clear in a minute) 238 | 239 | Luckily, both of these operations are relatively efficient in FDB as long as the keys are formatted correctly. Remember, the abstraction provided by FDB is that of a sorted key/value store, so we just need to format the keys such that they sort lexicographically in a way that makes the two operations described above efficient. 240 | 241 | The way we accomplish this is very straightforward. We use FDB's [tuple layer](https://apple.github.io/foundationdb/data-modeling.html#tuples) to generate keys in the form `("commitlog", chunk_index)`, for example: `("commitlog", 0)` would be the key for the first chunk. The `commitlog` prefix is used to separate the commitlog from entries for other portions of the system, and the index provides a monotonically increasing number so that we can perform operations like: "delete all commitlog chunks before chunk number #127". 242 | 243 | The storage engine will be writing out commitlog chunks constantly so they need to be cleaned up regularly. But how do we know when it's safe to delete a given chunk? One easy way to do that is to take advantage of the fact that the chunks are ordered. 244 | 245 | We can set up a background process that runs at regular intervals and performs the following steps: 246 | 247 | 1. Wait for a new commitlog chunk to be written out and then take note of the index of the chunk. 248 | 2.
Flush all in-memory buffered data as compressed chunks to FDB (note that the storage engine will still be accepting writes while this operation is going on, but that’s fine; this flow only needs to ensure that all writes that were already acknowledged **before** the commitlog chunk from step #1 was written out are flushed to FDB). 249 | 3. Delete all commitlog chunks with an index **lower** than the chunk from step #1. Note that this operation is now safe because the previous step (if it succeeds) guarantees that all the data in all the commitlog chunks that will be deleted have already been persisted to FDB in the form of compressed data chunks. 250 | 251 | ![](./resources/fdb_time.png) 252 | 253 | Using the diagram above as an example, the persistence loop would wait for chunk #3 to be flushed, then the buffer would begin flushing everything that was currently in memory, and finally, once that completed, the storage engine could delete all commitlog chunks lower than 4 because all the data they contained was flushed to FDB as compressed chunks. 254 | 255 | In code, it looks like this: 256 | 257 | ```golang 258 | func (l *rawBlock) startPersistLoop() { 259 | for { 260 | // Prevent excessive activity when there are no incoming writes. 261 | time.Sleep(persistLoopInterval) 262 | 263 | // truncToken is opaque to the caller but the commit log can use it 264 | // to truncate all chunks whose index is lower than the chunk that 265 | // was just flushed as part of the commit log rotation. 266 | truncToken, err := l.cl.WaitForRotation() 267 | if err != nil { 268 | log.Printf("error waiting for commitlog rotation: %v", err) 269 | continue 270 | } 271 | 272 | if err := l.buffer.Flush(); err != nil { 273 | log.Printf("error flushing buffer: %v", err) 274 | continue 275 | } 276 | 277 | if err := l.cl.Truncate(truncToken); err != nil { 278 | log.Printf("error truncating commitlog: %v", err) 279 | continue 280 | } 281 | } 282 | } 283 | ``` 284 | 285 | The last thing to consider about the commit log chunks is that once an instance is restarted it will need to read all of the existing chunks before accepting any writes or reads. I didn’t implement this in the prototype to save time, and because I’m fairly certain it wouldn’t be an issue from a performance perspective since read performance is one of FDB’s strengths. 286 | 287 | The next thing we need to understand is how the `buffer` works, both in terms of read and write operations, as well as the `Flush()` mechanism that we alluded to in the snippet above. 288 | 289 | The `buffer` system's job is straightforward: buffer writes in memory (actively Gorilla compressing them to save memory) until the compressed block can be merged with an existing one in FDB, or inserted as a new chunk entirely. 290 | 291 | I won't go over the implementation of the encoders themselves because that's mainly just straightforward bit-fiddling and described well in the [Gorilla paper](https://www.vldb.org/pvldb/vol8/p1816-teller.pdf). Also, the compression code is mostly just a knock-off of M3DB's ;). If you're really curious, you can check out the code for the [encoder](https://github.com/richardartoul/tsdb-layer/blob/master/src/encoding/encoder.go) and [decoder](https://github.com/richardartoul/tsdb-layer/blob/master/src/encoding/decoder.go) here.
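To give a flavor of the bit-fiddling involved, here's a tiny, self-contained illustration (not the real encoder) of the delta-of-delta transformation that Gorilla-style encoders apply to timestamps. For samples arriving at a regular interval, almost every encoded value collapses to zero, which is what makes it possible to spend only a bit or two per timestamp:

```golang
package main

import "fmt"

// deltaOfDeltas shows the first step a Gorilla-style encoder applies to
// timestamps: record the first timestamp and delta up front, then store only
// the *change* in the delta for every subsequent sample.
func deltaOfDeltas(timestamps []int64) []int64 {
	if len(timestamps) < 3 {
		return nil
	}
	out := make([]int64, 0, len(timestamps)-2)
	prevDelta := timestamps[1] - timestamps[0]
	for i := 2; i < len(timestamps); i++ {
		delta := timestamps[i] - timestamps[i-1]
		out = append(out, delta-prevDelta)
		prevDelta = delta
	}
	return out
}

func main() {
	// Samples arriving every 10 units, with one arriving slightly late.
	ts := []int64{1000, 1010, 1020, 1030, 1041, 1051}
	fmt.Println(deltaOfDeltas(ts)) // [0 0 1 -1]
}
```

The real encoders then bit-pack those near-zero values (a delta-of-delta of zero costs a single bit in the Gorilla scheme) and apply an analogous XOR-based trick to the float64 values, which is where all of the partial-byte bookkeeping in the linked code comes from.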
The only thing you really need to understand about the encoder and decoders are their (simplified) interfaces: 292 | 293 | ```golang 294 | type Encoder interface { 295 | Encode(timestamp time.Time, value float64) error 296 | Bytes() []byte 297 | } 298 | 299 | type Decoder interface { 300 | Next() bool 301 | Current() (time.Time, float64) 302 | Err() error 303 | Reset(b []byte) 304 | } 305 | ``` 306 | 307 | The implementation of the buffer itself is reasonably straightforward. The actual struct looks like this: 308 | 309 | ```golang 310 | type buffer struct { 311 | sync.Mutex 312 | encoders map[string][]encoding.Encoder 313 | } 314 | ``` 315 | 316 | The basic data structure is a synchronized hashmap from time series ID to an array of encoders. The existing implementation is simplified to make things easier and as a result has a few basic limitations (like the inability to write data points out of order) that would require a slightly more complicated data structure to solve, but the basic idea and performance would remain the same. 317 | 318 | Let's start by looking at the write path. This is the most straightforward part. All the encoders are treated as immutable (except the last one), so writing is as simple as finding the last encoder for a given seriesID (or creating one if necessary), and then encoding the newest value into it. 319 | 320 | ```golang 321 | func (b *buffer) Write(writes []layer.Write) error { 322 | b.Lock() 323 | defer b.Unlock() 324 | 325 | for _, w := range writes { 326 | encoders, ok := b.encoders[w.ID] 327 | if !ok { 328 | encoders = []encoding.Encoder{encoding.NewEncoder()} 329 | b.encoders[w.ID] = encoders 330 | } 331 | 332 | enc := encoders[len(encoders)-1] 333 | lastT, _, hasWrittenAnyValues := enc.LastEncoded() 334 | if hasWrittenAnyValues { 335 | if w.Timestamp.Before(lastT) { 336 | return fmt.Errorf( 337 | "cannot write data out of order, series: %s, prevTimestamp: %s, currTimestamp: %s", 338 | w.ID, lastT.String(), w.Timestamp.String()) 339 | } 340 | if w.Timestamp.Equal(lastT) { 341 | return fmt.Errorf( 342 | "cannot upsert existing values, series: %s, currTimestamp: %s", 343 | w.ID, lastT.String()) 344 | } 345 | } 346 | 347 | if err := enc.Encode(w.Timestamp, w.Value); err != nil { 348 | return err 349 | } 350 | } 351 | 352 | return nil 353 | } 354 | ``` 355 | 356 | Before we discuss the Read path, we need to go over the `Flush` path which is how data gets moved from temporary storage in the in-memory buffers to persistent storage in FDB. Remember from our earlier discussion of the background "persist loop" that the contract of the `Flush` method is that when it completes all writes that were already in the buffer when the function started **must** be persisted to FDB. 357 | 358 | The actual implementation (which you can read through [here](https://github.com/richardartoul/tsdb-layer/blob/master/src/layer/rawblock/buffer.go)) is unfortunately complicated by some complex synchronization and concurrency code that I don't want to delve into right now (mainly for performance reasons) but the basic idea is simple: iterate through every time series that was in memory when the function started, create a new encoder for it (making all previous encoders immutable) into which new writes will be encoded, and then flush all the immutable encoders to FDB. 359 | 360 | The first step for flushing the in-memory encoder data to FDB is to retrieve the existing metadata for that series from FDB. 
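Stripped of that synchronization, the skeleton of the flush flow might look something like the sketch below. This is a simplified illustration rather than the actual implementation, and the `flushEncoders` helper is hypothetical — it stands in for the per-series FDB transaction described next:

```golang
// Flush (simplified sketch): seal everything currently in memory, swap in
// fresh encoders for incoming writes, then persist the sealed encoders.
func (b *buffer) Flush() error {
	// Swap under the lock so that writes which arrive after this point go
	// into a brand new encoder and the sealed ones are never mutated again.
	b.Lock()
	sealed := make(map[string][]encoding.Encoder, len(b.encoders))
	for seriesID, encoders := range b.encoders {
		sealed[seriesID] = encoders
		b.encoders[seriesID] = append(encoders, encoding.NewEncoder())
	}
	b.Unlock()

	// Persist outside the lock. Each series gets its own FDB transaction:
	// read its metadata, merge or append a compressed chunk, and write the
	// updated metadata back.
	for seriesID, encoders := range sealed {
		if err := b.flushEncoders(seriesID, encoders); err != nil {
			return err
		}
	}
	// (Omitted: dropping the sealed encoders once they have been flushed,
	// plus all of the error handling and signaling in the real code.)
	return nil
}
```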
The metadata stored in FDB for each series looks like this: 361 | 362 | ```golang 363 | type tsMetadata struct { 364 | Chunks []chunkMetadata 365 | } 366 | ``` 367 | 368 | and each `chunkMetadata` looks like this: 369 | 370 | ```golang 371 | type chunkMetadata struct { 372 | Key []byte 373 | First time.Time 374 | Last time.Time 375 | SizeBytes int 376 | } 377 | ``` 378 | 379 | The series metadata entry serves as a sort of index for the series data by keeping track of all the compressed data chunks associated with that series. For each chunk it keeps track of: 380 | 381 | 1. The FDB key for that chunk (so that the chunk can be retrieved). 382 | 2. The timestamp for the first and last datapoint stored in the compressed block. This information is important for the read path as it informs us which chunks need to be retrieved to satisfy a query with a given time range. It can also be useful to the merge logic so that it can make better decisions about which chunks to merge together to form larger contiguous blocks. 383 | 3. The size (in bytes) of the chunk. This is used by the merging logic to determine when an encoder that is being flushed should be inserted as a new chunk or merged with an existing one. 384 | 385 | This is where FDB's unique programming model really shines. I said earlier that FoundationDB provides the abstraction of a sorted key/value storage system, but more importantly, it supports completely ACID transactions at the highest level of isolation `strict serializability` (which means you're not vulnerable to the types of bugs described in [this excellent blog post by the FaunaDB team](https://fauna.com/blog/demystifying-database-systems-correctness-anomalies-under-serializable-isolation)). 386 | 387 | Because of these guarantees, programming the flush logic is **almost** as simple as if we were programming against an in-memory system. In a single strict serializability ACID transaction we can do the following: 388 | 389 | 1. Read the existing metadata for the series being flushed. 390 | 2. Use the series metadata to decide if the data being flushed should be merged with an existing chunk or written out as a new, independent chunk (this makes experimenting with different compaction methods trivial since we don’t have to rewrite the underlying storage engine). 391 | 3. Read the existing chunk that we need to merge with (if necessary). 392 | 4. Write the merged (or new) chunk to FDB. 393 | 5. Write back the series metadata with the updated chunk information. 394 | 395 | Everything we’ve accomplished up until this point could probably have been accomplished on any distributed system with a sorted key/value interface (of which there are many). However, implementing the 5 steps described above 100% correctly with no edge-cases or race conditions using a loosely / eventually consistent distributed system like Cassandra would be a nightmare. Accomplishing it with FDB is a breeze. 396 | 397 | Finally, now that we've covered both the write and flush paths, we can discuss the read path. Implementing reads turns out to actually be quite straight forward. The steps are: 398 | 399 | 1. Read the latest version of the series metadata out of FDB. 400 | 2. Use the metadata to determine which chunk need to be pulled out of FDB to satisfy the provided query range (I.E if data points between the times of 12p.m and 2p.m are requested then any chunks where the `First`/`Last` data points intersect that range need to be pulled back). 401 | 3. 
Determine which in-memory encoders (which may not yet be flushed to FDB) also contain data points within the requested time range. 402 | 4. Return a decoder that will transparently iterate through all of the data points (returning them in order) by merging across all of the chunks retrieved from FDB as well as the in-memory encoders. This problem turns out to be equivalent to merging k sorted arrays and [this blog post](https://medium.com/outco/how-to-merge-k-sorted-arrays-c35d87aa298e) has a good explanation of how to accomplish that using a min heap. You can also take a look at my implementation [here](https://github.com/richardartoul/tsdb-layer/blob/master/src/encoding/multi_decoder.go). 403 | 404 | A lot of effort went into optimizing the write path, but we haven't done much of anything to optimize the read path. The reason for that is two-fold: 405 | 406 | 1. Systems like M3DB are designed for workloads where write throughput is much higher than read throughput. 407 | 2. FDB can perform reads at a much higher rate than writes by default, so less optimization is required. 408 | 409 | Let's pause for a moment and see if we’ve accomplished our goals. To reiterate, we wanted our system to implement the `Write` and `Read` interfaces (check) as well as satisfy the following properties: 410 | 411 | 1. Horizontal scalability - Check. Benchmarking shows that this design has a transaction conflict rate near zero which means the number of transactions we can do [should scale linearly as we add hardware](https://apple.github.io/foundationdb/performance.html). 412 | 2. High write throughput - Check. This implementation can easily handle over a million logical writes/s on my 2018 MacbookPro. 413 | 3. Efficient compression - Check. We’re using almost the exact same time series compression that M3DB and all the other popular time series databases use. 414 | 415 | ## Future Considerations and Extensions 416 | 417 | DiamondDB is missing a ton of features, but most notably it lacks: 418 | 419 | 1. The ability to store and compress custom types 420 | 2. Secondary indexing (Ex. Fetch all time series where `city` tag equals `san_francisco`) 421 | 3. Automatic TTL (time to live I.E data should “expire” after a certain period of time) 422 | 423 | ### Complex Types 424 | 425 | Storing and compressing custom types turns out to be the easiest to solve. All we have to do is replace our Gorilla encoder with one that can efficiently compress more complicated types. Fortunately, we had to solve that exact problem recently in M3DB as part of our plan to evolve it from a metrics store to a more general purpose time series database. The solution we came up with was to model our complex types as Protobufs and then write a general purpose compression scheme that can perform streaming delta compression of Protobuf messages much like Gorilla performs streaming delta compression of floats. The code for that solution is [open source](https://github.com/m3db/m3/tree/master/src/dbnode/encoding/proto) and could be lifted directly into DiamondDB. If you’re curious about how the bit-fiddly details of how the compression works, take a look at [this documentation](https://github.com/m3db/m3/blob/master/src/dbnode/encoding/proto/docs/encoding.md). 426 | 427 | ### Inverted / Secondary Indexing 428 | 429 | Next up is secondary indexing. 
We already got a brief glimpse of how to perform secondary indexing in FDB earlier with the `flush` code where we atomically wrote a new time series chunk and updated the series’ metadata entry (which is effectively a secondary index over the compressed data chunks). Implementing exact-match secondary indexing would be fairly straightforward. For example, let's say we wanted to implement a tag-based inverted index like the one M3DB supports. For each unique tag key/value pair in the index we would store an FDB entry containing a list of all the time series IDs that were tagged with that pair. The image below depicts a simple example of how to store and index two separate time series: 430 | 431 | ![](./resources/fdb_index.png) 432 | 433 | If we wanted to query for all the time series where the `city` tag is equal to `san_francisco` then we would retrieve the FDB entry with the key `("city", "san_francisco")`, which would immediately tell us that the applicable series are `sf_num_widgets` and `sf_num_people`. More complicated queries could be executed by unioning and intersecting the results of these individual term queries. For example, it's not difficult to imagine how this simple schema could evaluate the query: “fetch all time series where `city` equals `san_francisco` OR `type` equals `widgets`”. Ta da! We’ve just implemented a postings list on top of FDB. 434 | 435 | Of course, this is a fairly naive solution that could end up using a lot of disk space. If we’re willing to exchange complexity for better compression, we could assign each time series a unique ID (an operation that can be implemented efficiently in FDB) and then store a list of integers instead of time series IDs. This would reduce the size of secondary index entries substantially, but we could take it even further by storing a [roaring bitmap](https://roaringbitmap.org/) instead of a list of integers. 436 | 437 | Supporting regular expression queries (as M3DB does) gets more complicated, and if I’m being completely honest, I’d have to spend a few weeks building prototypes to come up with the best way to do this. Luckily, this is the internet, so I can just tell you my opinion with absolutely no evidence to back it up. 438 | 439 | First, the naive solution. In addition to the index entries from the previous example, we could also store entries where the key is the tag name and the value is a list of all the unique values that exist for that tag. We could then retrieve those index entries on-demand and run regular expressions on them in memory. This would tell us which unique tag/value pairs exist that match the regular expression, which we could then use to pull back the index entries from the previous example and look up the matching series IDs. For many use-cases this would be reasonably performant, but you didn’t put up with my rambling for this long to settle for anything reasonable! 440 | 441 | M3DB handles regular expression queries by maintaining a [finite state transducer](https://en.wikipedia.org/wiki/Finite-state_transducer) (FST) for each tag in the inverted index (in our example above there would be an FST for the `city` tag and another for the `type` tag). The FST itself stores a mapping between all the unique values for the tag (`widgets` and `num_people` for the `type` tag) and an integer. In M3DB the integer is an offset into a file where a [roaring bitmap](https://roaringbitmap.org/) is stored that contains the unique integer IDs of all the time series that contain that tag.
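To make the postings-list idea concrete, here's a rough sketch (not DiamondDB or M3DB code) of how those term queries could be evaluated once every series has been assigned an integer ID, using the [roaring](https://github.com/RoaringBitmap/roaring) Go library. The tag values and series IDs are made up for illustration:

```golang
package main

import (
	"fmt"

	"github.com/RoaringBitmap/roaring"
)

func main() {
	// One postings bitmap per unique tag key/value pair. In the FDB version
	// each of these would live as the value of an index key rather than in
	// an in-memory map.
	postings := map[string]*roaring.Bitmap{
		"city=san_francisco": roaring.BitmapOf(1, 2), // e.g. sf_num_widgets, sf_num_people
		"city=new_york":      roaring.BitmapOf(3),
		"type=widgets":       roaring.BitmapOf(1, 3),
	}

	// city == san_francisco AND type == widgets -> intersect the bitmaps.
	and := roaring.And(postings["city=san_francisco"], postings["type=widgets"])
	fmt.Println(and.ToArray()) // [1]

	// city == san_francisco OR type == widgets -> union the bitmaps.
	or := roaring.Or(postings["city=san_francisco"], postings["type=widgets"])
	fmt.Println(or.ToArray()) // [1 2 3]
}
```

Each of these bitmaps serializes to a compact byte slice, so they map naturally onto FDB values as long as an individual bitmap stays under FDB's value size limit.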
Andrew Gallant’s now-famous [blog post](https://blog.burntsushi.net/transducers/) is a great resource to understand why FSTs solve this problem so effectively, but the short of it is that they’re incredibly efficient in this situation because they have the dual properties of: 442 | 443 | 1. Compressing extremely well. 444 | 2. Supporting performant regular expression matching against them. 445 | 446 | Could we leverage this solution in our FDB-backed system? Possibly. We’ve already discussed storing complex data structures like roaring bitmaps in FDB and there is no reason we couldn’t do something similar with FSTs. One limitation we might run into, however, is the fact that an individual value in FDB can’t exceed 100KB in size, which seems like a show-stopper, but we could probably work around it. For example, it’s not hard to imagine designing an mmap-like interface in the programming language of your choice that provides the abstraction of a byte array of arbitrary size that is transparently split and mapped onto FDB. You could then fork / modify existing FST libraries to execute against this interface since many of them are already designed with the ability to execute against FSTs stored in byte arrays or mmap’d files. 447 | 448 | ### Data Time To Live (TTL) 449 | 450 | Finally, let's talk about automatic TTLs (data expiry). I saved this one for last because it’s just a special case of secondary indexing and there are numerous ways you could build indices that would allow you to expire and clean up data in an efficient manner, but this blog post is already far too long. 451 | 452 | ## Conclusion 453 | 454 | There are lots of distributed systems problems that are difficult to solve, but that can be implemented trivially as stateless layers over FoundationDB. However, some problems that seem like a poor match for FDB at first glance can actually be solved with a semi-stateful layer. Of course, building a semi-stateful layer is significantly more complicated than building a stateless one, but it's also significantly **less** complicated than building a distributed system from scratch. While I cut a lot of corners in my implementation, I was still able to build a distributed system that can accept millions of time series writes per second (with competitive levels of compression) in under 2,000 lines of Go code. It's not hard to imagine that with a few more weeks or months of dedicated work and a few thousand more lines of code we could build this out into a production-ready system. 455 | 456 | FoundationDB will never be able to beat a purpose-built storage engine, but programming against it is much easier than programming against the operating system, filesystem, network, and physical hardware. In a lot of ways building a distributed system on top of FDB after having built one from scratch feels a lot like upgrading to Python from assembly. 457 | 458 | FoundationDB is fast, reliable, easy to use, and a lot of fun to program against. Next time you need to build a distributed system consider if FDB could make your job a little bit easier. You might be surprised by just how far you can push it with the right design. 459 | 460 | -------------------------------------------------------------------------------- /protos/.gen/rpc.pb.go: -------------------------------------------------------------------------------- 1 | // Code generated by protoc-gen-go. DO NOT EDIT.
2 | // source: rpc.proto 3 | 4 | package tsdblayer 5 | 6 | import ( 7 | context "context" 8 | fmt "fmt" 9 | proto "github.com/golang/protobuf/proto" 10 | _ "google.golang.org/genproto/googleapis/api/annotations" 11 | grpc "google.golang.org/grpc" 12 | codes "google.golang.org/grpc/codes" 13 | status "google.golang.org/grpc/status" 14 | math "math" 15 | ) 16 | 17 | // Reference imports to suppress errors if they are not otherwise used. 18 | var _ = proto.Marshal 19 | var _ = fmt.Errorf 20 | var _ = math.Inf 21 | 22 | // This is a compile-time assertion to ensure that this generated file 23 | // is compatible with the proto package it is being compiled against. 24 | // A compilation error at this line likely means your copy of the 25 | // proto package needs to be updated. 26 | const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package 27 | 28 | type WriteBatchRequest struct { 29 | Batch []*WriteRequest `protobuf:"bytes,1,rep,name=batch,proto3" json:"batch,omitempty"` 30 | XXX_NoUnkeyedLiteral struct{} `json:"-"` 31 | XXX_unrecognized []byte `json:"-"` 32 | XXX_sizecache int32 `json:"-"` 33 | } 34 | 35 | func (m *WriteBatchRequest) Reset() { *m = WriteBatchRequest{} } 36 | func (m *WriteBatchRequest) String() string { return proto.CompactTextString(m) } 37 | func (*WriteBatchRequest) ProtoMessage() {} 38 | func (*WriteBatchRequest) Descriptor() ([]byte, []int) { 39 | return fileDescriptor_77a6da22d6a3feb1, []int{0} 40 | } 41 | 42 | func (m *WriteBatchRequest) XXX_Unmarshal(b []byte) error { 43 | return xxx_messageInfo_WriteBatchRequest.Unmarshal(m, b) 44 | } 45 | func (m *WriteBatchRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { 46 | return xxx_messageInfo_WriteBatchRequest.Marshal(b, m, deterministic) 47 | } 48 | func (m *WriteBatchRequest) XXX_Merge(src proto.Message) { 49 | xxx_messageInfo_WriteBatchRequest.Merge(m, src) 50 | } 51 | func (m *WriteBatchRequest) XXX_Size() int { 52 | return xxx_messageInfo_WriteBatchRequest.Size(m) 53 | } 54 | func (m *WriteBatchRequest) XXX_DiscardUnknown() { 55 | xxx_messageInfo_WriteBatchRequest.DiscardUnknown(m) 56 | } 57 | 58 | var xxx_messageInfo_WriteBatchRequest proto.InternalMessageInfo 59 | 60 | func (m *WriteBatchRequest) GetBatch() []*WriteRequest { 61 | if m != nil { 62 | return m.Batch 63 | } 64 | return nil 65 | } 66 | 67 | type ReadBatchRequest struct { 68 | Batch []*ReadRequest `protobuf:"bytes,1,rep,name=batch,proto3" json:"batch,omitempty"` 69 | XXX_NoUnkeyedLiteral struct{} `json:"-"` 70 | XXX_unrecognized []byte `json:"-"` 71 | XXX_sizecache int32 `json:"-"` 72 | } 73 | 74 | func (m *ReadBatchRequest) Reset() { *m = ReadBatchRequest{} } 75 | func (m *ReadBatchRequest) String() string { return proto.CompactTextString(m) } 76 | func (*ReadBatchRequest) ProtoMessage() {} 77 | func (*ReadBatchRequest) Descriptor() ([]byte, []int) { 78 | return fileDescriptor_77a6da22d6a3feb1, []int{1} 79 | } 80 | 81 | func (m *ReadBatchRequest) XXX_Unmarshal(b []byte) error { 82 | return xxx_messageInfo_ReadBatchRequest.Unmarshal(m, b) 83 | } 84 | func (m *ReadBatchRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { 85 | return xxx_messageInfo_ReadBatchRequest.Marshal(b, m, deterministic) 86 | } 87 | func (m *ReadBatchRequest) XXX_Merge(src proto.Message) { 88 | xxx_messageInfo_ReadBatchRequest.Merge(m, src) 89 | } 90 | func (m *ReadBatchRequest) XXX_Size() int { 91 | return xxx_messageInfo_ReadBatchRequest.Size(m) 92 | } 93 | func (m *ReadBatchRequest) XXX_DiscardUnknown() { 94 | 
xxx_messageInfo_ReadBatchRequest.DiscardUnknown(m) 95 | } 96 | 97 | var xxx_messageInfo_ReadBatchRequest proto.InternalMessageInfo 98 | 99 | func (m *ReadBatchRequest) GetBatch() []*ReadRequest { 100 | if m != nil { 101 | return m.Batch 102 | } 103 | return nil 104 | } 105 | 106 | type ReadBatchResponse struct { 107 | Batch []*ReadResponse `protobuf:"bytes,1,rep,name=batch,proto3" json:"batch,omitempty"` 108 | XXX_NoUnkeyedLiteral struct{} `json:"-"` 109 | XXX_unrecognized []byte `json:"-"` 110 | XXX_sizecache int32 `json:"-"` 111 | } 112 | 113 | func (m *ReadBatchResponse) Reset() { *m = ReadBatchResponse{} } 114 | func (m *ReadBatchResponse) String() string { return proto.CompactTextString(m) } 115 | func (*ReadBatchResponse) ProtoMessage() {} 116 | func (*ReadBatchResponse) Descriptor() ([]byte, []int) { 117 | return fileDescriptor_77a6da22d6a3feb1, []int{2} 118 | } 119 | 120 | func (m *ReadBatchResponse) XXX_Unmarshal(b []byte) error { 121 | return xxx_messageInfo_ReadBatchResponse.Unmarshal(m, b) 122 | } 123 | func (m *ReadBatchResponse) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { 124 | return xxx_messageInfo_ReadBatchResponse.Marshal(b, m, deterministic) 125 | } 126 | func (m *ReadBatchResponse) XXX_Merge(src proto.Message) { 127 | xxx_messageInfo_ReadBatchResponse.Merge(m, src) 128 | } 129 | func (m *ReadBatchResponse) XXX_Size() int { 130 | return xxx_messageInfo_ReadBatchResponse.Size(m) 131 | } 132 | func (m *ReadBatchResponse) XXX_DiscardUnknown() { 133 | xxx_messageInfo_ReadBatchResponse.DiscardUnknown(m) 134 | } 135 | 136 | var xxx_messageInfo_ReadBatchResponse proto.InternalMessageInfo 137 | 138 | func (m *ReadBatchResponse) GetBatch() []*ReadResponse { 139 | if m != nil { 140 | return m.Batch 141 | } 142 | return nil 143 | } 144 | 145 | type WriteRequest struct { 146 | SeriesId string `protobuf:"bytes,1,opt,name=series_id,json=seriesId,proto3" json:"series_id,omitempty"` 147 | Datapoint *Datapoint `protobuf:"bytes,2,opt,name=datapoint,proto3" json:"datapoint,omitempty"` 148 | XXX_NoUnkeyedLiteral struct{} `json:"-"` 149 | XXX_unrecognized []byte `json:"-"` 150 | XXX_sizecache int32 `json:"-"` 151 | } 152 | 153 | func (m *WriteRequest) Reset() { *m = WriteRequest{} } 154 | func (m *WriteRequest) String() string { return proto.CompactTextString(m) } 155 | func (*WriteRequest) ProtoMessage() {} 156 | func (*WriteRequest) Descriptor() ([]byte, []int) { 157 | return fileDescriptor_77a6da22d6a3feb1, []int{3} 158 | } 159 | 160 | func (m *WriteRequest) XXX_Unmarshal(b []byte) error { 161 | return xxx_messageInfo_WriteRequest.Unmarshal(m, b) 162 | } 163 | func (m *WriteRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { 164 | return xxx_messageInfo_WriteRequest.Marshal(b, m, deterministic) 165 | } 166 | func (m *WriteRequest) XXX_Merge(src proto.Message) { 167 | xxx_messageInfo_WriteRequest.Merge(m, src) 168 | } 169 | func (m *WriteRequest) XXX_Size() int { 170 | return xxx_messageInfo_WriteRequest.Size(m) 171 | } 172 | func (m *WriteRequest) XXX_DiscardUnknown() { 173 | xxx_messageInfo_WriteRequest.DiscardUnknown(m) 174 | } 175 | 176 | var xxx_messageInfo_WriteRequest proto.InternalMessageInfo 177 | 178 | func (m *WriteRequest) GetSeriesId() string { 179 | if m != nil { 180 | return m.SeriesId 181 | } 182 | return "" 183 | } 184 | 185 | func (m *WriteRequest) GetDatapoint() *Datapoint { 186 | if m != nil { 187 | return m.Datapoint 188 | } 189 | return nil 190 | } 191 | 192 | type ReadRequest struct { 193 | // TODO(rartoul): Time ranges. 
194 | SeriesId string `protobuf:"bytes,1,opt,name=series_id,json=seriesId,proto3" json:"series_id,omitempty"` 195 | XXX_NoUnkeyedLiteral struct{} `json:"-"` 196 | XXX_unrecognized []byte `json:"-"` 197 | XXX_sizecache int32 `json:"-"` 198 | } 199 | 200 | func (m *ReadRequest) Reset() { *m = ReadRequest{} } 201 | func (m *ReadRequest) String() string { return proto.CompactTextString(m) } 202 | func (*ReadRequest) ProtoMessage() {} 203 | func (*ReadRequest) Descriptor() ([]byte, []int) { 204 | return fileDescriptor_77a6da22d6a3feb1, []int{4} 205 | } 206 | 207 | func (m *ReadRequest) XXX_Unmarshal(b []byte) error { 208 | return xxx_messageInfo_ReadRequest.Unmarshal(m, b) 209 | } 210 | func (m *ReadRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { 211 | return xxx_messageInfo_ReadRequest.Marshal(b, m, deterministic) 212 | } 213 | func (m *ReadRequest) XXX_Merge(src proto.Message) { 214 | xxx_messageInfo_ReadRequest.Merge(m, src) 215 | } 216 | func (m *ReadRequest) XXX_Size() int { 217 | return xxx_messageInfo_ReadRequest.Size(m) 218 | } 219 | func (m *ReadRequest) XXX_DiscardUnknown() { 220 | xxx_messageInfo_ReadRequest.DiscardUnknown(m) 221 | } 222 | 223 | var xxx_messageInfo_ReadRequest proto.InternalMessageInfo 224 | 225 | func (m *ReadRequest) GetSeriesId() string { 226 | if m != nil { 227 | return m.SeriesId 228 | } 229 | return "" 230 | } 231 | 232 | type ReadResponse struct { 233 | SeriesId string `protobuf:"bytes,1,opt,name=series_id,json=seriesId,proto3" json:"series_id,omitempty"` 234 | Datapoints []*Datapoint `protobuf:"bytes,2,rep,name=datapoints,proto3" json:"datapoints,omitempty"` 235 | XXX_NoUnkeyedLiteral struct{} `json:"-"` 236 | XXX_unrecognized []byte `json:"-"` 237 | XXX_sizecache int32 `json:"-"` 238 | } 239 | 240 | func (m *ReadResponse) Reset() { *m = ReadResponse{} } 241 | func (m *ReadResponse) String() string { return proto.CompactTextString(m) } 242 | func (*ReadResponse) ProtoMessage() {} 243 | func (*ReadResponse) Descriptor() ([]byte, []int) { 244 | return fileDescriptor_77a6da22d6a3feb1, []int{5} 245 | } 246 | 247 | func (m *ReadResponse) XXX_Unmarshal(b []byte) error { 248 | return xxx_messageInfo_ReadResponse.Unmarshal(m, b) 249 | } 250 | func (m *ReadResponse) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { 251 | return xxx_messageInfo_ReadResponse.Marshal(b, m, deterministic) 252 | } 253 | func (m *ReadResponse) XXX_Merge(src proto.Message) { 254 | xxx_messageInfo_ReadResponse.Merge(m, src) 255 | } 256 | func (m *ReadResponse) XXX_Size() int { 257 | return xxx_messageInfo_ReadResponse.Size(m) 258 | } 259 | func (m *ReadResponse) XXX_DiscardUnknown() { 260 | xxx_messageInfo_ReadResponse.DiscardUnknown(m) 261 | } 262 | 263 | var xxx_messageInfo_ReadResponse proto.InternalMessageInfo 264 | 265 | func (m *ReadResponse) GetSeriesId() string { 266 | if m != nil { 267 | return m.SeriesId 268 | } 269 | return "" 270 | } 271 | 272 | func (m *ReadResponse) GetDatapoints() []*Datapoint { 273 | if m != nil { 274 | return m.Datapoints 275 | } 276 | return nil 277 | } 278 | 279 | type Datapoint struct { 280 | TimestampNanos uint64 `protobuf:"varint,1,opt,name=timestamp_nanos,json=timestampNanos,proto3" json:"timestamp_nanos,omitempty"` 281 | Value float64 `protobuf:"fixed64,2,opt,name=value,proto3" json:"value,omitempty"` 282 | XXX_NoUnkeyedLiteral struct{} `json:"-"` 283 | XXX_unrecognized []byte `json:"-"` 284 | XXX_sizecache int32 `json:"-"` 285 | } 286 | 287 | func (m *Datapoint) Reset() { *m = Datapoint{} } 288 | func (m *Datapoint) 
String() string { return proto.CompactTextString(m) } 289 | func (*Datapoint) ProtoMessage() {} 290 | func (*Datapoint) Descriptor() ([]byte, []int) { 291 | return fileDescriptor_77a6da22d6a3feb1, []int{6} 292 | } 293 | 294 | func (m *Datapoint) XXX_Unmarshal(b []byte) error { 295 | return xxx_messageInfo_Datapoint.Unmarshal(m, b) 296 | } 297 | func (m *Datapoint) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { 298 | return xxx_messageInfo_Datapoint.Marshal(b, m, deterministic) 299 | } 300 | func (m *Datapoint) XXX_Merge(src proto.Message) { 301 | xxx_messageInfo_Datapoint.Merge(m, src) 302 | } 303 | func (m *Datapoint) XXX_Size() int { 304 | return xxx_messageInfo_Datapoint.Size(m) 305 | } 306 | func (m *Datapoint) XXX_DiscardUnknown() { 307 | xxx_messageInfo_Datapoint.DiscardUnknown(m) 308 | } 309 | 310 | var xxx_messageInfo_Datapoint proto.InternalMessageInfo 311 | 312 | func (m *Datapoint) GetTimestampNanos() uint64 { 313 | if m != nil { 314 | return m.TimestampNanos 315 | } 316 | return 0 317 | } 318 | 319 | func (m *Datapoint) GetValue() float64 { 320 | if m != nil { 321 | return m.Value 322 | } 323 | return 0 324 | } 325 | 326 | type Empty struct { 327 | XXX_NoUnkeyedLiteral struct{} `json:"-"` 328 | XXX_unrecognized []byte `json:"-"` 329 | XXX_sizecache int32 `json:"-"` 330 | } 331 | 332 | func (m *Empty) Reset() { *m = Empty{} } 333 | func (m *Empty) String() string { return proto.CompactTextString(m) } 334 | func (*Empty) ProtoMessage() {} 335 | func (*Empty) Descriptor() ([]byte, []int) { 336 | return fileDescriptor_77a6da22d6a3feb1, []int{7} 337 | } 338 | 339 | func (m *Empty) XXX_Unmarshal(b []byte) error { 340 | return xxx_messageInfo_Empty.Unmarshal(m, b) 341 | } 342 | func (m *Empty) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { 343 | return xxx_messageInfo_Empty.Marshal(b, m, deterministic) 344 | } 345 | func (m *Empty) XXX_Merge(src proto.Message) { 346 | xxx_messageInfo_Empty.Merge(m, src) 347 | } 348 | func (m *Empty) XXX_Size() int { 349 | return xxx_messageInfo_Empty.Size(m) 350 | } 351 | func (m *Empty) XXX_DiscardUnknown() { 352 | xxx_messageInfo_Empty.DiscardUnknown(m) 353 | } 354 | 355 | var xxx_messageInfo_Empty proto.InternalMessageInfo 356 | 357 | func init() { 358 | proto.RegisterType((*WriteBatchRequest)(nil), "tsdblayer.WriteBatchRequest") 359 | proto.RegisterType((*ReadBatchRequest)(nil), "tsdblayer.ReadBatchRequest") 360 | proto.RegisterType((*ReadBatchResponse)(nil), "tsdblayer.ReadBatchResponse") 361 | proto.RegisterType((*WriteRequest)(nil), "tsdblayer.WriteRequest") 362 | proto.RegisterType((*ReadRequest)(nil), "tsdblayer.ReadRequest") 363 | proto.RegisterType((*ReadResponse)(nil), "tsdblayer.ReadResponse") 364 | proto.RegisterType((*Datapoint)(nil), "tsdblayer.Datapoint") 365 | proto.RegisterType((*Empty)(nil), "tsdblayer.Empty") 366 | } 367 | 368 | func init() { proto.RegisterFile("rpc.proto", fileDescriptor_77a6da22d6a3feb1) } 369 | 370 | var fileDescriptor_77a6da22d6a3feb1 = []byte{ 371 | // 408 bytes of a gzipped FileDescriptorProto 372 | 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x7c, 0x93, 0xc1, 0xce, 0xd2, 0x40, 373 | 0x10, 0xc7, 0xb3, 0x28, 0xea, 0x0e, 0x04, 0x61, 0x43, 0xa0, 0x81, 0x9a, 0x90, 0xbd, 0x48, 0x88, 374 | 0x40, 0xac, 0x9e, 0x38, 0x99, 0x06, 0x0f, 0x1a, 0x63, 0x48, 0x35, 0xf1, 0xe0, 0x81, 0x2c, 0x74, 375 | 0x83, 0x4d, 0xa0, 0xbb, 0x76, 0x17, 0x0c, 0x57, 0x5f, 0xc1, 0xf7, 0xf2, 0xf2, 0xbd, 0xc2, 0xf7, 376 | 0x20, 0x5f, 0xba, 0xfb, 0xb5, 0x2c, 0x84, 0x70, 0xec, 0xcc, 0x7f, 0x7e, 0xf3, 0x9f, 
0xce, 0x2c, 377 | 0xe0, 0x4c, 0xae, 0x27, 0x32, 0x13, 0x5a, 0x10, 0xac, 0x55, 0xbc, 0xda, 0xb2, 0x23, 0xcf, 0x7a, 378 | 0xfe, 0x46, 0x88, 0xcd, 0x96, 0x4f, 0x99, 0x4c, 0xa6, 0x2c, 0x4d, 0x85, 0x66, 0x3a, 0x11, 0xa9, 379 | 0xb2, 0x42, 0x1a, 0x42, 0xeb, 0x47, 0x96, 0x68, 0x1e, 0x32, 0xbd, 0xfe, 0x15, 0xf1, 0xdf, 0x7b, 380 | 0xae, 0x34, 0x19, 0x43, 0x75, 0x95, 0x7f, 0x7b, 0x68, 0xf0, 0x64, 0x58, 0x0b, 0xba, 0x93, 0x92, 381 | 0x36, 0x31, 0xe2, 0x47, 0x5d, 0x64, 0x55, 0xf4, 0x03, 0x34, 0x23, 0xce, 0xe2, 0x33, 0xc4, 0x9b, 382 | 0x73, 0x44, 0xc7, 0x41, 0xe4, 0xda, 0x0b, 0x42, 0x08, 0x2d, 0x87, 0xa0, 0xa4, 0x48, 0x15, 0xbf, 383 | 0xe5, 0xc2, 0x22, 0xac, 0xae, 0x60, 0x2c, 0xa1, 0xee, 0x9a, 0x23, 0x7d, 0xc0, 0x8a, 0x67, 0x09, 384 | 0x57, 0xcb, 0x24, 0xf6, 0xd0, 0x00, 0x0d, 0x71, 0xf4, 0xc2, 0x06, 0x3e, 0xc5, 0x24, 0x00, 0x1c, 385 | 0x33, 0xcd, 0xa4, 0x48, 0x52, 0xed, 0x55, 0x06, 0x68, 0x58, 0x0b, 0xda, 0x0e, 0x7f, 0x5e, 0xe4, 386 | 0xa2, 0x93, 0x8c, 0x8e, 0xa0, 0xe6, 0x58, 0xbf, 0xc9, 0xa7, 0x0c, 0xea, 0xae, 0xc7, 0xdb, 0x66, 387 | 0xde, 0x03, 0x94, 0x5d, 0x94, 0x57, 0x31, 0xd3, 0x5e, 0x77, 0xe3, 0xe8, 0xe8, 0x67, 0xc0, 0x65, 388 | 0x82, 0xbc, 0x86, 0x97, 0x3a, 0xd9, 0x71, 0xa5, 0xd9, 0x4e, 0x2e, 0x53, 0x96, 0x0a, 0x65, 0xba, 389 | 0x3c, 0x8d, 0x1a, 0x65, 0xf8, 0x6b, 0x1e, 0x25, 0x6d, 0xa8, 0x1e, 0xd8, 0x76, 0xcf, 0xcd, 0xd0, 390 | 0x28, 0xb2, 0x1f, 0xf4, 0x39, 0x54, 0x3f, 0xee, 0xa4, 0x3e, 0x06, 0xff, 0x11, 0xe0, 0xef, 0xdf, 391 | 0xe6, 0xe1, 0x97, 0xbc, 0x31, 0xf9, 0x09, 0x70, 0x3a, 0x0e, 0xe2, 0x5f, 0x9e, 0x81, 0xbb, 0xf0, 392 | 0x5e, 0xd3, 0xc9, 0x1a, 0x16, 0x7d, 0xf5, 0xf7, 0xee, 0xfe, 0x5f, 0xa5, 0x4b, 0x89, 0x39, 0xbd, 393 | 0xc3, 0xdb, 0xe9, 0x9f, 0xb2, 0x68, 0x86, 0x46, 0x24, 0x06, 0x5c, 0xee, 0x9c, 0xf4, 0x2f, 0x96, 394 | 0x7b, 0x86, 0xf6, 0xaf, 0x27, 0xed, 0xaf, 0xa5, 0xbe, 0x69, 0xd3, 0xa1, 0xad, 0xa2, 0x4d, 0x56, 395 | 0x48, 0x66, 0x68, 0x14, 0x52, 0x68, 0xe4, 0xc5, 0x63, 0x5b, 0x9d, 0xc9, 0x75, 0xd8, 0x28, 0xe7, 396 | 0x5b, 0xe4, 0x2f, 0x60, 0x81, 0x56, 0xcf, 0xcc, 0x53, 0x78, 0xf7, 0x10, 0x00, 0x00, 0xff, 0xff, 397 | 0xb2, 0x3f, 0x6f, 0x0d, 0x40, 0x03, 0x00, 0x00, 398 | } 399 | 400 | // Reference imports to suppress errors if they are not otherwise used. 401 | var _ context.Context 402 | var _ grpc.ClientConn 403 | 404 | // This is a compile-time assertion to ensure that this generated file 405 | // is compatible with the grpc package it is being compiled against. 406 | const _ = grpc.SupportPackageIsVersion4 407 | 408 | // TSDBLayerClient is the client API for TSDBLayer service. 409 | // 410 | // For semantics around ctx use and closing/ending streaming RPCs, please refer to https://godoc.org/google.golang.org/grpc#ClientConn.NewStream. 411 | type TSDBLayerClient interface { 412 | WriteBatch(ctx context.Context, in *WriteBatchRequest, opts ...grpc.CallOption) (*Empty, error) 413 | ReadBatch(ctx context.Context, in *ReadBatchRequest, opts ...grpc.CallOption) (*ReadBatchResponse, error) 414 | } 415 | 416 | type tSDBLayerClient struct { 417 | cc *grpc.ClientConn 418 | } 419 | 420 | func NewTSDBLayerClient(cc *grpc.ClientConn) TSDBLayerClient { 421 | return &tSDBLayerClient{cc} 422 | } 423 | 424 | func (c *tSDBLayerClient) WriteBatch(ctx context.Context, in *WriteBatchRequest, opts ...grpc.CallOption) (*Empty, error) { 425 | out := new(Empty) 426 | err := c.cc.Invoke(ctx, "/tsdblayer.TSDBLayer/WriteBatch", in, out, opts...) 
427 | if err != nil { 428 | return nil, err 429 | } 430 | return out, nil 431 | } 432 | 433 | func (c *tSDBLayerClient) ReadBatch(ctx context.Context, in *ReadBatchRequest, opts ...grpc.CallOption) (*ReadBatchResponse, error) { 434 | out := new(ReadBatchResponse) 435 | err := c.cc.Invoke(ctx, "/tsdblayer.TSDBLayer/ReadBatch", in, out, opts...) 436 | if err != nil { 437 | return nil, err 438 | } 439 | return out, nil 440 | } 441 | 442 | // TSDBLayerServer is the server API for TSDBLayer service. 443 | type TSDBLayerServer interface { 444 | WriteBatch(context.Context, *WriteBatchRequest) (*Empty, error) 445 | ReadBatch(context.Context, *ReadBatchRequest) (*ReadBatchResponse, error) 446 | } 447 | 448 | // UnimplementedTSDBLayerServer can be embedded to have forward compatible implementations. 449 | type UnimplementedTSDBLayerServer struct { 450 | } 451 | 452 | func (*UnimplementedTSDBLayerServer) WriteBatch(ctx context.Context, req *WriteBatchRequest) (*Empty, error) { 453 | return nil, status.Errorf(codes.Unimplemented, "method WriteBatch not implemented") 454 | } 455 | func (*UnimplementedTSDBLayerServer) ReadBatch(ctx context.Context, req *ReadBatchRequest) (*ReadBatchResponse, error) { 456 | return nil, status.Errorf(codes.Unimplemented, "method ReadBatch not implemented") 457 | } 458 | 459 | func RegisterTSDBLayerServer(s *grpc.Server, srv TSDBLayerServer) { 460 | s.RegisterService(&_TSDBLayer_serviceDesc, srv) 461 | } 462 | 463 | func _TSDBLayer_WriteBatch_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { 464 | in := new(WriteBatchRequest) 465 | if err := dec(in); err != nil { 466 | return nil, err 467 | } 468 | if interceptor == nil { 469 | return srv.(TSDBLayerServer).WriteBatch(ctx, in) 470 | } 471 | info := &grpc.UnaryServerInfo{ 472 | Server: srv, 473 | FullMethod: "/tsdblayer.TSDBLayer/WriteBatch", 474 | } 475 | handler := func(ctx context.Context, req interface{}) (interface{}, error) { 476 | return srv.(TSDBLayerServer).WriteBatch(ctx, req.(*WriteBatchRequest)) 477 | } 478 | return interceptor(ctx, in, info, handler) 479 | } 480 | 481 | func _TSDBLayer_ReadBatch_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { 482 | in := new(ReadBatchRequest) 483 | if err := dec(in); err != nil { 484 | return nil, err 485 | } 486 | if interceptor == nil { 487 | return srv.(TSDBLayerServer).ReadBatch(ctx, in) 488 | } 489 | info := &grpc.UnaryServerInfo{ 490 | Server: srv, 491 | FullMethod: "/tsdblayer.TSDBLayer/ReadBatch", 492 | } 493 | handler := func(ctx context.Context, req interface{}) (interface{}, error) { 494 | return srv.(TSDBLayerServer).ReadBatch(ctx, req.(*ReadBatchRequest)) 495 | } 496 | return interceptor(ctx, in, info, handler) 497 | } 498 | 499 | var _TSDBLayer_serviceDesc = grpc.ServiceDesc{ 500 | ServiceName: "tsdblayer.TSDBLayer", 501 | HandlerType: (*TSDBLayerServer)(nil), 502 | Methods: []grpc.MethodDesc{ 503 | { 504 | MethodName: "WriteBatch", 505 | Handler: _TSDBLayer_WriteBatch_Handler, 506 | }, 507 | { 508 | MethodName: "ReadBatch", 509 | Handler: _TSDBLayer_ReadBatch_Handler, 510 | }, 511 | }, 512 | Streams: []grpc.StreamDesc{}, 513 | Metadata: "rpc.proto", 514 | } 515 | -------------------------------------------------------------------------------- /protos/.gen/rpc.pb.gw.go: -------------------------------------------------------------------------------- 1 | // Code generated by 
protoc-gen-grpc-gateway. DO NOT EDIT. 2 | // source: rpc.proto 3 | 4 | /* 5 | Package tsdblayer is a reverse proxy. 6 | 7 | It translates gRPC into RESTful JSON APIs. 8 | */ 9 | package tsdblayer 10 | 11 | import ( 12 | "context" 13 | "io" 14 | "net/http" 15 | 16 | "github.com/golang/protobuf/proto" 17 | "github.com/grpc-ecosystem/grpc-gateway/runtime" 18 | "github.com/grpc-ecosystem/grpc-gateway/utilities" 19 | "google.golang.org/grpc" 20 | "google.golang.org/grpc/codes" 21 | "google.golang.org/grpc/grpclog" 22 | "google.golang.org/grpc/status" 23 | ) 24 | 25 | var _ codes.Code 26 | var _ io.Reader 27 | var _ status.Status 28 | var _ = runtime.String 29 | var _ = utilities.NewDoubleArray 30 | 31 | func request_TSDBLayer_WriteBatch_0(ctx context.Context, marshaler runtime.Marshaler, client TSDBLayerClient, req *http.Request, pathParams map[string]string) (proto.Message, runtime.ServerMetadata, error) { 32 | var protoReq WriteBatchRequest 33 | var metadata runtime.ServerMetadata 34 | 35 | newReader, berr := utilities.IOReaderFactory(req.Body) 36 | if berr != nil { 37 | return nil, metadata, status.Errorf(codes.InvalidArgument, "%v", berr) 38 | } 39 | if err := marshaler.NewDecoder(newReader()).Decode(&protoReq); err != nil && err != io.EOF { 40 | return nil, metadata, status.Errorf(codes.InvalidArgument, "%v", err) 41 | } 42 | 43 | msg, err := client.WriteBatch(ctx, &protoReq, grpc.Header(&metadata.HeaderMD), grpc.Trailer(&metadata.TrailerMD)) 44 | return msg, metadata, err 45 | 46 | } 47 | 48 | func request_TSDBLayer_ReadBatch_0(ctx context.Context, marshaler runtime.Marshaler, client TSDBLayerClient, req *http.Request, pathParams map[string]string) (proto.Message, runtime.ServerMetadata, error) { 49 | var protoReq ReadBatchRequest 50 | var metadata runtime.ServerMetadata 51 | 52 | newReader, berr := utilities.IOReaderFactory(req.Body) 53 | if berr != nil { 54 | return nil, metadata, status.Errorf(codes.InvalidArgument, "%v", berr) 55 | } 56 | if err := marshaler.NewDecoder(newReader()).Decode(&protoReq); err != nil && err != io.EOF { 57 | return nil, metadata, status.Errorf(codes.InvalidArgument, "%v", err) 58 | } 59 | 60 | msg, err := client.ReadBatch(ctx, &protoReq, grpc.Header(&metadata.HeaderMD), grpc.Trailer(&metadata.TrailerMD)) 61 | return msg, metadata, err 62 | 63 | } 64 | 65 | // RegisterTSDBLayerHandlerFromEndpoint is same as RegisterTSDBLayerHandler but 66 | // automatically dials to "endpoint" and closes the connection when "ctx" gets done. 67 | func RegisterTSDBLayerHandlerFromEndpoint(ctx context.Context, mux *runtime.ServeMux, endpoint string, opts []grpc.DialOption) (err error) { 68 | conn, err := grpc.Dial(endpoint, opts...) 69 | if err != nil { 70 | return err 71 | } 72 | defer func() { 73 | if err != nil { 74 | if cerr := conn.Close(); cerr != nil { 75 | grpclog.Infof("Failed to close conn to %s: %v", endpoint, cerr) 76 | } 77 | return 78 | } 79 | go func() { 80 | <-ctx.Done() 81 | if cerr := conn.Close(); cerr != nil { 82 | grpclog.Infof("Failed to close conn to %s: %v", endpoint, cerr) 83 | } 84 | }() 85 | }() 86 | 87 | return RegisterTSDBLayerHandler(ctx, mux, conn) 88 | } 89 | 90 | // RegisterTSDBLayerHandler registers the http handlers for service TSDBLayer to "mux". 91 | // The handlers forward requests to the grpc endpoint over "conn". 
92 | func RegisterTSDBLayerHandler(ctx context.Context, mux *runtime.ServeMux, conn *grpc.ClientConn) error { 93 | return RegisterTSDBLayerHandlerClient(ctx, mux, NewTSDBLayerClient(conn)) 94 | } 95 | 96 | // RegisterTSDBLayerHandlerClient registers the http handlers for service TSDBLayer 97 | // to "mux". The handlers forward requests to the grpc endpoint over the given implementation of "TSDBLayerClient". 98 | // Note: the gRPC framework executes interceptors within the gRPC handler. If the passed in "TSDBLayerClient" 99 | // doesn't go through the normal gRPC flow (creating a gRPC client etc.) then it will be up to the passed in 100 | // "TSDBLayerClient" to call the correct interceptors. 101 | func RegisterTSDBLayerHandlerClient(ctx context.Context, mux *runtime.ServeMux, client TSDBLayerClient) error { 102 | 103 | mux.Handle("POST", pattern_TSDBLayer_WriteBatch_0, func(w http.ResponseWriter, req *http.Request, pathParams map[string]string) { 104 | ctx, cancel := context.WithCancel(req.Context()) 105 | defer cancel() 106 | inboundMarshaler, outboundMarshaler := runtime.MarshalerForRequest(mux, req) 107 | rctx, err := runtime.AnnotateContext(ctx, mux, req) 108 | if err != nil { 109 | runtime.HTTPError(ctx, mux, outboundMarshaler, w, req, err) 110 | return 111 | } 112 | resp, md, err := request_TSDBLayer_WriteBatch_0(rctx, inboundMarshaler, client, req, pathParams) 113 | ctx = runtime.NewServerMetadataContext(ctx, md) 114 | if err != nil { 115 | runtime.HTTPError(ctx, mux, outboundMarshaler, w, req, err) 116 | return 117 | } 118 | 119 | forward_TSDBLayer_WriteBatch_0(ctx, mux, outboundMarshaler, w, req, resp, mux.GetForwardResponseOptions()...) 120 | 121 | }) 122 | 123 | mux.Handle("POST", pattern_TSDBLayer_ReadBatch_0, func(w http.ResponseWriter, req *http.Request, pathParams map[string]string) { 124 | ctx, cancel := context.WithCancel(req.Context()) 125 | defer cancel() 126 | inboundMarshaler, outboundMarshaler := runtime.MarshalerForRequest(mux, req) 127 | rctx, err := runtime.AnnotateContext(ctx, mux, req) 128 | if err != nil { 129 | runtime.HTTPError(ctx, mux, outboundMarshaler, w, req, err) 130 | return 131 | } 132 | resp, md, err := request_TSDBLayer_ReadBatch_0(rctx, inboundMarshaler, client, req, pathParams) 133 | ctx = runtime.NewServerMetadataContext(ctx, md) 134 | if err != nil { 135 | runtime.HTTPError(ctx, mux, outboundMarshaler, w, req, err) 136 | return 137 | } 138 | 139 | forward_TSDBLayer_ReadBatch_0(ctx, mux, outboundMarshaler, w, req, resp, mux.GetForwardResponseOptions()...) 140 | 141 | }) 142 | 143 | return nil 144 | } 145 | 146 | var ( 147 | pattern_TSDBLayer_WriteBatch_0 = runtime.MustPattern(runtime.NewPattern(1, []int{2, 0, 2, 1, 2, 2}, []string{"api", "v1", "writeBatch"}, "")) 148 | 149 | pattern_TSDBLayer_ReadBatch_0 = runtime.MustPattern(runtime.NewPattern(1, []int{2, 0, 2, 1, 2, 2}, []string{"api", "v1", "readBatch"}, "")) 150 | ) 151 | 152 | var ( 153 | forward_TSDBLayer_WriteBatch_0 = runtime.ForwardResponseMessage 154 | 155 | forward_TSDBLayer_ReadBatch_0 = runtime.ForwardResponseMessage 156 | ) 157 | -------------------------------------------------------------------------------- /protos/google/api/annotations.proto: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2015, Google Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | syntax = "proto3"; 16 | 17 | package google.api; 18 | 19 | import "google/api/http.proto"; 20 | import "google/protobuf/descriptor.proto"; 21 | 22 | option go_package = "google.golang.org/genproto/googleapis/api/annotations;annotations"; 23 | option java_multiple_files = true; 24 | option java_outer_classname = "AnnotationsProto"; 25 | option java_package = "com.google.api"; 26 | option objc_class_prefix = "GAPI"; 27 | 28 | extend google.protobuf.MethodOptions { 29 | // See `HttpRule`. 30 | HttpRule http = 72295728; 31 | } -------------------------------------------------------------------------------- /protos/google/api/http.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // 15 | 16 | syntax = "proto3"; 17 | 18 | package google.api; 19 | 20 | option cc_enable_arenas = true; 21 | option go_package = "google.golang.org/genproto/googleapis/api/annotations;annotations"; 22 | option java_multiple_files = true; 23 | option java_outer_classname = "HttpProto"; 24 | option java_package = "com.google.api"; 25 | option objc_class_prefix = "GAPI"; 26 | 27 | // Defines the HTTP configuration for an API service. It contains a list of 28 | // [HttpRule][google.api.HttpRule], each specifying the mapping of an RPC method 29 | // to one or more HTTP REST API methods. 30 | message Http { 31 | // A list of HTTP configuration rules that apply to individual API methods. 32 | // 33 | // **NOTE:** All service configuration rules follow "last one wins" order. 34 | repeated HttpRule rules = 1; 35 | 36 | // When set to true, URL path parameters will be fully URI-decoded except in 37 | // cases of single segment matches in reserved expansion, where "%2F" will be 38 | // left encoded. 39 | // 40 | // The default behavior is to not decode RFC 6570 reserved characters in multi 41 | // segment matches. 42 | bool fully_decode_reserved_expansion = 2; 43 | } 44 | 45 | // # gRPC Transcoding 46 | // 47 | // gRPC Transcoding is a feature for mapping between a gRPC method and one or 48 | // more HTTP REST endpoints. It allows developers to build a single API service 49 | // that supports both gRPC APIs and REST APIs. 
Many systems, including [Google 50 | // APIs](https://github.com/googleapis/googleapis), 51 | // [Cloud Endpoints](https://cloud.google.com/endpoints), [gRPC 52 | // Gateway](https://github.com/grpc-ecosystem/grpc-gateway), 53 | // and [Envoy](https://github.com/envoyproxy/envoy) proxy support this feature 54 | // and use it for large scale production services. 55 | // 56 | // `HttpRule` defines the schema of the gRPC/REST mapping. The mapping specifies 57 | // how different portions of the gRPC request message are mapped to the URL 58 | // path, URL query parameters, and HTTP request body. It also controls how the 59 | // gRPC response message is mapped to the HTTP response body. `HttpRule` is 60 | // typically specified as an `google.api.http` annotation on the gRPC method. 61 | // 62 | // Each mapping specifies a URL path template and an HTTP method. The path 63 | // template may refer to one or more fields in the gRPC request message, as long 64 | // as each field is a non-repeated field with a primitive (non-message) type. 65 | // The path template controls how fields of the request message are mapped to 66 | // the URL path. 67 | // 68 | // Example: 69 | // 70 | // service Messaging { 71 | // rpc GetMessage(GetMessageRequest) returns (Message) { 72 | // option (google.api.http) = { 73 | // get: "/v1/{name=messages/*}" 74 | // }; 75 | // } 76 | // } 77 | // message GetMessageRequest { 78 | // string name = 1; // Mapped to URL path. 79 | // } 80 | // message Message { 81 | // string text = 1; // The resource content. 82 | // } 83 | // 84 | // This enables an HTTP REST to gRPC mapping as below: 85 | // 86 | // HTTP | gRPC 87 | // -----|----- 88 | // `GET /v1/messages/123456` | `GetMessage(name: "messages/123456")` 89 | // 90 | // Any fields in the request message which are not bound by the path template 91 | // automatically become HTTP query parameters if there is no HTTP request body. 92 | // For example: 93 | // 94 | // service Messaging { 95 | // rpc GetMessage(GetMessageRequest) returns (Message) { 96 | // option (google.api.http) = { 97 | // get:"/v1/messages/{message_id}" 98 | // }; 99 | // } 100 | // } 101 | // message GetMessageRequest { 102 | // message SubMessage { 103 | // string subfield = 1; 104 | // } 105 | // string message_id = 1; // Mapped to URL path. 106 | // int64 revision = 2; // Mapped to URL query parameter `revision`. 107 | // SubMessage sub = 3; // Mapped to URL query parameter `sub.subfield`. 108 | // } 109 | // 110 | // This enables a HTTP JSON to RPC mapping as below: 111 | // 112 | // HTTP | gRPC 113 | // -----|----- 114 | // `GET /v1/messages/123456?revision=2&sub.subfield=foo` | 115 | // `GetMessage(message_id: "123456" revision: 2 sub: SubMessage(subfield: 116 | // "foo"))` 117 | // 118 | // Note that fields which are mapped to URL query parameters must have a 119 | // primitive type or a repeated primitive type or a non-repeated message type. 120 | // In the case of a repeated type, the parameter can be repeated in the URL 121 | // as `...?param=A¶m=B`. In the case of a message type, each field of the 122 | // message is mapped to a separate parameter, such as 123 | // `...?foo.a=A&foo.b=B&foo.c=C`. 124 | // 125 | // For HTTP methods that allow a request body, the `body` field 126 | // specifies the mapping. 
Consider a REST update method on the 127 | // message resource collection: 128 | // 129 | // service Messaging { 130 | // rpc UpdateMessage(UpdateMessageRequest) returns (Message) { 131 | // option (google.api.http) = { 132 | // patch: "/v1/messages/{message_id}" 133 | // body: "message" 134 | // }; 135 | // } 136 | // } 137 | // message UpdateMessageRequest { 138 | // string message_id = 1; // mapped to the URL 139 | // Message message = 2; // mapped to the body 140 | // } 141 | // 142 | // The following HTTP JSON to RPC mapping is enabled, where the 143 | // representation of the JSON in the request body is determined by 144 | // protos JSON encoding: 145 | // 146 | // HTTP | gRPC 147 | // -----|----- 148 | // `PATCH /v1/messages/123456 { "text": "Hi!" }` | `UpdateMessage(message_id: 149 | // "123456" message { text: "Hi!" })` 150 | // 151 | // The special name `*` can be used in the body mapping to define that 152 | // every field not bound by the path template should be mapped to the 153 | // request body. This enables the following alternative definition of 154 | // the update method: 155 | // 156 | // service Messaging { 157 | // rpc UpdateMessage(Message) returns (Message) { 158 | // option (google.api.http) = { 159 | // patch: "/v1/messages/{message_id}" 160 | // body: "*" 161 | // }; 162 | // } 163 | // } 164 | // message Message { 165 | // string message_id = 1; 166 | // string text = 2; 167 | // } 168 | // 169 | // 170 | // The following HTTP JSON to RPC mapping is enabled: 171 | // 172 | // HTTP | gRPC 173 | // -----|----- 174 | // `PATCH /v1/messages/123456 { "text": "Hi!" }` | `UpdateMessage(message_id: 175 | // "123456" text: "Hi!")` 176 | // 177 | // Note that when using `*` in the body mapping, it is not possible to 178 | // have HTTP parameters, as all fields not bound by the path end in 179 | // the body. This makes this option more rarely used in practice when 180 | // defining REST APIs. The common usage of `*` is in custom methods 181 | // which don't use the URL at all for transferring data. 182 | // 183 | // It is possible to define multiple HTTP methods for one RPC by using 184 | // the `additional_bindings` option. Example: 185 | // 186 | // service Messaging { 187 | // rpc GetMessage(GetMessageRequest) returns (Message) { 188 | // option (google.api.http) = { 189 | // get: "/v1/messages/{message_id}" 190 | // additional_bindings { 191 | // get: "/v1/users/{user_id}/messages/{message_id}" 192 | // } 193 | // }; 194 | // } 195 | // } 196 | // message GetMessageRequest { 197 | // string message_id = 1; 198 | // string user_id = 2; 199 | // } 200 | // 201 | // This enables the following two alternative HTTP JSON to RPC mappings: 202 | // 203 | // HTTP | gRPC 204 | // -----|----- 205 | // `GET /v1/messages/123456` | `GetMessage(message_id: "123456")` 206 | // `GET /v1/users/me/messages/123456` | `GetMessage(user_id: "me" message_id: 207 | // "123456")` 208 | // 209 | // ## Rules for HTTP mapping 210 | // 211 | // 1. Leaf request fields (recursive expansion nested messages in the request 212 | // message) are classified into three categories: 213 | // - Fields referred by the path template. They are passed via the URL path. 214 | // - Fields referred by the [HttpRule.body][google.api.HttpRule.body]. They are passed via the HTTP 215 | // request body. 216 | // - All other fields are passed via the URL query parameters, and the 217 | // parameter name is the field path in the request message. 
A repeated 218 | // field can be represented as multiple query parameters under the same 219 | // name. 220 | // 2. If [HttpRule.body][google.api.HttpRule.body] is "*", there is no URL query parameter, all fields 221 | // are passed via URL path and HTTP request body. 222 | // 3. If [HttpRule.body][google.api.HttpRule.body] is omitted, there is no HTTP request body, all 223 | // fields are passed via URL path and URL query parameters. 224 | // 225 | // ### Path template syntax 226 | // 227 | // Template = "/" Segments [ Verb ] ; 228 | // Segments = Segment { "/" Segment } ; 229 | // Segment = "*" | "**" | LITERAL | Variable ; 230 | // Variable = "{" FieldPath [ "=" Segments ] "}" ; 231 | // FieldPath = IDENT { "." IDENT } ; 232 | // Verb = ":" LITERAL ; 233 | // 234 | // The syntax `*` matches a single URL path segment. The syntax `**` matches 235 | // zero or more URL path segments, which must be the last part of the URL path 236 | // except the `Verb`. 237 | // 238 | // The syntax `Variable` matches part of the URL path as specified by its 239 | // template. A variable template must not contain other variables. If a variable 240 | // matches a single path segment, its template may be omitted, e.g. `{var}` 241 | // is equivalent to `{var=*}`. 242 | // 243 | // The syntax `LITERAL` matches literal text in the URL path. If the `LITERAL` 244 | // contains any reserved character, such characters should be percent-encoded 245 | // before the matching. 246 | // 247 | // If a variable contains exactly one path segment, such as `"{var}"` or 248 | // `"{var=*}"`, when such a variable is expanded into a URL path on the client 249 | // side, all characters except `[-_.~0-9a-zA-Z]` are percent-encoded. The 250 | // server side does the reverse decoding. Such variables show up in the 251 | // [Discovery 252 | // Document](https://developers.google.com/discovery/v1/reference/apis) as 253 | // `{var}`. 254 | // 255 | // If a variable contains multiple path segments, such as `"{var=foo/*}"` 256 | // or `"{var=**}"`, when such a variable is expanded into a URL path on the 257 | // client side, all characters except `[-_.~/0-9a-zA-Z]` are percent-encoded. 258 | // The server side does the reverse decoding, except "%2F" and "%2f" are left 259 | // unchanged. Such variables show up in the 260 | // [Discovery 261 | // Document](https://developers.google.com/discovery/v1/reference/apis) as 262 | // `{+var}`. 263 | // 264 | // ## Using gRPC API Service Configuration 265 | // 266 | // gRPC API Service Configuration (service config) is a configuration language 267 | // for configuring a gRPC service to become a user-facing product. The 268 | // service config is simply the YAML representation of the `google.api.Service` 269 | // proto message. 270 | // 271 | // As an alternative to annotating your proto file, you can configure gRPC 272 | // transcoding in your service config YAML files. You do this by specifying a 273 | // `HttpRule` that maps the gRPC method to a REST endpoint, achieving the same 274 | // effect as the proto annotation. This can be particularly useful if you 275 | // have a proto that is reused in multiple services. Note that any transcoding 276 | // specified in the service config will override any matching transcoding 277 | // configuration in the proto. 278 | // 279 | // Example: 280 | // 281 | // http: 282 | // rules: 283 | // # Selects a gRPC method and applies HttpRule to it. 
284 | // - selector: example.v1.Messaging.GetMessage 285 | // get: /v1/messages/{message_id}/{sub.subfield} 286 | // 287 | // ## Special notes 288 | // 289 | // When gRPC Transcoding is used to map a gRPC to JSON REST endpoints, the 290 | // proto to JSON conversion must follow the [proto3 291 | // specification](https://developers.google.com/protocol-buffers/docs/proto3#json). 292 | // 293 | // While the single segment variable follows the semantics of 294 | // [RFC 6570](https://tools.ietf.org/html/rfc6570) Section 3.2.2 Simple String 295 | // Expansion, the multi segment variable **does not** follow RFC 6570 Section 296 | // 3.2.3 Reserved Expansion. The reason is that the Reserved Expansion 297 | // does not expand special characters like `?` and `#`, which would lead 298 | // to invalid URLs. As the result, gRPC Transcoding uses a custom encoding 299 | // for multi segment variables. 300 | // 301 | // The path variables **must not** refer to any repeated or mapped field, 302 | // because client libraries are not capable of handling such variable expansion. 303 | // 304 | // The path variables **must not** capture the leading "/" character. The reason 305 | // is that the most common use case "{var}" does not capture the leading "/" 306 | // character. For consistency, all path variables must share the same behavior. 307 | // 308 | // Repeated message fields must not be mapped to URL query parameters, because 309 | // no client library can support such complicated mapping. 310 | // 311 | // If an API needs to use a JSON array for request or response body, it can map 312 | // the request or response body to a repeated field. However, some gRPC 313 | // Transcoding implementations may not support this feature. 314 | message HttpRule { 315 | // Selects a method to which this rule applies. 316 | // 317 | // Refer to [selector][google.api.DocumentationRule.selector] for syntax details. 318 | string selector = 1; 319 | 320 | // Determines the URL pattern is matched by this rules. This pattern can be 321 | // used with any of the {get|put|post|delete|patch} methods. A custom method 322 | // can be defined using the 'custom' field. 323 | oneof pattern { 324 | // Maps to HTTP GET. Used for listing and getting information about 325 | // resources. 326 | string get = 2; 327 | 328 | // Maps to HTTP PUT. Used for replacing a resource. 329 | string put = 3; 330 | 331 | // Maps to HTTP POST. Used for creating a resource or performing an action. 332 | string post = 4; 333 | 334 | // Maps to HTTP DELETE. Used for deleting a resource. 335 | string delete = 5; 336 | 337 | // Maps to HTTP PATCH. Used for updating a resource. 338 | string patch = 6; 339 | 340 | // The custom pattern is used for specifying an HTTP method that is not 341 | // included in the `pattern` field, such as HEAD, or "*" to leave the 342 | // HTTP method unspecified for this rule. The wild-card rule is useful 343 | // for services that provide content to Web (HTML) clients. 344 | CustomHttpPattern custom = 8; 345 | } 346 | 347 | // The name of the request field whose value is mapped to the HTTP request 348 | // body, or `*` for mapping all request fields not captured by the path 349 | // pattern to the HTTP body, or omitted for not having any HTTP request body. 350 | // 351 | // NOTE: the referred field must be present at the top-level of the request 352 | // message type. 353 | string body = 7; 354 | 355 | // Optional. The name of the response field whose value is mapped to the HTTP 356 | // response body. 
When omitted, the entire response message will be used 357 | // as the HTTP response body. 358 | // 359 | // NOTE: The referred field must be present at the top-level of the response 360 | // message type. 361 | string response_body = 12; 362 | 363 | // Additional HTTP bindings for the selector. Nested bindings must 364 | // not contain an `additional_bindings` field themselves (that is, 365 | // the nesting may only be one level deep). 366 | repeated HttpRule additional_bindings = 11; 367 | } 368 | 369 | // A custom pattern is used for defining custom HTTP verb. 370 | message CustomHttpPattern { 371 | // The name of this custom HTTP verb. 372 | string kind = 1; 373 | 374 | // The path matched by this custom verb. 375 | string path = 2; 376 | } -------------------------------------------------------------------------------- /protos/rpc.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | option java_multiple_files = true; 4 | option java_package = "tsdb-layer.rpc"; 5 | option java_outer_classname = "TSDBLayerProto"; 6 | 7 | package tsdblayer; 8 | 9 | import "google/api/annotations.proto"; 10 | 11 | // Interface exported by the server. 12 | service TSDBLayer { 13 | rpc WriteBatch(WriteBatchRequest) returns (Empty) { 14 | option (google.api.http) = { 15 | post: "/api/v1/writeBatch" 16 | body: "*" 17 | }; 18 | } 19 | rpc ReadBatch(ReadBatchRequest) returns (ReadBatchResponse) { 20 | option (google.api.http) = { 21 | post: "/api/v1/readBatch" 22 | body: "*" 23 | }; 24 | } 25 | } 26 | 27 | 28 | message WriteBatchRequest { 29 | repeated WriteRequest batch = 1; 30 | } 31 | 32 | message ReadBatchRequest { 33 | repeated ReadRequest batch = 1; 34 | } 35 | 36 | message ReadBatchResponse { 37 | repeated ReadResponse batch = 1; 38 | } 39 | 40 | message WriteRequest { 41 | string series_id = 1; 42 | Datapoint datapoint = 2; 43 | } 44 | 45 | message ReadRequest { 46 | // TODO(rartoul): Time ranges. 
47 | string series_id = 1; 48 | } 49 | 50 | message ReadResponse { 51 | string series_id = 1; 52 | repeated Datapoint datapoints = 2; 53 | } 54 | 55 | message Datapoint { 56 | uint64 timestamp_nanos = 1; 57 | double value = 2; 58 | } 59 | 60 | message Empty {} 61 | 62 | -------------------------------------------------------------------------------- /resources/fdb_index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/richardartoul/tsdb-layer/ae5d4df717e8d728bd764bbad452e37488b76576/resources/fdb_index.png -------------------------------------------------------------------------------- /resources/fdb_storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/richardartoul/tsdb-layer/ae5d4df717e8d728bd764bbad452e37488b76576/resources/fdb_storage.png -------------------------------------------------------------------------------- /resources/fdb_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/richardartoul/tsdb-layer/ae5d4df717e8d728bd764bbad452e37488b76576/resources/fdb_time.png -------------------------------------------------------------------------------- /resources/m3db_storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/richardartoul/tsdb-layer/ae5d4df717e8d728bd764bbad452e37488b76576/resources/m3db_storage.png -------------------------------------------------------------------------------- /resources/m3db_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/richardartoul/tsdb-layer/ae5d4df717e8d728bd764bbad452e37488b76576/resources/m3db_time.png -------------------------------------------------------------------------------- /src/cmd/bench/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "io/ioutil" 7 | "log" 8 | "math/rand" 9 | "runtime/pprof" 10 | "sync" 11 | "sync/atomic" 12 | "time" 13 | 14 | "github.com/richardartoul/tsdb-layer/src/layer" 15 | "github.com/richardartoul/tsdb-layer/src/layer/dircompress" 16 | "github.com/richardartoul/tsdb-layer/src/layer/raw" 17 | "github.com/richardartoul/tsdb-layer/src/layer/rawblock" 18 | ) 19 | 20 | var ( 21 | numSeriesFlag = flag.Int("numSeries", 100000, "number of unique series") 22 | batchSizeFlag = flag.Int("batchSize", 128, "client batch size") 23 | numWorkersFlag = flag.Int("numWorkers", 100, "number of concurrent workers") 24 | durationFlag = flag.Duration("duration", time.Minute, "duration to run the load test") 25 | layerEngineFlag = flag.String("layerEngine", "raw-block", "layer engine to benchmark") 26 | ) 27 | 28 | func main() { 29 | flag.Parse() 30 | 31 | tempFile, err := ioutil.TempFile("", "bench_cpu ") 32 | if err != nil { 33 | panic(err) 34 | } 35 | 36 | pprof.StartCPUProfile(tempFile) 37 | defer func() { 38 | defer pprof.StopCPUProfile() 39 | fmt.Println("cpu profile at:", tempFile.Name()) 40 | }() 41 | 42 | var ( 43 | numSeries = *numSeriesFlag 44 | batchSize = *batchSizeFlag 45 | numWorkers = *numWorkersFlag 46 | duration = *durationFlag 47 | layerEngine = *layerEngineFlag 48 | ) 49 | fmt.Println("Running test with arguments:") 50 | fmt.Println(" layerEngine:", layerEngine) 51 | fmt.Println(" numSeries:", numSeries) 52 | fmt.Println(" batchSize:", batchSize) 53 | 
fmt.Println(" numWorkers:", numWorkers) 54 | fmt.Println(" duration:", duration) 55 | var layerClient layer.Layer 56 | switch layerEngine { 57 | case "direct-compress": 58 | layerClient = dircompress.NewLayer() 59 | case "raw": 60 | layerClient = raw.NewLayer() 61 | case "raw-block": 62 | layerClient = rawblock.NewLayer() 63 | default: 64 | log.Fatalf("invalid layer engine: %s", layerEngine) 65 | } 66 | 67 | seriesIDs := make([]string, 0, numSeries) 68 | for i := 0; i < numSeries; i++ { 69 | seriesIDs = append(seriesIDs, fmt.Sprintf("%s-%d", randomString(20), i)) 70 | } 71 | 72 | var ( 73 | wg sync.WaitGroup 74 | numWritesCompleted int64 75 | doneCh = make(chan struct{}) 76 | ) 77 | go func() { 78 | time.Sleep(duration) 79 | close(doneCh) 80 | }() 81 | for i := 0; i < numWorkers; i++ { 82 | wg.Add(1) 83 | // Chunk up the IDs into groups for each worker. 84 | idsBatchSize := len(seriesIDs) / numWorkers 85 | localIDs := seriesIDs[idsBatchSize*i : idsBatchSize*i+idsBatchSize] 86 | 87 | go func(localIDs []string) { 88 | defer wg.Done() 89 | 90 | var ( 91 | batch = make([]layer.Write, 0, batchSize) 92 | source = rand.NewSource(time.Now().UnixNano()) 93 | rng = rand.New(source) 94 | currVal int64 95 | ) 96 | for { 97 | select { 98 | case <-doneCh: 99 | atomic.AddInt64(&numWritesCompleted, currVal) 100 | return 101 | default: 102 | } 103 | batch = batch[:0] 104 | for j := 0; j < batchSize; j++ { 105 | idx := rng.Intn(len(localIDs)) 106 | batch = append( 107 | batch, 108 | layer.Write{ 109 | ID: localIDs[idx], 110 | Timestamp: time.Unix(0, int64(currVal)), 111 | Value: float64(currVal)}) 112 | currVal++ 113 | } 114 | if err := layerClient.WriteBatch(batch); err != nil { 115 | panic(err) 116 | } 117 | } 118 | }(localIDs) 119 | 120 | } 121 | wg.Wait() 122 | 123 | qps := float64(numWritesCompleted) / duration.Seconds() 124 | fmt.Println("QPS: ", qps) 125 | } 126 | 127 | func randomString(len int) string { 128 | bytes := make([]byte, len) 129 | for i := 0; i < len; i++ { 130 | bytes[i] = byte(65 + rand.Intn(25)) //A=65 and Z = 65+25 131 | } 132 | return string(bytes) 133 | } 134 | -------------------------------------------------------------------------------- /src/cmd/server/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "flag" 6 | "fmt" 7 | "log" 8 | "net" 9 | "net/http" 10 | 11 | pb "github.com/richardartoul/tsdb-layer/protos/.gen" 12 | "github.com/richardartoul/tsdb-layer/src/layer/server" 13 | 14 | "github.com/grpc-ecosystem/grpc-gateway/runtime" 15 | "google.golang.org/grpc" 16 | "google.golang.org/grpc/credentials" 17 | ) 18 | 19 | var ( 20 | useTLS = flag.Bool("use_tls", false, "Connection uses TLS if true, else plain TCP") 21 | certFile = flag.String("cert_file", "", "The TLS cert file") 22 | keyFile = flag.String("key_file", "", "The TLS key file") 23 | port = flag.Int("port", 10000, "The server port") 24 | ) 25 | 26 | func main() { 27 | flag.Parse() 28 | var ( 29 | opts []grpc.ServerOption 30 | dopts []grpc.DialOption 31 | ) 32 | if *useTLS { 33 | if *certFile == "" { 34 | log.Fatalf("cert_file path is required") 35 | } 36 | if *keyFile == "" { 37 | log.Fatalf("key_file path is required") 38 | } 39 | creds, err := credentials.NewServerTLSFromFile(*certFile, *keyFile) 40 | if err != nil { 41 | log.Fatalf("Failed to generate credentials %v", err) 42 | } 43 | opts = []grpc.ServerOption{grpc.Creds(creds)} 44 | dopts = []grpc.DialOption{grpc.WithTransportCredentials(creds)} 45 | } else { 46 | dopts = 
[]grpc.DialOption{grpc.WithInsecure()} 47 | } 48 | 49 | conn, err := net.Listen("tcp", fmt.Sprintf(":%d", *port)) 50 | if err != nil { 51 | log.Fatalf("Failed to initial TCP listen : %v\n", err) 52 | } 53 | 54 | go func() { 55 | // Start gRPC. 56 | grpcServer := grpc.NewServer(opts...) 57 | pb.RegisterTSDBLayerServer(grpcServer, server.NewServer()) 58 | log.Printf("gRPC Listening on %s\n", conn.Addr().String()) 59 | if err := grpcServer.Serve(conn); err != nil { 60 | log.Fatalf("error initializing gRPC: %v", err) 61 | } 62 | }() 63 | 64 | connString := fmt.Sprintf("localhost:%d", *port) 65 | mux := runtime.NewServeMux() 66 | err = pb.RegisterTSDBLayerHandlerFromEndpoint(context.Background(), mux, connString, dopts) 67 | if err != nil { 68 | log.Fatalf("Failed to register http handler from endpoint: %v\n", err) 69 | } 70 | 71 | port := *port + 1 72 | log.Printf("HTTP Listening on %d\n", port) 73 | log.Fatal(http.ListenAndServe(fmt.Sprintf(":%d", port), mux)) 74 | } 75 | -------------------------------------------------------------------------------- /src/encoding/common.go: -------------------------------------------------------------------------------- 1 | package encoding 2 | 3 | const ( 4 | hasMoreBit = 1 5 | ) 6 | -------------------------------------------------------------------------------- /src/encoding/decoder.go: -------------------------------------------------------------------------------- 1 | package encoding 2 | 3 | import ( 4 | "bytes" 5 | "errors" 6 | "io" 7 | "math" 8 | "time" 9 | 10 | "github.com/m3db/m3/src/dbnode/encoding" 11 | "github.com/m3db/m3/src/dbnode/encoding/m3tsz" 12 | xtime "github.com/m3db/m3/src/x/time" 13 | ) 14 | 15 | type Decoder interface { 16 | ReadableDecoder 17 | Reset(b []byte) 18 | } 19 | 20 | type ReadableDecoder interface { 21 | Next() bool 22 | Current() (time.Time, float64) 23 | Err() error 24 | } 25 | 26 | type decoder struct { 27 | tsDecoder m3tsz.TimestampIterator 28 | floatDecoder m3tsz.FloatEncoderAndIterator 29 | bReader *bytes.Reader 30 | stream encoding.IStream 31 | 32 | err error 33 | done bool 34 | } 35 | 36 | // NewDecoder creates a new decoder. 37 | func NewDecoder() Decoder { 38 | return &decoder{ 39 | bReader: bytes.NewReader(nil), 40 | stream: encoding.NewIStream(nil), 41 | } 42 | } 43 | 44 | func (d *decoder) Reset(b []byte) { 45 | d.tsDecoder = m3tsz.NewTimestampIterator(opts, true) 46 | d.tsDecoder.TimeUnit = xtime.Nanosecond 47 | d.floatDecoder = m3tsz.FloatEncoderAndIterator{} 48 | d.bReader.Reset(b) 49 | d.stream.Reset(d.bReader) 50 | d.done = false 51 | } 52 | 53 | func (d *decoder) Next() bool { 54 | if d.done || d.err != nil { 55 | return false 56 | } 57 | 58 | bit, err := d.stream.ReadBit() 59 | if err == io.EOF { 60 | d.done = true 61 | return false 62 | } 63 | if err != nil { 64 | d.err = err 65 | return false 66 | } 67 | if bit != hasMoreBit { 68 | d.done = true 69 | return false 70 | } 71 | 72 | _, done, err := d.tsDecoder.ReadTimestamp(d.stream) 73 | if done { 74 | // This should never happen since we never encode the EndOfStream marker. 
75 | d.err = errors.New("unexpected end of timestamp stream") 76 | return false 77 | } 78 | if err != nil { 79 | d.err = err 80 | return false 81 | } 82 | 83 | if err := d.floatDecoder.ReadFloat(d.stream); err != nil { 84 | d.err = err 85 | return false 86 | } 87 | 88 | return true 89 | } 90 | 91 | func (d *decoder) Current() (time.Time, float64) { 92 | return d.tsDecoder.PrevTime, math.Float64frombits(d.floatDecoder.PrevFloatBits) 93 | } 94 | 95 | func (d *decoder) Err() error { 96 | return d.err 97 | } 98 | -------------------------------------------------------------------------------- /src/encoding/encoder.go: -------------------------------------------------------------------------------- 1 | package encoding 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "math" 7 | "time" 8 | "unsafe" 9 | 10 | "github.com/m3db/m3/src/dbnode/encoding" 11 | "github.com/m3db/m3/src/dbnode/encoding/m3tsz" 12 | xtime "github.com/m3db/m3/src/x/time" 13 | ) 14 | 15 | var ( 16 | // TODO(rartoul): Eliminate the need for this. 17 | opts = encoding.NewOptions() 18 | ) 19 | 20 | type Encoder interface { 21 | Encode(timestamp time.Time, value float64) error 22 | LastEncoded() (time.Time, float64, bool) 23 | State() []byte 24 | Restore(b []byte) error 25 | Bytes() []byte 26 | } 27 | 28 | type marshalState struct { 29 | TSEncoder m3tsz.TimestampEncoder 30 | FloatEncoder m3tsz.FloatEncoderAndIterator 31 | LastByte byte 32 | BitPos int 33 | HasWrittenFirst bool 34 | } 35 | 36 | type encoder struct { 37 | tsEncoder m3tsz.TimestampEncoder 38 | floatEncoder m3tsz.FloatEncoderAndIterator 39 | stream OStream 40 | 41 | hasWrittenFirst bool 42 | } 43 | 44 | // NewEncoder creates a new encoder. 45 | func NewEncoder() Encoder { 46 | return &encoder{} 47 | } 48 | 49 | func (e *encoder) Encode(timestamp time.Time, value float64) error { 50 | if e.stream == nil { 51 | // Lazy init. 52 | e.stream = NewOStream() 53 | e.tsEncoder = m3tsz.NewTimestampEncoder(timestamp, xtime.Nanosecond, opts) 54 | } 55 | 56 | e.stream.WriteBit(hasMoreBit) 57 | 58 | var ( 59 | // Unsafe insanity to temporarily avoid having to fork upstream. 60 | encodingStream = *(*encoding.OStream)(unsafe.Pointer(&e.stream)) 61 | err error 62 | ) 63 | if !e.hasWrittenFirst { 64 | err = e.tsEncoder.WriteFirstTime(encodingStream, timestamp, nil, xtime.Nanosecond) 65 | } else { 66 | err = e.tsEncoder.WriteNextTime(encodingStream, timestamp, nil, xtime.Nanosecond) 67 | } 68 | if err != nil { 69 | return err 70 | } 71 | 72 | e.floatEncoder.WriteFloat(encodingStream, value) 73 | e.hasWrittenFirst = true 74 | return nil 75 | } 76 | 77 | func (e *encoder) LastEncoded() (time.Time, float64, bool) { 78 | return e.tsEncoder.PrevTime, math.Float64frombits(e.floatEncoder.PrevFloatBits), e.hasWrittenFirst 79 | } 80 | 81 | func (e *encoder) State() []byte { 82 | var ( 83 | raw, bitPos = e.stream.Rawbytes() 84 | lastByte byte 85 | ) 86 | if len(raw) > 0 { 87 | lastByte = raw[len(raw)-1] 88 | } 89 | 90 | marshalState := marshalState{ 91 | TSEncoder: e.tsEncoder, 92 | FloatEncoder: e.floatEncoder, 93 | HasWrittenFirst: e.hasWrittenFirst, 94 | LastByte: lastByte, 95 | BitPos: bitPos, 96 | } 97 | // Prevent JSON marshaling error. 98 | marshalState.TSEncoder.Options = nil 99 | 100 | // TODO(rartoul): Replace this with something efficient / performant. 101 | marshaled, err := json.Marshal(&marshalState) 102 | if err != nil { 103 | // TODO(rartoul): Remove this once there is a better encoding scheme. 
104 | panic(err) 105 | } 106 | 107 | return marshaled 108 | } 109 | 110 | func (e *encoder) Restore(b []byte) error { 111 | if b == nil { 112 | return fmt.Errorf("cannot restore from nil state") 113 | } 114 | 115 | marshalState := marshalState{} 116 | if err := json.Unmarshal(b, &marshalState); err != nil { 117 | return err 118 | } 119 | 120 | e.tsEncoder = marshalState.TSEncoder 121 | e.tsEncoder.Options = opts 122 | e.floatEncoder = marshalState.FloatEncoder 123 | e.hasWrittenFirst = marshalState.HasWrittenFirst 124 | 125 | if e.stream == nil { 126 | e.stream = NewOStream() 127 | } 128 | // TODO(rartoul): Fix this non-sense. 129 | e.stream.(*ostream).buf = []byte{marshalState.LastByte} 130 | e.stream.(*ostream).pos = marshalState.BitPos 131 | 132 | return nil 133 | } 134 | 135 | func (e *encoder) Bytes() []byte { 136 | if e.stream == nil { 137 | return nil 138 | } 139 | 140 | b, _ := e.stream.Rawbytes() 141 | return b 142 | } 143 | -------------------------------------------------------------------------------- /src/encoding/merge.go: -------------------------------------------------------------------------------- 1 | package encoding 2 | 3 | // MergeStreams merges a list of streams into a a single stream. 4 | func MergeStreams(streams ...[]byte) ([]byte, error) { 5 | decoders := make([]Decoder, 0, len(streams)) 6 | for _, stream := range streams { 7 | dec := NewDecoder() 8 | dec.Reset(stream) 9 | decoders = append(decoders, dec) 10 | } 11 | 12 | multiDec := NewMultiDecoder() 13 | multiDec.Reset(decoders) 14 | 15 | mergedEnc := NewEncoder() 16 | for multiDec.Next() { 17 | mergedEnc.Encode(multiDec.Current()) 18 | } 19 | if err := multiDec.Err(); err != nil { 20 | return nil, err 21 | } 22 | 23 | return mergedEnc.Bytes(), nil 24 | } 25 | -------------------------------------------------------------------------------- /src/encoding/merge_test.go: -------------------------------------------------------------------------------- 1 | package encoding 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | "testing" 7 | "time" 8 | 9 | "github.com/stretchr/testify/require" 10 | ) 11 | 12 | type mergeStreamsTestCase struct { 13 | title string 14 | streams [][]testValue 15 | } 16 | 17 | func TestMergeStreams(t *testing.T) { 18 | testCases := []mergeStreamsTestCase{ 19 | { 20 | title: "Merge two in order streams", 21 | streams: [][]testValue{ 22 | []testValue{{timestamp: time.Unix(0, 0), value: 0}}, 23 | []testValue{{timestamp: time.Unix(1, 0), value: 1}}, 24 | }, 25 | }, 26 | { 27 | title: "Merge two out of order streams", 28 | streams: [][]testValue{ 29 | []testValue{{timestamp: time.Unix(1, 0), value: 1}}, 30 | []testValue{{timestamp: time.Unix(0, 0), value: 0}}, 31 | }, 32 | }, 33 | { 34 | title: "Merge multiple streams", 35 | streams: [][]testValue{ 36 | []testValue{{timestamp: time.Unix(10, 0), value: 10}, {timestamp: time.Unix(11, 0), value: 11}}, 37 | []testValue{{timestamp: time.Unix(7, 0), value: 7}}, 38 | []testValue{{timestamp: time.Unix(8, 0), value: 8}, {timestamp: time.Unix(9, 0), value: 9}}, 39 | []testValue{{timestamp: time.Unix(1, 0), value: 1}, {timestamp: time.Unix(3, 0), value: 3}}, 40 | }, 41 | }, 42 | } 43 | 44 | for _, tc := range testCases { 45 | t.Run(tc.title, func(t *testing.T) { 46 | streams := make([][]byte, 0, len(tc.streams)) 47 | expected := []testValue{} 48 | for _, stream := range tc.streams { 49 | enc := NewEncoder() 50 | for _, v := range stream { 51 | enc.Encode(v.timestamp, v.value) 52 | expected = append(expected, v) 53 | } 54 | 55 | streams = append(streams, enc.Bytes()) 
56 | } 57 | sort.Slice(expected, func(i, j int) bool { 58 | return expected[i].timestamp.Before(expected[j].timestamp) 59 | }) 60 | 61 | merged, err := MergeStreams(streams...) 62 | require.NoError(t, err) 63 | decoder := NewDecoder() 64 | decoder.Reset(merged) 65 | 66 | i := 0 67 | for decoder.Next() { 68 | currT, currV := decoder.Current() 69 | require.True( 70 | t, 71 | expected[i].timestamp.Equal(currT), 72 | fmt.Sprintf("expected %s but got %s", expected[i].timestamp.String(), currT.String())) 73 | require.Equal(t, expected[i].value, currV) 74 | i++ 75 | } 76 | require.NoError(t, decoder.Err()) 77 | require.Equal(t, len(expected), i) 78 | }) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/encoding/multi_decoder.go: -------------------------------------------------------------------------------- 1 | package encoding 2 | 3 | import ( 4 | "container/heap" 5 | "time" 6 | ) 7 | 8 | type MultiDecoder interface { 9 | ReadableDecoder 10 | Reset(decs []Decoder) 11 | } 12 | 13 | type decState struct { 14 | dec Decoder 15 | } 16 | 17 | type multiDecoder struct { 18 | decs []decState 19 | currEntry heapEntry 20 | heap minHeap 21 | err error 22 | } 23 | 24 | func NewMultiDecoder() *multiDecoder { 25 | return &multiDecoder{} 26 | } 27 | 28 | func (m *multiDecoder) Next() bool { 29 | if m.err != nil { 30 | return false 31 | } 32 | if m.heap.Len() == 0 { 33 | return false 34 | } 35 | m.currEntry = heap.Pop(&m.heap).(heapEntry) 36 | dec := m.decs[m.currEntry.decIdx].dec 37 | if dec.Next() { 38 | t, v := dec.Current() 39 | heap.Push(&m.heap, heapEntry{t: t, v: v, decIdx: m.currEntry.decIdx}) 40 | } else { 41 | if dec.Err() != nil { 42 | m.err = dec.Err() 43 | } 44 | } 45 | return true 46 | } 47 | 48 | func (m *multiDecoder) Current() (time.Time, float64) { 49 | return m.currEntry.t, m.currEntry.v 50 | } 51 | 52 | func (m *multiDecoder) Err() error { 53 | return nil 54 | } 55 | 56 | func (m *multiDecoder) Reset(decs []Decoder) { 57 | m.err = nil 58 | for i := range m.decs { 59 | m.decs[i] = decState{} 60 | } 61 | m.decs = m.decs[:0] 62 | for _, dec := range decs { 63 | m.decs = append(m.decs, decState{dec: dec}) 64 | } 65 | 66 | m.heap.vals = m.heap.vals[:0] 67 | for i, dec := range m.decs { 68 | if dec.dec.Next() { 69 | t, v := dec.dec.Current() 70 | m.heap.vals = append(m.heap.vals, heapEntry{t: t, v: v, decIdx: i}) 71 | } else { 72 | if dec.dec.Err() != nil { 73 | m.err = dec.dec.Err() 74 | } 75 | } 76 | } 77 | heap.Init(&m.heap) 78 | } 79 | 80 | type minHeap struct { 81 | vals []heapEntry 82 | } 83 | 84 | type heapEntry struct { 85 | t time.Time 86 | v float64 87 | decIdx int 88 | } 89 | 90 | func (h *minHeap) Push(x interface{}) { 91 | h.vals = append(h.vals, x.(heapEntry)) 92 | } 93 | 94 | func (h *minHeap) Pop() interface{} { 95 | lastIdx := len(h.vals) - 1 96 | x := h.vals[lastIdx] 97 | h.vals = h.vals[:lastIdx] 98 | return x 99 | } 100 | 101 | func (h *minHeap) Len() int { 102 | if h == nil { 103 | return 0 104 | } 105 | return len(h.vals) 106 | } 107 | 108 | func (h *minHeap) Less(i, j int) bool { 109 | return h.vals[i].t.Before(h.vals[j].t) 110 | } 111 | 112 | func (h *minHeap) Swap(i, j int) { 113 | h.vals[i], h.vals[j] = h.vals[j], h.vals[i] 114 | } 115 | -------------------------------------------------------------------------------- /src/encoding/multi_decoder_test.go: -------------------------------------------------------------------------------- 1 | package encoding 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | "testing" 7 | "time" 8 
| 9 | "github.com/stretchr/testify/require" 10 | ) 11 | 12 | type multiDecoderTestCase struct { 13 | title string 14 | streams [][]testValue 15 | } 16 | 17 | func TestMultiDecoder(t *testing.T) { 18 | testCases := []multiDecoderTestCase{ 19 | { 20 | title: "Merge two in order streams", 21 | streams: [][]testValue{ 22 | []testValue{{timestamp: time.Unix(0, 0), value: 0}}, 23 | []testValue{{timestamp: time.Unix(1, 0), value: 1}}, 24 | }, 25 | }, 26 | { 27 | title: "Merge two out of order streams", 28 | streams: [][]testValue{ 29 | []testValue{{timestamp: time.Unix(1, 0), value: 1}}, 30 | []testValue{{timestamp: time.Unix(0, 0), value: 0}}, 31 | }, 32 | }, 33 | { 34 | title: "Merge multiple streams", 35 | streams: [][]testValue{ 36 | []testValue{{timestamp: time.Unix(10, 0), value: 10}, {timestamp: time.Unix(11, 0), value: 11}}, 37 | []testValue{{timestamp: time.Unix(7, 0), value: 7}}, 38 | []testValue{{timestamp: time.Unix(8, 0), value: 8}, {timestamp: time.Unix(9, 0), value: 9}}, 39 | []testValue{{timestamp: time.Unix(1, 0), value: 1}, {timestamp: time.Unix(3, 0), value: 3}}, 40 | }, 41 | }, 42 | } 43 | 44 | for _, tc := range testCases { 45 | t.Run(tc.title, func(t *testing.T) { 46 | decs := make([]Decoder, 0, len(tc.streams)) 47 | expected := []testValue{} 48 | for _, stream := range tc.streams { 49 | enc := NewEncoder() 50 | for _, v := range stream { 51 | enc.Encode(v.timestamp, v.value) 52 | expected = append(expected, v) 53 | } 54 | 55 | dec := NewDecoder() 56 | dec.Reset(enc.Bytes()) 57 | decs = append(decs, dec) 58 | } 59 | sort.Slice(expected, func(i, j int) bool { 60 | return expected[i].timestamp.Before(expected[j].timestamp) 61 | }) 62 | 63 | multiDecoder := NewMultiDecoder() 64 | multiDecoder.Reset(decs) 65 | 66 | i := 0 67 | for multiDecoder.Next() { 68 | currT, currV := multiDecoder.Current() 69 | require.True( 70 | t, 71 | expected[i].timestamp.Equal(currT), 72 | fmt.Sprintf("expected %s but got %s", expected[i].timestamp.String(), currT.String())) 73 | require.Equal(t, expected[i].value, currV) 74 | i++ 75 | } 76 | require.NoError(t, multiDecoder.Err()) 77 | require.Equal(t, len(expected), i) 78 | }) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/encoding/ostream.go: -------------------------------------------------------------------------------- 1 | package encoding 2 | 3 | // Frked from "github.com/m3db/m3/src/dbnode/encoding/ostream.go" to make some changes that 4 | // don't make sense to include upstream. 5 | 6 | type Bit byte 7 | 8 | // OStream encapsulates a writable stream. 9 | type OStream interface { 10 | Len() int 11 | Empty() bool 12 | WriteBit(v Bit) 13 | WriteBits(v uint64, numBits int) 14 | WriteByte(v byte) 15 | WriteBytes(bytes []byte) 16 | Write(bytes []byte) (int, error) 17 | Reset(buffer []byte) 18 | Discard() []byte 19 | Rawbytes() ([]byte, int) 20 | } 21 | 22 | const ( 23 | initAllocSize = 1024 24 | ) 25 | 26 | // Ostream encapsulates a writable stream. 
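// It buffers whole bytes in buf and tracks how many bits of the final byte are in use via
// pos; exposing that pair through Rawbytes() is what lets the encoder capture its state and
// later Restore() a stream that ends mid-byte.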
27 | type ostream struct { 28 | buf []byte 29 | pos int // how many bits have been used in the last byte 30 | } 31 | 32 | // NewOStream creates a new Ostream 33 | func NewOStream() OStream { 34 | return &ostream{} 35 | } 36 | 37 | // Len returns the length of the Ostream 38 | func (os *ostream) Len() int { 39 | return len(os.buf) 40 | } 41 | 42 | // Empty returns whether the Ostream is empty 43 | func (os *ostream) Empty() bool { 44 | return os.Len() == 0 && os.pos == 0 45 | } 46 | 47 | func (os *ostream) lastIndex() int { 48 | return os.Len() - 1 49 | } 50 | 51 | func (os *ostream) hasUnusedBits() bool { 52 | return os.pos > 0 && os.pos < 8 53 | } 54 | 55 | // grow appends the last byte of v to buf and sets pos to np. 56 | func (os *ostream) grow(v byte, np int) { 57 | os.ensureCapacityFor(1) 58 | os.buf = append(os.buf, v) 59 | 60 | os.pos = np 61 | } 62 | 63 | // ensureCapacity ensures that there is at least capacity for n more bytes. 64 | func (os *ostream) ensureCapacityFor(n int) { 65 | var ( 66 | currCap = cap(os.buf) 67 | currLen = len(os.buf) 68 | availableCap = currCap - currLen 69 | missingCap = n - availableCap 70 | ) 71 | if missingCap <= 0 { 72 | // Already have enough capacity. 73 | return 74 | } 75 | 76 | newCap := max(cap(os.buf)*2, currCap+missingCap) 77 | newbuf := make([]byte, 0, newCap) 78 | newbuf = append(newbuf, os.buf...) 79 | os.buf = newbuf 80 | } 81 | 82 | func (os *ostream) fillUnused(v byte) { 83 | os.buf[os.lastIndex()] |= v >> uint(os.pos) 84 | } 85 | 86 | // WriteBit writes the last bit of v. 87 | func (os *ostream) WriteBit(v Bit) { 88 | v <<= 7 89 | if !os.hasUnusedBits() { 90 | os.grow(byte(v), 1) 91 | return 92 | } 93 | os.fillUnused(byte(v)) 94 | os.pos++ 95 | } 96 | 97 | // WriteByte writes the last byte of v. 98 | func (os *ostream) WriteByte(v byte) { 99 | if !os.hasUnusedBits() { 100 | os.grow(v, 8) 101 | return 102 | } 103 | os.fillUnused(v) 104 | os.grow(v< 64 { 147 | numBits = 64 148 | } 149 | 150 | v <<= uint(64 - numBits) 151 | for numBits >= 8 { 152 | os.WriteByte(byte(v >> 56)) 153 | v <<= 8 154 | numBits -= 8 155 | } 156 | 157 | for numBits > 0 { 158 | os.WriteBit(Bit((v >> 63) & 1)) 159 | v <<= 1 160 | numBits-- 161 | } 162 | } 163 | 164 | // Discard takes the ref to the raw buffer from the ostream. 165 | func (os *ostream) Discard() []byte { 166 | buffer := os.buf 167 | 168 | os.buf = nil 169 | os.pos = 0 170 | 171 | return buffer 172 | } 173 | 174 | // Reset resets the ostream 175 | func (os *ostream) Reset(buffer []byte) { 176 | os.buf = buffer 177 | 178 | os.pos = 0 179 | if os.Len() > 0 { 180 | // If the byte array passed in is not empty, we set 181 | // pos to 8 indicating the last byte is fully used. 182 | os.pos = 8 183 | } 184 | } 185 | 186 | func (os *ostream) Rawbytes() ([]byte, int) { 187 | return os.buf, os.pos 188 | } 189 | 190 | func max(x, y int) int { 191 | if x > y { 192 | return x 193 | } 194 | return y 195 | } 196 | -------------------------------------------------------------------------------- /src/encoding/round_trip_test.go: -------------------------------------------------------------------------------- 1 | package encoding 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/stretchr/testify/require" 8 | ) 9 | 10 | type testValue struct { 11 | timestamp time.Time 12 | value float64 13 | } 14 | 15 | type roundTripTestCase struct { 16 | title string 17 | vals []testValue 18 | } 19 | 20 | // TODO(rartoul): This probably needs some kind of property test. 
21 | func TestRoundTripSimple(t *testing.T) { 22 | testCases := []roundTripTestCase{ 23 | { 24 | title: "simple in order", 25 | vals: []testValue{ 26 | { 27 | timestamp: time.Unix(0, 1), 28 | value: -1, 29 | }, 30 | { 31 | timestamp: time.Unix(0, 2), 32 | value: 0, 33 | }, 34 | { 35 | timestamp: time.Unix(0, 3), 36 | value: 1, 37 | }, 38 | }, 39 | }, 40 | { 41 | title: "simple out of order", 42 | vals: []testValue{ 43 | { 44 | timestamp: time.Unix(0, 3), 45 | value: -1, 46 | }, 47 | { 48 | timestamp: time.Unix(0, 2), 49 | value: 0, 50 | }, 51 | { 52 | timestamp: time.Unix(0, 1), 53 | value: 1, 54 | }, 55 | }, 56 | }, 57 | } 58 | 59 | for _, tc := range testCases { 60 | t.Run(tc.title, func(t *testing.T) { 61 | encoder := NewEncoder() 62 | // TODO(rartoul): This should probably be its own test. 63 | _, _, ok := encoder.LastEncoded() 64 | require.False(t, ok) 65 | 66 | for _, v := range tc.vals { 67 | err := encoder.Encode(v.timestamp, v.value) 68 | require.NoError(t, err) 69 | 70 | // TODO(rartoul): This should probably be its own test. 71 | lastEncodedT, lastEncodedV, ok := encoder.LastEncoded() 72 | require.True(t, ok) 73 | require.True(t, v.timestamp.Equal(lastEncodedT)) 74 | require.Equal(t, v.value, lastEncodedV) 75 | } 76 | 77 | encodedBytes := encoder.Bytes() 78 | require.Equal(t, 22, len(encodedBytes)) 79 | 80 | decoder := NewDecoder() 81 | decoder.Reset(encodedBytes) 82 | 83 | i := 0 84 | for decoder.Next() { 85 | currT, currV := decoder.Current() 86 | require.Equal(t, tc.vals[i].timestamp, currT) 87 | require.Equal(t, tc.vals[i].value, currV) 88 | i++ 89 | } 90 | require.NoError(t, decoder.Err()) 91 | require.Equal(t, len(tc.vals), i) 92 | }) 93 | } 94 | 95 | } 96 | 97 | func TestRoundTripWithStateAndRestore(t *testing.T) { 98 | values := []testValue{ 99 | { 100 | timestamp: time.Unix(0, 1), 101 | value: -1, 102 | }, 103 | { 104 | timestamp: time.Unix(0, 2), 105 | value: 0, 106 | }, 107 | { 108 | timestamp: time.Unix(0, 3), 109 | value: 1, 110 | }, 111 | } 112 | 113 | var ( 114 | accumulated []byte 115 | lastState []byte 116 | ) 117 | for _, v := range values { 118 | encoder := NewEncoder() 119 | if lastState != nil { 120 | err := encoder.Restore(lastState) 121 | require.NoError(t, err) 122 | } 123 | err := encoder.Encode(v.timestamp, v.value) 124 | require.NoError(t, err) 125 | lastState = encoder.State() 126 | 127 | b := encoder.Bytes() 128 | if accumulated == nil { 129 | accumulated = b 130 | } else { 131 | accumulated[len(accumulated)-1] = b[0] 132 | if len(b) > 1 { 133 | accumulated = append(accumulated, b[1:]...) 
134 | } 135 | } 136 | } 137 | 138 | require.Equal(t, 22, len(accumulated)) 139 | 140 | decoder := NewDecoder() 141 | decoder.Reset(accumulated) 142 | 143 | i := 0 144 | for decoder.Next() { 145 | currT, currV := decoder.Current() 146 | require.Equal(t, values[i].timestamp, currT) 147 | require.Equal(t, values[i].value, currV) 148 | i++ 149 | } 150 | require.NoError(t, decoder.Err()) 151 | require.Equal(t, len(values), i) 152 | } 153 | -------------------------------------------------------------------------------- /src/layer/dircompress/layer.go: -------------------------------------------------------------------------------- 1 | package dircompress 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "fmt" 7 | "time" 8 | 9 | "github.com/apple/foundationdb/bindings/go/src/fdb" 10 | "github.com/richardartoul/tsdb-layer/src/encoding" 11 | "github.com/richardartoul/tsdb-layer/src/layer" 12 | ) 13 | 14 | func NewLayer() layer.Layer { 15 | fdb.MustAPIVersion(610) 16 | // TODO(rartoul): Make this configurable. 17 | db := fdb.MustOpenDefault() 18 | return &directCompress{ 19 | db: db, 20 | } 21 | } 22 | 23 | type directCompress struct { 24 | db fdb.Database 25 | } 26 | 27 | type timeSeriesMetadata struct { 28 | State []byte 29 | LastByte byte 30 | } 31 | 32 | func (l *directCompress) Write(id string, timestamp time.Time, value float64) error { 33 | // TODO: Don't allocate 34 | return l.WriteBatch([]layer.Write{{ID: id, Timestamp: timestamp, Value: value}}) 35 | } 36 | 37 | func (l *directCompress) WriteBatch(writes []layer.Write) error { 38 | _, err := l.db.Transact(func(tr fdb.Transaction) (interface{}, error) { 39 | metadataFutures := make([]fdb.FutureByteSlice, 0, len(writes)) 40 | for _, w := range writes { 41 | metadataKey := newTimeseriesMetadataKeyFromID(w.ID) 42 | metadataFuture := tr.Get(metadataKey) 43 | metadataFutures = append(metadataFutures, metadataFuture) 44 | } 45 | 46 | for i, f := range metadataFutures { 47 | var ( 48 | w = writes[i] 49 | metadataKey = newTimeseriesMetadataKeyFromID(w.ID) 50 | // TODO: Error handling 51 | metadataBytes = f.MustGet() 52 | metaValue timeSeriesMetadata 53 | dataAppend []byte 54 | enc = encoding.NewEncoder() 55 | ) 56 | if len(metadataBytes) == 0 { 57 | // Never written. 58 | enc := encoding.NewEncoder() 59 | if err := enc.Encode(w.Timestamp, w.Value); err != nil { 60 | return nil, err 61 | } 62 | 63 | metaValue = timeSeriesMetadata{ 64 | State: enc.State(), 65 | } 66 | 67 | // TODO: Should lastByte be set here too? 68 | b := enc.Bytes() 69 | if len(b) > 1 { 70 | dataAppend = enc.Bytes()[:len(b)-1] 71 | } 72 | } else { 73 | // TODO(rartoul): Don't use JSON. 74 | if err := json.Unmarshal(metadataBytes, &metaValue); err != nil { 75 | return nil, err 76 | } 77 | 78 | // Has been written before, restore encoder state. 79 | if err := enc.Restore(metaValue.State); err != nil { 80 | return nil, err 81 | } 82 | 83 | if err := enc.Encode(w.Timestamp, w.Value); err != nil { 84 | return nil, err 85 | } 86 | 87 | // Ensure new state gets persisted. 88 | var ( 89 | newState = enc.State() 90 | b = enc.Bytes() 91 | ) 92 | if len(b) == 0 { 93 | return nil, errors.New("encoder bytes was length zero") 94 | } 95 | if len(b) == 1 { 96 | // The existing last byte was modified without adding any additional bytes. The last 97 | // byte is always tracked by the state so there is nothing to append here. 
98 | } 99 | if len(b) > 1 { 100 | // The last byte will be kept track of by the state, but any bytes preceding it are 101 | // new "complete" bytes which should be appended to the compressed stream. 102 | dataAppend = b[:len(b)-1] 103 | } 104 | metaValue.LastByte = b[len(b)-1] 105 | metaValue.State = newState 106 | } 107 | 108 | // TODO(rartoul): Don't use JSON. 109 | newMetadataBytes, err := json.Marshal(&metaValue) 110 | if err != nil { 111 | return nil, err 112 | } 113 | 114 | tr.Set(metadataKey, newMetadataBytes) 115 | // TODO(rartoul): Ensure it fits and if not split into new keys. 116 | dataKey := newTimeseriesDataKeyFromID(w.ID) 117 | tr.AppendIfFits(dataKey, dataAppend) 118 | } 119 | 120 | return nil, nil 121 | }) 122 | 123 | if err != nil { 124 | return err 125 | } 126 | return nil 127 | } 128 | 129 | func (l *directCompress) Read(id string) (encoding.ReadableDecoder, error) { 130 | stream, err := l.db.Transact(func(tr fdb.Transaction) (interface{}, error) { 131 | var ( 132 | metadataKey = newTimeseriesMetadataKeyFromID(id) 133 | dataKey = newTimeseriesDataKeyFromID(id) 134 | metadataFuture = tr.Get(metadataKey) 135 | dataFuture = tr.Get(dataKey) 136 | ) 137 | 138 | // TODO(rartoul): Proper error handling instead of Must() 139 | metadataBytes := metadataFuture.MustGet() 140 | dataBytes := dataFuture.MustGet() 141 | 142 | if len(metadataBytes) == 0 { 143 | // Does not exist. 144 | return nil, nil 145 | } 146 | 147 | var metaValue timeSeriesMetadata 148 | if err := json.Unmarshal(metadataBytes, &metaValue); err != nil { 149 | return nil, err 150 | } 151 | stream := append(dataBytes, metaValue.LastByte) 152 | return stream, nil 153 | }) 154 | if err != nil { 155 | return nil, err 156 | } 157 | 158 | dec := encoding.NewDecoder() 159 | dec.Reset(stream.([]byte)) 160 | return dec, nil 161 | } 162 | 163 | func newTimeseriesDataKeyFromID(id string) fdb.KeyConvertible { 164 | // TODO(rartoul): This function will need to be much more intelligent to handle 165 | // the fact that the data may be spread across multiple values. 166 | return fdb.Key(fmt.Sprintf("%s-data", id)) 167 | } 168 | 169 | func newTimeseriesMetadataKeyFromID(id string) fdb.KeyConvertible { 170 | return fdb.Key(fmt.Sprintf("%s-metadata", id)) 171 | } 172 | -------------------------------------------------------------------------------- /src/layer/dircompress/layer_test.go: -------------------------------------------------------------------------------- 1 | package dircompress 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/stretchr/testify/require" 8 | ) 9 | 10 | type testValue struct { 11 | timestamp time.Time 12 | value float64 13 | } 14 | 15 | // TODO(rartoul): This probably needs some kind of property test. 
16 | func TestRoundTripSimple(t *testing.T) { 17 | tsID := "test-id-1" 18 | values := []testValue{ 19 | { 20 | timestamp: time.Unix(0, 1), 21 | value: -1, 22 | }, 23 | { 24 | timestamp: time.Unix(0, 2), 25 | value: 0, 26 | }, 27 | { 28 | timestamp: time.Unix(0, 3), 29 | value: 1, 30 | }, 31 | } 32 | 33 | layer := NewLayer() 34 | for _, v := range values { 35 | err := layer.Write(tsID, v.timestamp, v.value) 36 | require.NoError(t, err) 37 | } 38 | 39 | decoder, err := layer.Read(tsID) 40 | require.NoError(t, err) 41 | 42 | i := 0 43 | for decoder.Next() { 44 | currT, currV := decoder.Current() 45 | require.Equal(t, values[i].timestamp, currT) 46 | require.Equal(t, values[i].value, currV) 47 | i++ 48 | } 49 | require.NoError(t, decoder.Err()) 50 | require.Equal(t, len(values), i) 51 | } 52 | -------------------------------------------------------------------------------- /src/layer/raw/layer.go: -------------------------------------------------------------------------------- 1 | package raw 2 | 3 | import ( 4 | "errors" 5 | "time" 6 | 7 | "github.com/apple/foundationdb/bindings/go/src/fdb" 8 | "github.com/apple/foundationdb/bindings/go/src/fdb/tuple" 9 | "github.com/richardartoul/tsdb-layer/src/encoding" 10 | "github.com/richardartoul/tsdb-layer/src/layer" 11 | ) 12 | 13 | func NewLayer() layer.Layer { 14 | fdb.MustAPIVersion(610) 15 | // TODO(rartoul): Make this configurable. 16 | db := fdb.MustOpenDefault() 17 | return &raw{ 18 | db: db, 19 | } 20 | } 21 | 22 | type raw struct { 23 | db fdb.Database 24 | } 25 | 26 | func (l *raw) Write(id string, timestamp time.Time, value float64) error { 27 | // TODO: Don't allocate 28 | return l.WriteBatch([]layer.Write{{ID: id, Timestamp: timestamp, Value: value}}) 29 | } 30 | 31 | func (l *raw) WriteBatch(writes []layer.Write) error { 32 | _, err := l.db.Transact(func(tr fdb.Transaction) (interface{}, error) { 33 | for _, w := range writes { 34 | key := tuple.Tuple{w.ID, w.Timestamp.UnixNano()} 35 | tr.Set(key, tuple.Tuple{w.Value}.Pack()) 36 | } 37 | return nil, nil 38 | }) 39 | 40 | return err 41 | } 42 | 43 | func (l *raw) Read(id string) (encoding.ReadableDecoder, error) { 44 | return nil, errors.New("not-implemented") 45 | } 46 | -------------------------------------------------------------------------------- /src/layer/rawblock/buffer.go: -------------------------------------------------------------------------------- 1 | package rawblock 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "sync" 7 | "time" 8 | 9 | "github.com/apple/foundationdb/bindings/go/src/fdb" 10 | "github.com/apple/foundationdb/bindings/go/src/fdb/tuple" 11 | "github.com/richardartoul/tsdb-layer/src/encoding" 12 | "github.com/richardartoul/tsdb-layer/src/layer" 13 | ) 14 | 15 | const ( 16 | bufferKeyPrefix = "b-" 17 | metadataKeyPostfix = "-meta" 18 | tsChunkKeyPrefix = "-chunk-" 19 | 20 | targetChunkSize = 4096 21 | flushBatchSize = 128 22 | ) 23 | 24 | type tsMetadata struct { 25 | Chunks []chunkMetadata 26 | } 27 | 28 | func newTSMetadata() tsMetadata { 29 | return tsMetadata{} 30 | } 31 | 32 | type chunkMetadata struct { 33 | Key []byte 34 | First time.Time 35 | Last time.Time 36 | SizeBytes int 37 | } 38 | 39 | func newChunkMetadata(key []byte, first, last time.Time, sizeBytes int) chunkMetadata { 40 | return chunkMetadata{ 41 | Key: key, 42 | First: first, 43 | Last: last, 44 | SizeBytes: sizeBytes, 45 | } 46 | } 47 | 48 | type Buffer interface { 49 | Write(writes []layer.Write) error 50 | Read(id string) (encoding.MultiDecoder, bool, error) 51 | Flush() error 52 | } 
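// Illustrative usage sketch (assumes a reachable FoundationDB cluster; the series ID and
// values below are arbitrary):
//
//	db := fdb.MustOpenDefault()
//	buf := NewBuffer(db)
//	err := buf.Write([]layer.Write{{ID: "some-series", Timestamp: time.Now(), Value: 42}})
//	// ... handle err ...
//	dec, ok, err := buf.Read("some-series")
//	if err == nil && ok {
//		for dec.Next() {
//			t, v := dec.Current()
//			_, _ = t, v
//		}
//	}
//	// A background caller is expected to invoke buf.Flush() periodically to move
//	// buffered data into FDB chunks.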
53 | 54 | // TODO(rartoul): This entire thing needs to be refactored to support creating 55 | // new encoders (not just during flush) so that encoders can be split when: 56 | // 1. An existing encoder gets too big (so we don't end up with huge streams 57 | // that later need to be broken up during flush into smaller streams) 58 | // 2. An out-of-order write comes in. 59 | type buffer struct { 60 | sync.Mutex 61 | db fdb.Database 62 | encoders map[string][]encoding.Encoder 63 | } 64 | 65 | func NewBuffer(db fdb.Database) Buffer { 66 | return &buffer{ 67 | db: db, 68 | encoders: map[string][]encoding.Encoder{}, 69 | } 70 | } 71 | 72 | // TODO(rartoul): This should split up writes into a new encoder once the existing 73 | // encoder has reached a certain size so that a given stream cant grow too large 74 | // inbetween flushes (which is an issue because fdb has maximum sizes for a given 75 | // value). 76 | // TODO(rartoul): Should have per-write error handling. 77 | func (b *buffer) Write(writes []layer.Write) error { 78 | b.Lock() 79 | defer b.Unlock() 80 | 81 | for _, w := range writes { 82 | encoders, ok := b.encoders[w.ID] 83 | if !ok { 84 | encoders = []encoding.Encoder{encoding.NewEncoder()} 85 | b.encoders[w.ID] = encoders 86 | } 87 | 88 | enc := encoders[len(encoders)-1] 89 | lastT, _, hasWrittenAnyValues := enc.LastEncoded() 90 | if hasWrittenAnyValues { 91 | if w.Timestamp.Before(lastT) { 92 | // TODO(rartoul): Remove this restriction with multiple encoders. 93 | return fmt.Errorf( 94 | "cannot write data out of order, series: %s, prevTimestamp: %s, currTimestamp: %s", 95 | w.ID, lastT.String(), w.Timestamp.String()) 96 | } 97 | if w.Timestamp.Equal(lastT) { 98 | return fmt.Errorf( 99 | "cannot upsert existing values, series: %s, currTimestamp: %s", 100 | w.ID, lastT.String()) 101 | } 102 | } 103 | 104 | if err := enc.Encode(w.Timestamp, w.Value); err != nil { 105 | return err 106 | } 107 | } 108 | 109 | return nil 110 | } 111 | 112 | // TODO(rartoul): This should accept a time range to query and use that information 113 | // to determine which chunks to pull back instead of just reading all of them. 114 | func (b *buffer) Read(id string) (encoding.MultiDecoder, bool, error) { 115 | var decoders []encoding.Decoder 116 | _, err := b.db.Transact(func(tr fdb.Transaction) (interface{}, error) { 117 | metadataKey := metadataKey(id) 118 | metaBytes, err := tr.Get(metadataKey).Get() 119 | if err != nil { 120 | return nil, err 121 | } 122 | if metaBytes == nil { 123 | return nil, nil 124 | } 125 | 126 | var metadata tsMetadata 127 | if err := json.Unmarshal(metaBytes, &metadata); err != nil { 128 | return nil, err 129 | } 130 | 131 | for _, chunk := range metadata.Chunks { 132 | chunkBytes, err := tr.Get(fdb.Key(chunk.Key)).Get() 133 | if err != nil { 134 | return nil, err 135 | } 136 | dec := encoding.NewDecoder() 137 | dec.Reset(chunkBytes) 138 | decoders = append(decoders, dec) 139 | } 140 | return nil, nil 141 | }) 142 | if err != nil { 143 | return nil, false, err 144 | } 145 | 146 | encoders, ok := b.encoders[id] 147 | if ok { 148 | decoders = append(decoders, encodersToDecoders(encoders)...) 
149 | } 150 | 151 | if len(decoders) == 0 { 152 | return nil, false, nil 153 | } 154 | 155 | multiDec := encoding.NewMultiDecoder() 156 | multiDec.Reset(decoders) 157 | return multiDec, true, nil 158 | } 159 | 160 | func encodersToDecoders(encs []encoding.Encoder) []encoding.Decoder { 161 | decs := make([]encoding.Decoder, 0, len(encs)) 162 | for _, enc := range encs { 163 | dec := encoding.NewDecoder() 164 | dec.Reset(enc.Bytes()) 165 | decs = append(decs, dec) 166 | } 167 | return decs 168 | } 169 | 170 | // TODO(rartoul): Instead of performing one transaction per series it would be more efficient 171 | // to collect "batches" of series and then write them all together in one fdb transaction. 172 | func (b *buffer) Flush() error { 173 | // Manually control locking so map can be iterated while still being concurrently 174 | // accessed. 175 | b.Lock() 176 | 177 | var pendingFlush []toFlush 178 | for seriesID, encoders := range b.encoders { 179 | if len(encoders) == 0 { 180 | continue 181 | } 182 | 183 | // Append a new encoder to the list of existing encoders. Only the last encoder 184 | // in the list is ever written to so this effectively renders all previous 185 | // encoders immutable which can be taken advantage of to flush them without 186 | // holding a lock on the entire map. 187 | encoders = append(encoders, encoding.NewEncoder()) 188 | encodersToFlush := encoders[:len(encoders)-1] 189 | b.encoders[seriesID] = encoders 190 | 191 | var streams [][]byte 192 | for _, enc := range encodersToFlush { 193 | streams = append(streams, enc.Bytes()) 194 | } 195 | pendingFlush = append(pendingFlush, toFlush{ 196 | id: seriesID, 197 | streams: streams, 198 | }) 199 | 200 | if len(pendingFlush) < flushBatchSize { 201 | continue 202 | } 203 | 204 | b.Unlock() 205 | if err := b.flush(pendingFlush); err != nil { 206 | return err 207 | } 208 | pendingFlush = pendingFlush[:0] 209 | 210 | // Hold the lock for the next iteration. 211 | b.Lock() 212 | } 213 | b.Unlock() 214 | if err := b.flush(pendingFlush); err != nil { 215 | return err 216 | } 217 | return nil 218 | } 219 | 220 | type toFlush struct { 221 | id string 222 | streams [][]byte 223 | } 224 | 225 | func (b *buffer) flush(toFlush []toFlush) error { 226 | if len(toFlush) == 0 { 227 | return nil 228 | } 229 | 230 | _, err := b.db.Transact(func(tr fdb.Transaction) (interface{}, error) { 231 | var metadataFutures []fdb.FutureByteSlice 232 | // Start parallel fetches for each metadata. 233 | for _, series := range toFlush { 234 | metadataKey := metadataKey(series.id) 235 | metadataFuture := tr.Get(metadataKey) 236 | metadataFutures = append(metadataFutures, metadataFuture) 237 | } 238 | 239 | for i, series := range toFlush { 240 | metaBytes, err := metadataFutures[i].Get() 241 | if err != nil { 242 | return nil, err 243 | } 244 | 245 | var metadata tsMetadata 246 | if metaBytes == nil { 247 | metadata = newTSMetadata() 248 | } else { 249 | // TODO(rartoul): Don't use JSON. 250 | if err := json.Unmarshal(metaBytes, &metadata); err != nil { 251 | return nil, err 252 | } 253 | } 254 | 255 | stream, err := encoding.MergeStreams(series.streams...) 256 | if err != nil { 257 | return nil, err 258 | } 259 | 260 | var newChunkKey fdb.Key 261 | if len(metadata.Chunks) == 0 { 262 | newChunkKey = tsChunkKey(series.id, 0) 263 | metadata.Chunks = append(metadata.Chunks, newChunkMetadata( 264 | newChunkKey, 265 | time.Unix(0, 0), // TODO(rartoul): Fill this in. 266 | time.Unix(0, 0), // TODO(rartoul): Fill this in. 
267 | len(stream), 268 | )) 269 | } else { 270 | lastChunkIdx := len(metadata.Chunks) - 1 271 | lastChunk := metadata.Chunks[lastChunkIdx] 272 | // TODO(rartoul): Make compaction/merge logic more intelligent. 273 | if lastChunk.SizeBytes+len(stream) <= targetChunkSize { 274 | // Merge with last chunk. 275 | newChunkKey = fdb.Key(lastChunk.Key) 276 | // TODO(rartoul): This is inefficient because it forces a synchronous wait 277 | // on a read from fdb. This should be refactored so that all of the chunks 278 | // that need to be read can be fetched in parallel similar to how the metadata 279 | // futures are fetched in parallel above. 280 | existingStream, err := tr.Get(newChunkKey).Get() 281 | if err != nil { 282 | return nil, err 283 | } 284 | stream, err = encoding.MergeStreams(existingStream, stream) 285 | if err != nil { 286 | return nil, err 287 | } 288 | // TODO(rartoul): Update first and last properties here as well. 289 | metadata.Chunks[lastChunkIdx].SizeBytes = len(stream) 290 | } else { 291 | // Insert new chunk. 292 | newChunkKey = tsChunkKey(series.id, lastChunkIdx) 293 | metadata.Chunks = append(metadata.Chunks, newChunkMetadata( 294 | newChunkKey, 295 | time.Unix(0, 0), // TODO(rartoul): Fill this in. 296 | time.Unix(0, 0), // TODO(rartoul): Fill this in. 297 | len(stream), 298 | )) 299 | } 300 | } 301 | 302 | newMetadataBytes, err := json.Marshal(metadata) 303 | if err != nil { 304 | return nil, err 305 | } 306 | 307 | metadataKey := metadataKey(series.id) 308 | tr.Set(metadataKey, newMetadataBytes) 309 | tr.Set(newChunkKey, stream) 310 | } 311 | return nil, nil 312 | }) 313 | if err != nil { 314 | return err 315 | } 316 | 317 | b.Lock() 318 | defer b.Unlock() 319 | for _, series := range toFlush { 320 | encoders, ok := b.encoders[series.id] 321 | if !ok { 322 | return fmt.Errorf("flushed series %s which does not exist in encoders", series.id) 323 | } 324 | 325 | // Now that all of the immutable encoders have been flushed, they can be removed 326 | // from the list of existing encoders because they can now be read from FDB directly. 327 | // 328 | // TODO(rartoul): This logic works right now because the only thing that can 329 | // trigger creating a new encoder for an existing series is a flush and because flushing 330 | // is single-threaded. Once there is support for out-of-order writes, this logic will need 331 | // to change since there will be no way to determine if all of the encoder except the last 332 | // have been flushed yet (or could just force out of order writes to merge on demand?). 
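// Removing the flushed encoders also prevents reads from returning the same samples twice
// (once from the newly written FDB chunk and once from the stale in-memory encoder).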
333 | b.encoders[series.id] = encoders[len(encoders)-1:] 334 | } 335 | return nil 336 | } 337 | 338 | func metadataKey(id string) fdb.Key { 339 | // TODO(rartoul): Not sure if this is ideal key structure/ 340 | return tuple.Tuple{bufferKeyPrefix, id, metadataKeyPostfix}.Pack() 341 | } 342 | 343 | func tsChunkKey(id string, chunkNum int) fdb.Key { 344 | return tuple.Tuple{bufferKeyPrefix, id, tsChunkKeyPrefix, chunkNum}.Pack() 345 | } 346 | -------------------------------------------------------------------------------- /src/layer/rawblock/buffer_test.go: -------------------------------------------------------------------------------- 1 | package rawblock 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | "time" 7 | 8 | "github.com/richardartoul/tsdb-layer/src/layer" 9 | 10 | "github.com/stretchr/testify/require" 11 | ) 12 | 13 | const ( 14 | testID = "test-id" 15 | ) 16 | 17 | type testValue struct { 18 | timestamp time.Time 19 | value float64 20 | } 21 | 22 | type bufferWriteReadTestCase struct { 23 | title string 24 | vals []testValue 25 | } 26 | 27 | func TestBufferWriteRead(t *testing.T) { 28 | testCases := []bufferWriteReadTestCase{ 29 | { 30 | title: "in order values", 31 | vals: []testValue{{timestamp: time.Unix(0, 0), value: 0}, {timestamp: time.Unix(1, 0), value: 1}}, 32 | }, 33 | // TODO(rartoul): Not supported right now. 34 | // { 35 | // title: "out of order values", 36 | // vals: []testValue{{timestamp: time.Unix(1, 0), value: 1}, {timestamp: time.Unix(0, 0), value: 0}}, 37 | // }, 38 | } 39 | 40 | for _, tc := range testCases { 41 | t.Run(tc.title, func(t *testing.T) { 42 | db, cleanup := newTestFDB() 43 | defer cleanup() 44 | 45 | buffer := NewBuffer(db) 46 | writes := []layer.Write{} 47 | for _, val := range tc.vals { 48 | writes = append( 49 | writes, 50 | layer.Write{ 51 | ID: testID, 52 | Timestamp: val.timestamp, 53 | Value: val.value}) 54 | } 55 | require.NoError(t, buffer.Write(writes)) 56 | 57 | assertReadFn := func() { 58 | multiDec, ok, err := buffer.Read(testID) 59 | require.NoError(t, err) 60 | require.True(t, ok) 61 | 62 | i := 0 63 | for multiDec.Next() { 64 | currT, currV := multiDec.Current() 65 | require.True( 66 | t, 67 | tc.vals[i].timestamp.Equal(currT), 68 | fmt.Sprintf("expected %s but got %s", tc.vals[i].timestamp.String(), currT.String())) 69 | require.Equal(t, tc.vals[i].value, currV) 70 | i++ 71 | } 72 | require.NoError(t, multiDec.Err()) 73 | require.Equal(t, len(tc.vals), i) 74 | } 75 | 76 | // Ensure reads work correctly before and after flushing. 77 | assertReadFn() 78 | require.NoError(t, buffer.Flush()) 79 | assertReadFn() 80 | }) 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/layer/rawblock/commitlog.go: -------------------------------------------------------------------------------- 1 | package rawblock 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "log" 7 | "math" 8 | "sync" 9 | "time" 10 | 11 | "github.com/apple/foundationdb/bindings/go/src/fdb" 12 | "github.com/apple/foundationdb/bindings/go/src/fdb/tuple" 13 | ) 14 | 15 | const ( 16 | // Multiple of fdb page size. 
17 | defaultBatchSize = 4096 * 24 18 | defaultMaxPendingBytes = 10000000 19 | defaultFlushEvery = 10 * time.Millisecond 20 | 21 | commitLogKey = "commitlog-" 22 | commitLogKeyTupleLength = 2 23 | ) 24 | 25 | type clStatus int 26 | 27 | const ( 28 | clStatusUnopened clStatus = iota 29 | clStatusOpen 30 | clStatusClosed 31 | ) 32 | 33 | // truncationToken is a token that can be passed to the commitlog to truncate the commitlogs up to 34 | // a specific point. It should be treated as opaque by external callers. 35 | type truncationToken struct { 36 | upTo tuple.Tuple 37 | } 38 | 39 | // Commitlog is the interface for an FDB-backed commitlog. 40 | type Commitlog interface { 41 | Write([]byte) error 42 | Open() error 43 | Close() error 44 | WaitForRotation() (truncationToken, error) 45 | Truncate(token truncationToken) error 46 | } 47 | 48 | // CommitlogOptions encapsulates the options for the commit log. 49 | type CommitlogOptions struct { 50 | IdealBatchSize int 51 | MaxPendingBytes int 52 | FlushEvery time.Duration 53 | } 54 | 55 | // NewCommitlogOptions creates a new CommitlogOptions. 56 | func NewCommitlogOptions() CommitlogOptions { 57 | return CommitlogOptions{ 58 | IdealBatchSize: defaultBatchSize, 59 | MaxPendingBytes: defaultMaxPendingBytes, 60 | FlushEvery: defaultFlushEvery, 61 | } 62 | } 63 | 64 | type flushOutcome struct { 65 | // TODO(rartoul): Fix this, but last ID can be nil in the case 66 | // that there was no data to flush. This is useful because it 67 | // enables the WaitForRotation() API. 68 | lastID tuple.Tuple 69 | nextID tuple.Tuple 70 | err error 71 | doneCh chan struct{} 72 | } 73 | 74 | func newFlushOutcome() *flushOutcome { 75 | return &flushOutcome{ 76 | doneCh: make(chan struct{}, 0), 77 | } 78 | } 79 | 80 | func (f *flushOutcome) waitForFlush() error { 81 | <-f.doneCh 82 | return f.err 83 | } 84 | 85 | func (f *flushOutcome) notify(lastID, nextID tuple.Tuple, err error) { 86 | f.lastID = lastID 87 | f.nextID = nextID 88 | f.err = err 89 | close(f.doneCh) 90 | } 91 | 92 | type commitlog struct { 93 | sync.Mutex 94 | status clStatus 95 | db fdb.Database 96 | prevBatch []byte 97 | currBatch []byte 98 | lastFlushTime time.Time 99 | lastIdx int64 100 | flushOutcome *flushOutcome 101 | closeCh chan struct{} 102 | closeDoneCh chan error 103 | opts CommitlogOptions 104 | } 105 | 106 | // NewCommitlog creates a new commitlog. 107 | func NewCommitlog(db fdb.Database, opts CommitlogOptions) Commitlog { 108 | return &commitlog{ 109 | status: clStatusUnopened, 110 | db: db, 111 | flushOutcome: newFlushOutcome(), 112 | closeCh: make(chan struct{}, 1), 113 | closeDoneCh: make(chan error, 1), 114 | opts: opts, 115 | } 116 | } 117 | 118 | func (c *commitlog) Open() error { 119 | c.Lock() 120 | defer c.Unlock() 121 | if c.status != clStatusUnopened { 122 | return errors.New("commitlog cannot be opened more than once") 123 | } 124 | 125 | // "Bootstrap" the latest existing index to maintain a monotonically increasing 126 | // value for the commitlog chunk indices. 
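// Without this a restarted process would begin again at index 0 and could overwrite
// commitlog chunks that have not yet been flushed and truncated.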
127 | existingIdx, ok, err := c.getLatestExistingIndex() 128 | if err != nil { 129 | return err 130 | } 131 | if !ok { 132 | existingIdx = -1 133 | } 134 | c.lastIdx = existingIdx 135 | fmt.Println("last existing IDX", c.lastIdx) 136 | 137 | c.status = clStatusOpen 138 | 139 | go func() { 140 | for { 141 | i := 0 142 | select { 143 | case <-c.closeCh: 144 | c.closeDoneCh <- c.flush() 145 | return 146 | default: 147 | } 148 | time.Sleep(time.Millisecond) 149 | if err := c.flush(); err != nil { 150 | log.Printf("error flushing commitlog: %v", err) 151 | } 152 | i++ 153 | } 154 | }() 155 | 156 | return nil 157 | } 158 | 159 | func (c *commitlog) Close() error { 160 | c.Lock() 161 | if c.status != clStatusOpen { 162 | c.Unlock() 163 | return errors.New("cannot close commit log that is not open") 164 | } 165 | c.status = clStatusClosed 166 | c.Unlock() 167 | 168 | c.closeCh <- struct{}{} 169 | return <-c.closeDoneCh 170 | } 171 | 172 | // TODO(rartoul): Kind of gross that this just takes a []byte but more 173 | // flexible for now. 174 | func (c *commitlog) Write(b []byte) error { 175 | if len(b) == 0 { 176 | return errors.New("commit log can not write empty chunk") 177 | } 178 | 179 | c.Lock() 180 | if c.status != clStatusOpen { 181 | c.Unlock() 182 | return errors.New("cannot write into commit log that is not open") 183 | } 184 | 185 | if len(c.currBatch)+len(b) > c.opts.MaxPendingBytes { 186 | c.Unlock() 187 | return errors.New("commit log queue is full") 188 | } 189 | 190 | c.currBatch = append(c.currBatch, b...) 191 | currFlushOutcome := c.flushOutcome 192 | c.Unlock() 193 | return currFlushOutcome.waitForFlush() 194 | } 195 | 196 | func (c *commitlog) Truncate(token truncationToken) error { 197 | if token.upTo == nil { 198 | // This can occur in the situation where there were no existing commitlogs when 199 | // the truncationToken was generated by a call to WaitForRotation(). 200 | return nil 201 | } 202 | 203 | _, err := c.db.Transact(func(tr fdb.Transaction) (interface{}, error) { 204 | tr.ClearRange(fdb.KeyRange{Begin: tuple.Tuple{commitLogKey}, End: token.upTo}) 205 | return nil, nil 206 | }) 207 | 208 | return err 209 | } 210 | 211 | func (c *commitlog) WaitForRotation() (truncationToken, error) { 212 | c.Lock() 213 | if c.status != clStatusOpen { 214 | c.Unlock() 215 | return truncationToken{}, errors.New("cannot wait for commit log rotation if commit log is not open") 216 | } 217 | currFlushOutcome := c.flushOutcome 218 | c.Unlock() 219 | 220 | if err := currFlushOutcome.waitForFlush(); err != nil { 221 | return truncationToken{}, err 222 | } 223 | 224 | // nextID instead of lastID because fdb clear ranges are exclusive on the end. 225 | return truncationToken{upTo: currFlushOutcome.nextID}, nil 226 | } 227 | 228 | func (c *commitlog) flush() error { 229 | c.Lock() 230 | currFlushOutcome := c.flushOutcome 231 | c.flushOutcome = newFlushOutcome() 232 | 233 | var ( 234 | lastKey tuple.Tuple 235 | nextKey tuple.Tuple 236 | ) 237 | if !(time.Since(c.lastFlushTime) >= c.opts.FlushEvery && len(c.currBatch) > 0) { 238 | c.Unlock() 239 | // Notify anyways so that the WaitForRotation() API can function. 
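// If lastIdx is still -1 nothing has ever been written, so waiters receive a zero-valued
// token which Truncate() treats as a no-op.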
240 | if c.lastIdx >= 0 { 241 | lastKey = commitlogKeyFromIdx(c.lastIdx) 242 | nextKey = commitlogKeyFromIdx(c.lastIdx + 1) 243 | } 244 | currFlushOutcome.notify(lastKey, nextKey, nil) 245 | return nil 246 | } 247 | 248 | toWrite := c.currBatch 249 | c.currBatch, c.prevBatch = c.prevBatch, c.currBatch 250 | c.currBatch = c.currBatch[:0] 251 | c.Unlock() 252 | 253 | _, err := c.db.Transact(func(tr fdb.Transaction) (interface{}, error) { 254 | // TODO(rartoul): Need to be smarter about this because don't want to actually 255 | // break chunks across writes I.E every call to WriteBatch() should end up 256 | // in one key so that each key is a complete unit. 257 | startIdx := 0 258 | for startIdx < len(toWrite) { 259 | lastKey = c.nextKey() 260 | nextKey = commitlogKeyFromIdx(c.lastIdx + 1) 261 | endIdx := startIdx + c.opts.IdealBatchSize 262 | if endIdx > len(toWrite) { 263 | endIdx = len(toWrite) 264 | } 265 | tr.Set(lastKey, toWrite[startIdx:endIdx]) 266 | startIdx = endIdx 267 | } 268 | 269 | return nil, nil 270 | }) 271 | currFlushOutcome.notify(lastKey, nextKey, err) 272 | return err 273 | } 274 | 275 | func (c *commitlog) nextKey() tuple.Tuple { 276 | // TODO(rartoul): This should have some kind of host identifier in it. 277 | nextKey := commitlogKeyFromIdx(c.lastIdx + 1) 278 | // Safe to update this optimistically since even if the write ends up failing 279 | // its ok to have "gaps". 280 | // 281 | // Also safe to do this without any locking as this function is always called 282 | // in a single-threaded manner. 283 | c.lastIdx++ 284 | return nextKey 285 | } 286 | 287 | // TODO(rartoul): This could run afoul of fdb transction time and/or size limits if there 288 | // are too many commitlog chunks. Should be refactored to use a limit and break into multiple 289 | // transactions if necessary. 
290 | func (c *commitlog) getLatestExistingIndex() (int64, bool, error) { 291 | key, err := c.db.Transact(func(tr fdb.Transaction) (interface{}, error) { 292 | var ( 293 | rangeResult = tr.GetRange(fdb.KeyRange{ 294 | Begin: tuple.Tuple{commitLogKey, 0}, 295 | End: tuple.Tuple{commitLogKey, math.MaxInt64}}, fdb.RangeOptions{}) 296 | iter = rangeResult.Iterator() 297 | key fdb.Key 298 | ) 299 | for iter.Advance() { 300 | curr, err := iter.Get() 301 | if err != nil { 302 | return nil, err 303 | } 304 | key = curr.Key 305 | } 306 | 307 | if key == nil { 308 | return nil, nil 309 | } 310 | return key, nil 311 | }) 312 | 313 | if err != nil { 314 | return -1, false, err 315 | } 316 | if key == nil { 317 | return -1, false, nil 318 | } 319 | 320 | keyTuple, err := tuple.Unpack(key.(fdb.Key)) 321 | if err != nil { 322 | return -1, false, err 323 | } 324 | 325 | if len(keyTuple) != commitLogKeyTupleLength { 326 | return -1, false, fmt.Errorf( 327 | "malformed commitlog key tuple, expected len: %d, but was: %d, raw: %v", 328 | commitLogKeyTupleLength, len(keyTuple), key) 329 | } 330 | idx, ok := keyTuple[1].(int64) 331 | if !ok { 332 | return -1, false, errors.New("malformed commitlog key tuple, expected second value to be of type int64") 333 | } 334 | return idx, true, nil 335 | } 336 | 337 | type commitlogKey struct { 338 | index int 339 | } 340 | 341 | // func (k *commitLogKey) 342 | func commitlogKeyFromIdx(idx int64) tuple.Tuple { 343 | return tuple.Tuple{commitLogKey, idx} 344 | } 345 | -------------------------------------------------------------------------------- /src/layer/rawblock/commitlog_test.go: -------------------------------------------------------------------------------- 1 | package rawblock 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/require" 7 | ) 8 | 9 | func TestCommitlogBootstrapLastIndex(t *testing.T) { 10 | db, cleanup := newTestFDB() 11 | defer cleanup() 12 | 13 | cl := NewCommitlog(db, NewCommitlogOptions()) 14 | require.NoError(t, cl.Open()) 15 | 16 | clImpl := cl.(*commitlog) 17 | // Verify it starts at -1. 18 | require.Equal(t, int64(-1), clImpl.lastIdx) 19 | // Issue two writes sequentially so it will increase by 2 (+1 for each flush). 20 | require.NoError(t, cl.Write([]byte("some-data"))) 21 | require.Equal(t, int64(0), clImpl.lastIdx) 22 | require.NoError(t, cl.Write([]byte("some-data"))) 23 | require.Equal(t, int64(1), clImpl.lastIdx) 24 | 25 | require.NoError(t, cl.Close()) 26 | 27 | // Ensure correct value is bootstrapped. 28 | cl = NewCommitlog(db, NewCommitlogOptions()) 29 | require.NoError(t, cl.Open()) 30 | require.Equal(t, int64(1), clImpl.lastIdx) 31 | require.NoError(t, cl.Close()) 32 | } 33 | 34 | func TestCommitlogTruncation(t *testing.T) { 35 | db, cleanup := newTestFDB() 36 | defer cleanup() 37 | 38 | cl := NewCommitlog(db, NewCommitlogOptions()).(*commitlog) 39 | require.NoError(t, cl.Open()) 40 | 41 | // Verify it starts at -1. 42 | require.Equal(t, int64(-1), cl.lastIdx) 43 | // Issue two writes sequentially so it will increase by 2 (+1 for each flush). 44 | require.NoError(t, cl.Write([]byte("some-data"))) 45 | require.Equal(t, int64(0), cl.lastIdx) 46 | require.NoError(t, cl.Write([]byte("some-data"))) 47 | require.Equal(t, int64(1), cl.lastIdx) 48 | 49 | truncToken, err := cl.WaitForRotation() 50 | require.NoError(t, err) 51 | // Use the truncation token to truncate all commitlog chunks before 2. 
52 | require.NoError(t, cl.Truncate(truncToken)) 53 | require.NoError(t, cl.Close()) 54 | 55 | // Ensure all commitlog chunks were cleared. 56 | cl = NewCommitlog(db, NewCommitlogOptions()).(*commitlog) 57 | require.NoError(t, cl.Open()) 58 | require.Equal(t, int64(-1), cl.lastIdx) 59 | 60 | // Issue a write before waiting for rotation (this should be cleared by the 61 | // call to Truncate()). 62 | require.NoError(t, cl.Write([]byte("some-data"))) 63 | require.Equal(t, int64(0), cl.lastIdx) 64 | 65 | truncToken, err = cl.WaitForRotation() 66 | require.NoError(t, err) 67 | // Issue one write after waiting for rotation so that there is one commitlog 68 | // chunk that should be deleted by truncation (0) and one that should remain(1). 69 | require.NoError(t, cl.Write([]byte("some-data"))) 70 | require.Equal(t, int64(1), cl.lastIdx) 71 | require.NoError(t, cl.Truncate(truncToken)) 72 | require.NoError(t, cl.Close()) 73 | 74 | // Ensure that chunk 0 (written before WaitForRotation()) was cleared but chunk 1 75 | // (writen after WaitForRotation()) remains. 76 | cl = NewCommitlog(db, NewCommitlogOptions()).(*commitlog) 77 | require.NoError(t, cl.Open()) 78 | require.Equal(t, int64(1), cl.lastIdx) 79 | } 80 | -------------------------------------------------------------------------------- /src/layer/rawblock/common_test.go: -------------------------------------------------------------------------------- 1 | package rawblock 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/apple/foundationdb/bindings/go/src/fdb" 7 | "github.com/apple/foundationdb/bindings/go/src/fdb/tuple" 8 | ) 9 | 10 | type cleanupFn func() 11 | 12 | func newTestFDB() (fdb.Database, cleanupFn) { 13 | fdb.MustAPIVersion(610) 14 | // TODO(rartoul): Should truncate database before and after. 15 | db := fdb.MustOpenDefault() 16 | truncateFDB(db) 17 | cleanupFn := func() { truncateFDB(db) } 18 | return db, cleanupFn 19 | } 20 | 21 | func truncateFDB(db fdb.Database) { 22 | _, err := db.Transact(func(tr fdb.Transaction) (interface{}, error) { 23 | tr.ClearRange(fdb.KeyRange{Begin: tuple.Tuple{""}, End: tuple.Tuple{0xFF}}) 24 | return nil, nil 25 | }) 26 | if err != nil { 27 | panic(fmt.Sprintf("error truncating fdb: %v", err)) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/layer/rawblock/layer.go: -------------------------------------------------------------------------------- 1 | package rawblock 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "log" 7 | "math" 8 | "sync" 9 | "time" 10 | 11 | "github.com/apple/foundationdb/bindings/go/src/fdb" 12 | "github.com/richardartoul/tsdb-layer/src/encoding" 13 | "github.com/richardartoul/tsdb-layer/src/layer" 14 | ) 15 | 16 | const ( 17 | persistLoopInterval = 100 * time.Millisecond 18 | ) 19 | 20 | func NewLayer() layer.Layer { 21 | fdb.MustAPIVersion(610) 22 | // TODO(rartoul): Make this configurable. 
23 | db := fdb.MustOpenDefault() 24 | cl := NewCommitlog(db, NewCommitlogOptions()) 25 | if err := cl.Open(); err != nil { 26 | // TODO(rartoul): Clean this up 27 | panic(err) 28 | } 29 | buffer := NewBuffer(db) 30 | 31 | l := &rawBlock{ 32 | db: db, 33 | cl: cl, 34 | buffer: buffer, 35 | bytesPool: newBytesPool(1024, 16000, 4096), 36 | } 37 | go l.startPersistLoop() 38 | return l 39 | } 40 | 41 | type rawBlock struct { 42 | db fdb.Database 43 | cl Commitlog 44 | buffer Buffer 45 | bytesPool *bytesPool 46 | } 47 | 48 | func (l *rawBlock) Write(id string, timestamp time.Time, value float64) error { 49 | // TODO: Don't allocate 50 | return l.WriteBatch([]layer.Write{{ID: id, Timestamp: timestamp, Value: value}}) 51 | } 52 | 53 | func (l *rawBlock) WriteBatch(writes []layer.Write) error { 54 | if err := l.buffer.Write(writes); err != nil { 55 | return err 56 | } 57 | 58 | b := l.bytesPool.Get() 59 | for _, w := range writes { 60 | b = encodeWrite(b, w) 61 | } 62 | err := l.cl.Write(b) 63 | l.bytesPool.Put(b) 64 | return err 65 | } 66 | 67 | func (l *rawBlock) Read(id string) (encoding.ReadableDecoder, error) { 68 | decoder, _, err := l.buffer.Read(id) 69 | return decoder, err 70 | } 71 | 72 | // TODO(rartoul): Add clean shutdown logic. 73 | func (l *rawBlock) startPersistLoop() { 74 | for { 75 | // Prevent excessive activity when there are no incoming writes. 76 | time.Sleep(persistLoopInterval) 77 | 78 | truncToken, err := l.cl.WaitForRotation() 79 | if err != nil { 80 | log.Printf("error waiting for commitlog rotation: %v", err) 81 | continue 82 | } 83 | start := time.Now() 84 | if err := l.buffer.Flush(); err != nil { 85 | log.Printf("error flushing buffer: %v", err) 86 | continue 87 | } 88 | fmt.Println("flush took: ", time.Now().Sub(start)) 89 | if err := l.cl.Truncate(truncToken); err != nil { 90 | log.Printf("error truncating commitlog: %v", err) 91 | continue 92 | } 93 | } 94 | } 95 | 96 | // TODO(rartoul): Bucketized would be more efficient 97 | type bytesPool struct { 98 | sync.Mutex 99 | pool [][]byte 100 | size int 101 | maxCapacity int 102 | defaultAllocSize int 103 | } 104 | 105 | func newBytesPool(size, maxCapacity, defaultAllocSize int) *bytesPool { 106 | return &bytesPool{ 107 | defaultAllocSize: defaultAllocSize, 108 | size: size, 109 | maxCapacity: maxCapacity, 110 | } 111 | } 112 | 113 | func (p *bytesPool) Get() []byte { 114 | p.Lock() 115 | var b []byte 116 | if len(p.pool) == 0 { 117 | b = make([]byte, 0, p.defaultAllocSize) 118 | } else { 119 | b = p.pool[len(p.pool)-1] 120 | p.pool = p.pool[:len(p.pool)-1] 121 | } 122 | p.Unlock() 123 | return b 124 | } 125 | 126 | func (p *bytesPool) Put(b []byte) { 127 | p.Lock() 128 | if len(p.pool) >= p.size || cap(b) > p.maxCapacity { 129 | p.Unlock() 130 | return 131 | } 132 | p.pool = append(p.pool, b[:0]) 133 | p.Unlock() 134 | } 135 | 136 | // TODO: This needs to be length prefixed and all that other nice stuff so it can actually be decoded 137 | func encodeWrite(b []byte, w layer.Write) []byte { 138 | b = append(b, w.ID...) 
139 | 	var buf [binary.MaxVarintLen64]byte
140 | 	// binary.PutVarint writes into the destination slice rather than appending to it,
141 | 	// so encode into a scratch buffer first and append only the bytes that were used.
142 | 	n := binary.PutVarint(buf[:], w.Timestamp.UnixNano())
143 | 	b = append(b, buf[:n]...)
144 | 	n = binary.PutUvarint(buf[:], math.Float64bits(w.Value))
145 | 	b = append(b, buf[:n]...)
146 | 	return b
147 | }
148 | 
--------------------------------------------------------------------------------
/src/layer/server/server.go:
--------------------------------------------------------------------------------
1 | package server
2 | 
3 | import (
4 | 	"context"
5 | 	"fmt"
6 | 
7 | 	pb "github.com/richardartoul/tsdb-layer/protos/.gen"
8 | )
9 | 
10 | var _ pb.TSDBLayerServer = &server{}
11 | 
12 | type server struct {
13 | }
14 | 
15 | func NewServer() pb.TSDBLayerServer {
16 | 	return &server{}
17 | }
18 | 
19 | func (s *server) WriteBatch(context.Context, *pb.WriteBatchRequest) (*pb.Empty, error) {
20 | 	fmt.Println("hmm1")
21 | 	return nil, nil
22 | }
23 | 
24 | func (s *server) ReadBatch(context.Context, *pb.ReadBatchRequest) (*pb.ReadBatchResponse, error) {
25 | 	fmt.Println("hmm2")
26 | 	return nil, nil
27 | }
28 | 
--------------------------------------------------------------------------------
/src/layer/types.go:
--------------------------------------------------------------------------------
1 | package layer
2 | 
3 | import (
4 | 	"time"
5 | 
6 | 	"github.com/richardartoul/tsdb-layer/src/encoding"
7 | )
8 | 
9 | type Write struct {
10 | 	ID        string
11 | 	Timestamp time.Time
12 | 	Value     float64
13 | }
14 | 
15 | type Layer interface {
16 | 	Write(id string, timestamp time.Time, value float64) error
17 | 	WriteBatch(writes []Write) error
18 | 	Read(id string) (encoding.ReadableDecoder, error)
19 | }
20 | 
--------------------------------------------------------------------------------
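A minimal end-to-end sketch of how the pieces above fit together, assuming the rawblock layer and a locally running FoundationDB cluster reachable via the default cluster file (the series ID and sample values below are arbitrary):

package main

import (
	"fmt"
	"log"
	"time"

	"github.com/richardartoul/tsdb-layer/src/layer/rawblock"
)

func main() {
	// NewLayer opens the default FDB cluster, starts the commitlog and the
	// background persist loop.
	l := rawblock.NewLayer()

	// Write two samples for a single series; timestamps must be increasing.
	if err := l.Write("some-series", time.Unix(0, 1), 1.0); err != nil {
		log.Fatal(err)
	}
	if err := l.Write("some-series", time.Unix(0, 2), 2.0); err != nil {
		log.Fatal(err)
	}

	// Read the series back; the returned decoder merges buffered (unflushed)
	// data with any chunks already persisted to FDB.
	dec, err := l.Read("some-series")
	if err != nil {
		log.Fatal(err)
	}
	for dec.Next() {
		t, v := dec.Current()
		fmt.Println(t, v)
	}
	if err := dec.Err(); err != nil {
		log.Fatal(err)
	}
}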