├── .github └── workflows │ ├── gh-release.yml │ └── test.yml ├── .gitignore ├── LICENSE ├── Makefile ├── NOTICE.txt ├── README.md ├── RELEASE ├── bstream.go ├── bstream_test.go ├── codecov.yml ├── disk_partition.go ├── disk_partition_test.go ├── disk_wal.go ├── disk_wal_test.go ├── doc.go ├── encoding.go ├── encoding_test.go ├── fake_encoder.go ├── fake_partition.go ├── go.mod ├── go.sum ├── internal ├── cgroup │ ├── cpu.go │ ├── cpu_test.go │ ├── mem.go │ ├── mem_test.go │ ├── testdata │ │ ├── cgroup │ │ │ ├── cpu.cfs_period_us │ │ │ ├── cpu.cfs_quota_us │ │ │ ├── memory.limit_in_bytes │ │ │ └── memory.stat │ │ ├── docker │ │ │ └── 74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db │ │ │ │ ├── cpu.cfs_period_us │ │ │ │ ├── cpu.cfs_quota_us │ │ │ │ ├── memory.limit_in_bytes │ │ │ │ └── memory.stat │ │ └── self │ │ │ └── cgroup │ ├── util.go │ └── util_test.go ├── encoding │ └── int.go ├── syscall │ ├── mmap.go │ ├── mmap_386.go │ ├── mmap_amd64.go │ ├── mmap_arm.go │ ├── mmap_unix.go │ └── mmap_windows.go └── timerpool │ ├── timerpool.go │ └── timerpool_test.go ├── label.go ├── label_test.go ├── logger.go ├── memory_partition.go ├── memory_partition_test.go ├── partition.go ├── partition_list.go ├── partition_list_test.go ├── storage.go ├── storage_benchmark_test.go ├── storage_examples_test.go ├── storage_test.go ├── testdata └── meta.json └── wal.go /.github/workflows/gh-release.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | paths: 6 | - 'RELEASE' 7 | pull_request: 8 | types: [opened, synchronize] 9 | branches: 10 | - main 11 | paths: 12 | - 'RELEASE' 13 | 14 | jobs: 15 | gh-release: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v1 19 | - uses: pipe-cd/actions-gh-release@v2.6.0 20 | with: 21 | release_file: 'RELEASE' 22 | token: ${{ secrets.GITHUB_TOKEN }} 23 | -------------------------------------------------------------------------------- 
/.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | on: 3 | push: 4 | branches: 5 | - main 6 | paths: 7 | - '**.go' 8 | pull_request: 9 | branches: 10 | - main 11 | paths: 12 | - '**.go' 13 | jobs: 14 | test: 15 | strategy: 16 | matrix: 17 | platform: [ubuntu-latest, macos-latest] 18 | runs-on: ${{ matrix.platform }} 19 | steps: 20 | - name: Install Go 21 | uses: actions/setup-go@v2 22 | with: 23 | go-version: '1.20' 24 | - name: Checkout code 25 | uses: actions/checkout@v2 26 | - name: Run tests 27 | run: make test 28 | - name: Upload coverage to Codecov 29 | uses: codecov/codecov-action@v2 30 | with: 31 | token: ${{ secrets.CODECOV_TOKEN }} 32 | file: ./coverage.txt 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Output about testing 9 | *.test 10 | *.out 11 | /coverage.txt 12 | /pprof 13 | 14 | # Dependency directories (remove the comment below to include it) 15 | # vendor/ 16 | 17 | # Editor 18 | /.idea 19 | 20 | .DS_Store 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | go test -race -v -coverpkg=./... -covermode=atomic -coverprofile=coverage.txt ./... 3 | 4 | test-bench: 5 | go test -benchtime=4s -benchmem -bench=. -cpuprofile=pprof/cpu.out -memprofile=pprof/mem.out . 
6 | 7 | pprof-mem: 8 | go tool pprof pprof/mem.out 9 | 10 | pprof-cpu: 11 | go tool pprof pprof/cpu.out 12 | 13 | dep: 14 | go mod tidy 15 | 16 | godoc: 17 | godoc -http=:6060 18 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | tstorage 2 | =============== 3 | 4 | This product contains a modified part of VictoriaMetrics, distributed by VictoriaMetrics, Inc: 5 | 6 | * License: https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/LICENSE (Apache License v2.0) 7 | * Homepage: https://github.com/VictoriaMetrics/VictoriaMetrics 8 | 9 | This product contains a modified part of catena, distributed by Cistern: 10 | 11 | * License: https://github.com/Cistern/catena/blob/master/LICENSE 12 | * Homepage: https://github.com/Cistern/catena 13 | 14 | This product contains a modified part of go-tsz, distributed by dgryski: 15 | 16 | * License: https://github.com/dgryski/go-tsz/blob/master/LICENSE 17 | * Homepage: https://github.com/dgryski/go-tsz 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tstorage [![Go Reference](https://pkg.go.dev/badge/mod/github.com/nakabonne/tstorage.svg)](https://pkg.go.dev/mod/github.com/nakabonne/tstorage) 2 | 3 | `tstorage` is a lightweight local on-disk storage engine for time-series data with a straightforward API. 4 | Especially ingestion is massively optimized as it provides goroutine safe capabilities of write into and read from TSDB that partitions data points by time. 5 | 6 | ## Motivation 7 | I'm working on a couple of tools that handle a tremendous amount of time-series data, such as [Ali](https://github.com/nakabonne/ali) and [Gosivy](https://github.com/nakabonne/gosivy). 
8 | Especially Ali, I had been facing a problem of increasing heap consumption over time as it's a load testing tool that aims to perform real-time analysis. 9 | I little poked around a fast TSDB library that offers simple APIs but eventually nothing works as well as I'd like, that's why I settled on writing this package myself. 10 | 11 | To see how much `tstorage` has helped improve Ali's performance, see the release notes [here](https://github.com/nakabonne/ali/releases/tag/v0.7.0). 12 | 13 | ## Usage 14 | Currently, `tstorage` requires Go version 1.16 or greater 15 | 16 | By default, `tstorage.Storage` works as an in-memory database. 17 | The below example illustrates how to insert a row into the memory and immediately select it. 18 | 19 | ```go 20 | package main 21 | 22 | import ( 23 | "fmt" 24 | 25 | "github.com/nakabonne/tstorage" 26 | ) 27 | 28 | func main() { 29 | storage, _ := tstorage.NewStorage( 30 | tstorage.WithTimestampPrecision(tstorage.Seconds), 31 | ) 32 | defer storage.Close() 33 | 34 | _ = storage.InsertRows([]tstorage.Row{ 35 | { 36 | Metric: "metric1", 37 | DataPoint: tstorage.DataPoint{Timestamp: 1600000000, Value: 0.1}, 38 | }, 39 | }) 40 | points, _ := storage.Select("metric1", nil, 1600000000, 1600000001) 41 | for _, p := range points { 42 | fmt.Printf("timestamp: %v, value: %v\n", p.Timestamp, p.Value) 43 | // => timestamp: 1600000000, value: 0.1 44 | } 45 | } 46 | ``` 47 | 48 | ### Using disk 49 | To make time-series data persistent on disk, specify the path to directory that stores time-series data through [WithDataPath](https://pkg.go.dev/github.com/nakabonne/tstorage#WithDataPath) option. 50 | 51 | ```go 52 | storage, _ := tstorage.NewStorage( 53 | tstorage.WithDataPath("./data"), 54 | ) 55 | defer storage.Close() 56 | ``` 57 | 58 | ### Labeled metrics 59 | In tstorage, you can identify a metric with combination of metric name and optional labels. 60 | Here is an example of insertion a labeled metric to the disk. 
61 | 62 | ```go 63 | metric := "mem_alloc_bytes" 64 | labels := []tstorage.Label{ 65 | {Name: "host", Value: "host-1"}, 66 | } 67 | 68 | _ = storage.InsertRows([]tstorage.Row{ 69 | { 70 | Metric: metric, 71 | Labels: labels, 72 | DataPoint: tstorage.DataPoint{Timestamp: 1600000000, Value: 0.1}, 73 | }, 74 | }) 75 | points, _ := storage.Select(metric, labels, 1600000000, 1600000001) 76 | ``` 77 | 78 | For more examples see [the documentation](https://pkg.go.dev/github.com/nakabonne/tstorage#pkg-examples). 79 | 80 | ## Benchmarks 81 | Benchmark tests were made using Intel(R) Core(TM) i7-8559U CPU @ 2.70GHz with 16GB of RAM on macOS 10.15.7 82 | 83 | ``` 84 | $ go version 85 | go version go1.16.2 darwin/amd64 86 | 87 | $ go test -benchtime=4s -benchmem -bench=. . 88 | goos: darwin 89 | goarch: amd64 90 | pkg: github.com/nakabonne/tstorage 91 | cpu: Intel(R) Core(TM) i7-8559U CPU @ 2.70GHz 92 | BenchmarkStorage_InsertRows-8 14135685 305.9 ns/op 174 B/op 2 allocs/op 93 | BenchmarkStorage_SelectAmongThousandPoints-8 20548806 222.4 ns/op 56 B/op 2 allocs/op 94 | BenchmarkStorage_SelectAmongMillionPoints-8 16185709 292.2 ns/op 56 B/op 1 allocs/op 95 | PASS 96 | ok github.com/nakabonne/tstorage 16.501s 97 | ``` 98 | 99 | ## Internal 100 | Time-series database has specific characteristics in its workload. 101 | In terms of write operations, a time-series database has to ingest a tremendous amount of data points ordered by time. 102 | Time-series data is immutable, mostly an append-only workload with delete operations performed in batches on less recent data. 103 | In terms of read operations, in most cases, we want to retrieve multiple data points by specifying its time range, also, most recent first: query the recent data in real-time. 104 | Besides, time-series data is already indexed in time order. 
105 | 106 | Based on these characteristics, `tstorage` adopts a linear data model structure that partitions data points by time, totally different from the B-trees or LSM trees based storage engines. 107 | Each partition acts as a fully independent database containing all data points for its time range. 108 | 109 | 110 | ``` 111 | │ │ 112 | Read Write 113 | │ │ 114 | │ V 115 | │ ┌───────────────────┐ max: 1600010800 116 | ├─────> Memory Partition 117 | │ └───────────────────┘ min: 1600007201 118 | │ 119 | │ ┌───────────────────┐ max: 1600007200 120 | ├─────> Memory Partition 121 | │ └───────────────────┘ min: 1600003601 122 | │ 123 | │ ┌───────────────────┐ max: 1600003600 124 | └─────> Disk Partition 125 | └───────────────────┘ min: 1600000000 126 | ``` 127 | 128 | Key benefits: 129 | - We can easily ignore all data outside of the partition time range when querying data points. 130 | - Most read operations work fast because recent data get cached in heap. 131 | - When a partition gets full, we can persist the data from our in-memory database by sequentially writing just a handful of larger files. We avoid any write-amplification and serve SSDs and HDDs equally well. 132 | 133 | ### Memory partition 134 | The memory partition is writable and stores data points in heap. The head partition is always memory partition. Its next one is also memory partition to accept out-of-order data points. 135 | It stores data points in an ordered Slice, which offers excellent cache hit ratio compared to linked lists unless it gets updated way too often (like delete, add elements at random locations). 136 | 137 | All incoming data is written to a write-ahead log (WAL) right before inserting into a memory partition to prevent data loss. 138 | 139 | ### Disk partition 140 | The old memory partitions get compacted and persisted to the directory prefixed with `p-`, under the directory specified with the [WithDataPath](https://pkg.go.dev/github.com/nakabonne/tstorage#WithDataPath) option. 
141 | Here is the macro layout of disk partitions: 142 | 143 | ``` 144 | $ tree ./data 145 | ./data 146 | ├── p-1600000001-1600003600 147 | │   ├── data 148 | │   └── meta.json 149 | ├── p-1600003601-1600007200 150 | │   ├── data 151 | │   └── meta.json 152 | └── p-1600007201-1600010800 153 | ├── data 154 | └── meta.json 155 | ``` 156 | 157 | As you can see each partition holds two files: `meta.json` and `data`. 158 | The `data` is compressed, read-only and is memory-mapped with [mmap(2)](https://en.wikipedia.org/wiki/Mmap) that maps a kernel address space to a user address space. 159 | Therefore, what it has to store in heap is only partition's metadata. Just looking at `meta.json` gives us a good picture of what it stores: 160 | 161 | ```json 162 | $ cat ./data/p-1600000001-1600003600/meta.json 163 | { 164 | "minTimestamp": 1600000001, 165 | "maxTimestamp": 1600003600, 166 | "numDataPoints": 7200, 167 | "metrics": { 168 | "metric-1": { 169 | "name": "metric-1", 170 | "offset": 0, 171 | "minTimestamp": 1600000001, 172 | "maxTimestamp": 1600003600, 173 | "numDataPoints": 3600 174 | }, 175 | "metric-2": { 176 | "name": "metric-2", 177 | "offset": 36014, 178 | "minTimestamp": 1600000001, 179 | "maxTimestamp": 1600003600, 180 | "numDataPoints": 3600 181 | } 182 | } 183 | } 184 | ``` 185 | 186 | Each metric has its own file offset of the beginning. 187 | Data point slice for each metric is compressed separately, so all we have to do when reading is to seek, and read the points off. 188 | 189 | ### Out-of-order data points 190 | What data points get out-of-order in real-world applications is not uncommon because of network latency or clock synchronization issues; `tstorage` basically doesn't discard them. 191 | If out-of-order data points are within the range of the head memory partition, they get temporarily buffered and merged at flush time. 192 | Sometimes we should handle data points that cross a partition boundary. 
That is the reason why `tstorage` keeps more than one partition writable. 193 | 194 | ## More 195 | Want to know more details on tstorage internal? If so see the blog post: [Write a time-series database engine from scratch](https://nakabonne.dev/posts/write-tsdb-from-scratch). 196 | 197 | ## Acknowledgements 198 | This package is implemented based on tons of existing ideas. What I especially got inspired by are: 199 | - https://misfra.me/state-of-the-state-part-iii 200 | - https://fabxc.org/tsdb 201 | - https://questdb.io/blog/2020/11/26/why-timeseries-data 202 | - https://akumuli.org/akumuli/2017/04/29/nbplustree 203 | - https://github.com/VictoriaMetrics/VictoriaMetrics 204 | 205 | A big "thank you!" goes out to all of them. 206 | -------------------------------------------------------------------------------- /RELEASE: -------------------------------------------------------------------------------- 1 | tag: v0.3.6 2 | -------------------------------------------------------------------------------- /bstream.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2015,2016 Damian Gryski 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are met: 6 | // 7 | // * Redistributions of source code must retain the above copyright notice, 8 | // this list of conditions and the following disclaimer. 9 | // 10 | // * Redistributions in binary form must reproduce the above copyright notice, 11 | // this list of conditions and the following disclaimer in the documentation 12 | // and/or other materials provided with the distribution. 
13 | // 14 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | package tstorage 26 | 27 | import ( 28 | "encoding/binary" 29 | "io" 30 | ) 31 | 32 | // bstream is a stream of bits. 33 | type bstream struct { 34 | stream []byte // the data stream 35 | count uint8 // how many bits are valid in current byte 36 | } 37 | 38 | func (b *bstream) bytes() []byte { 39 | return b.stream 40 | } 41 | 42 | // reset resets the buffer to be empty, 43 | // but it retains the underlying storage for use by future writes. 
44 | func (b *bstream) reset() { 45 | b.stream = b.stream[:0] 46 | b.count = 0 47 | } 48 | 49 | type bit bool 50 | 51 | const ( 52 | zero bit = false 53 | one bit = true 54 | ) 55 | 56 | func (b *bstream) writeBit(bit bit) { 57 | if b.count == 0 { 58 | b.stream = append(b.stream, 0) 59 | b.count = 8 60 | } 61 | 62 | i := len(b.stream) - 1 63 | 64 | if bit { 65 | b.stream[i] |= 1 << (b.count - 1) 66 | } 67 | 68 | b.count-- 69 | } 70 | 71 | func (b *bstream) writeByte(byt byte) { 72 | if b.count == 0 { 73 | b.stream = append(b.stream, 0) 74 | b.count = 8 75 | } 76 | 77 | i := len(b.stream) - 1 78 | 79 | // fill up b.b with b.count bits from byt 80 | b.stream[i] |= byt >> (8 - b.count) 81 | 82 | b.stream = append(b.stream, 0) 83 | i++ 84 | b.stream[i] = byt << b.count 85 | } 86 | 87 | func (b *bstream) writeBits(u uint64, nbits int) { 88 | u <<= (64 - uint(nbits)) 89 | for nbits >= 8 { 90 | byt := byte(u >> 56) 91 | b.writeByte(byt) 92 | u <<= 8 93 | nbits -= 8 94 | } 95 | 96 | for nbits > 0 { 97 | b.writeBit((u >> 63) == 1) 98 | u <<= 1 99 | nbits-- 100 | } 101 | } 102 | 103 | type bstreamReader struct { 104 | stream []byte 105 | streamOffset int // The offset from which read the next byte from the stream. 106 | 107 | buffer uint64 // The current buffer, filled from the stream, containing up to 8 bytes from which read bits. 108 | valid uint8 // The number of bits valid to read (from left) in the current buffer. 109 | } 110 | 111 | func newBReader(b []byte) bstreamReader { 112 | return bstreamReader{ 113 | stream: b, 114 | } 115 | } 116 | 117 | func (b *bstreamReader) readBit() (bit, error) { 118 | if b.valid == 0 { 119 | if !b.loadNextBuffer(1) { 120 | return false, io.EOF 121 | } 122 | } 123 | 124 | return b.readBitFast() 125 | } 126 | 127 | // readBitFast is like readBit but can return io.EOF if the internal buffer is empty. 128 | // If it returns io.EOF, the caller should retry reading bits calling readBit(). 
129 | // This function must be kept small and a leaf in order to help the compiler inlining it 130 | // and further improve performances. 131 | func (b *bstreamReader) readBitFast() (bit, error) { 132 | if b.valid == 0 { 133 | return false, io.EOF 134 | } 135 | 136 | b.valid-- 137 | bitmask := uint64(1) << b.valid 138 | return (b.buffer & bitmask) != 0, nil 139 | } 140 | 141 | func (b *bstreamReader) readBits(nbits uint8) (uint64, error) { 142 | if b.valid == 0 { 143 | if !b.loadNextBuffer(nbits) { 144 | return 0, io.EOF 145 | } 146 | } 147 | 148 | if nbits <= b.valid { 149 | return b.readBitsFast(nbits) 150 | } 151 | 152 | // We have to read all remaining valid bits from the current buffer and a part from the next one. 153 | bitmask := (uint64(1) << b.valid) - 1 154 | nbits -= b.valid 155 | v := (b.buffer & bitmask) << nbits 156 | b.valid = 0 157 | 158 | if !b.loadNextBuffer(nbits) { 159 | return 0, io.EOF 160 | } 161 | 162 | bitmask = (uint64(1) << nbits) - 1 163 | v = v | ((b.buffer >> (b.valid - nbits)) & bitmask) 164 | b.valid -= nbits 165 | 166 | return v, nil 167 | } 168 | 169 | // readBitsFast is like readBits but can return io.EOF if the internal buffer is empty. 170 | // If it returns io.EOF, the caller should retry reading bits calling readBits(). 171 | // This function must be kept small and a leaf in order to help the compiler inlining it 172 | // and further improve performances. 173 | func (b *bstreamReader) readBitsFast(nbits uint8) (uint64, error) { 174 | if nbits > b.valid { 175 | return 0, io.EOF 176 | } 177 | 178 | bitmask := (uint64(1) << nbits) - 1 179 | b.valid -= nbits 180 | 181 | return (b.buffer >> b.valid) & bitmask, nil 182 | } 183 | 184 | func (b *bstreamReader) ReadByte() (byte, error) { 185 | v, err := b.readBits(8) 186 | if err != nil { 187 | return 0, err 188 | } 189 | return byte(v), nil 190 | } 191 | 192 | // loadNextBuffer loads the next bytes from the stream into the internal buffer. 
193 | // The input nbits is the minimum number of bits that must be read, but the implementation 194 | // can read more (if possible) to improve performances. 195 | func (b *bstreamReader) loadNextBuffer(nbits uint8) bool { 196 | if b.streamOffset >= len(b.stream) { 197 | return false 198 | } 199 | 200 | // Handle the case there are more then 8 bytes in the buffer (most common case) 201 | // in a optimized way. It's guaranteed that this branch will never read from the 202 | // very last byte of the stream (which suffers race conditions due to concurrent 203 | // writes). 204 | if b.streamOffset+8 < len(b.stream) { 205 | b.buffer = binary.BigEndian.Uint64(b.stream[b.streamOffset:]) 206 | b.streamOffset += 8 207 | b.valid = 64 208 | return true 209 | } 210 | 211 | // We're here if the are 8 or less bytes left in the stream. Since this reader needs 212 | // to handle race conditions with concurrent writes happening on the very last byte 213 | // we make sure to never over more than the minimum requested bits (rounded up to 214 | // the next byte). The following code is slower but called less frequently. 215 | nbytes := int((nbits / 8) + 1) 216 | if b.streamOffset+nbytes > len(b.stream) { 217 | nbytes = len(b.stream) - b.streamOffset 218 | } 219 | 220 | buffer := uint64(0) 221 | for i := 0; i < nbytes; i++ { 222 | buffer = buffer | (uint64(b.stream[b.streamOffset+i]) << uint(8*(nbytes-i-1))) 223 | } 224 | 225 | b.buffer = buffer 226 | b.streamOffset += nbytes 227 | b.valid = uint8(nbytes * 8) 228 | 229 | return true 230 | } 231 | -------------------------------------------------------------------------------- /bstream_test.go: -------------------------------------------------------------------------------- 1 | package tstorage 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/require" 7 | ) 8 | 9 | func TestBstreamReader(t *testing.T) { 10 | // Write to the bit stream. 
11 | w := bstream{} 12 | for _, bit := range []bit{true, false} { 13 | w.writeBit(bit) 14 | } 15 | for nbits := 1; nbits <= 64; nbits++ { 16 | w.writeBits(uint64(nbits), nbits) 17 | } 18 | for v := 1; v < 10000; v += 123 { 19 | w.writeBits(uint64(v), 29) 20 | } 21 | 22 | // Read back. 23 | r := newBReader(w.bytes()) 24 | for _, bit := range []bit{true, false} { 25 | v, err := r.readBitFast() 26 | if err != nil { 27 | v, err = r.readBit() 28 | } 29 | require.NoError(t, err) 30 | require.Equal(t, bit, v) 31 | } 32 | for nbits := uint8(1); nbits <= 64; nbits++ { 33 | v, err := r.readBitsFast(nbits) 34 | if err != nil { 35 | v, err = r.readBits(nbits) 36 | } 37 | require.NoError(t, err) 38 | require.Equal(t, uint64(nbits), v, "nbits=%d", nbits) 39 | } 40 | for v := 1; v < 10000; v += 123 { 41 | actual, err := r.readBitsFast(29) 42 | if err != nil { 43 | actual, err = r.readBits(29) 44 | } 45 | require.NoError(t, err) 46 | require.Equal(t, uint64(v), actual, "v=%d", v) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | range: 30..90 3 | round: up 4 | status: 5 | project: 6 | default: 7 | target: 70% 8 | threshold: 20% 9 | if_not_found: success 10 | if_ci_failed: success 11 | patch: 12 | default: 13 | target: 0% 14 | threshold: 20% 15 | if_no_uploads: success 16 | if_not_found: success 17 | if_ci_failed: success 18 | -------------------------------------------------------------------------------- /disk_partition.go: -------------------------------------------------------------------------------- 1 | package tstorage 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "errors" 7 | "fmt" 8 | "io" 9 | "os" 10 | "path/filepath" 11 | "time" 12 | 13 | "github.com/nakabonne/tstorage/internal/syscall" 14 | ) 15 | 16 | const ( 17 | dataFileName = "data" 18 | metaFileName = "meta.json" 19 | ) 20 | 21 | var ( 22 | 
// Note that CreatedAt is always timestamped by tstorage, while the Min/Max timestamps are likely to be set by another process.
59 | func openDiskPartition(dirPath string, retention time.Duration) (partition, error) { 60 | if dirPath == "" { 61 | return nil, fmt.Errorf("dir path is required") 62 | } 63 | metaFilePath := filepath.Join(dirPath, metaFileName) 64 | _, err := os.Stat(metaFilePath) 65 | if errors.Is(err, os.ErrNotExist) { 66 | return nil, errInvalidPartition 67 | } 68 | 69 | // Map data to the memory 70 | dataPath := filepath.Join(dirPath, dataFileName) 71 | f, err := os.Open(dataPath) 72 | if err != nil { 73 | return nil, fmt.Errorf("failed to read data file: %w", err) 74 | } 75 | defer f.Close() 76 | info, err := f.Stat() 77 | if err != nil { 78 | return nil, fmt.Errorf("failed to fetch file info: %w", err) 79 | } 80 | if info.Size() == 0 { 81 | return nil, ErrNoDataPoints 82 | } 83 | mapped, err := syscall.Mmap(int(f.Fd()), int(info.Size())) 84 | if err != nil { 85 | return nil, fmt.Errorf("failed to perform mmap: %w", err) 86 | } 87 | 88 | // Read metadata to the heap 89 | m := meta{} 90 | mf, err := os.Open(metaFilePath) 91 | if err != nil { 92 | return nil, fmt.Errorf("failed to read metadata: %w", err) 93 | } 94 | defer mf.Close() 95 | decoder := json.NewDecoder(mf) 96 | if err := decoder.Decode(&m); err != nil { 97 | return nil, fmt.Errorf("failed to decode metadata: %w", err) 98 | } 99 | return &diskPartition{ 100 | dirPath: dirPath, 101 | meta: m, 102 | f: f, 103 | mappedFile: mapped, 104 | retention: retention, 105 | }, nil 106 | } 107 | 108 | func (d *diskPartition) insertRows(_ []Row) ([]Row, error) { 109 | return nil, fmt.Errorf("can't insert rows into disk partition") 110 | } 111 | 112 | func (d *diskPartition) selectDataPoints(metric string, labels []Label, start, end int64) ([]*DataPoint, error) { 113 | if d.expired() { 114 | return nil, fmt.Errorf("this partition is expired: %w", ErrNoDataPoints) 115 | } 116 | name := marshalMetricName(metric, labels) 117 | mt, ok := d.meta.Metrics[name] 118 | if !ok { 119 | return nil, ErrNoDataPoints 120 | } 121 | r := 
// TODO: Divide into fixed-length chunks when flushing, and index them.
161 | func (d *diskPartition) active() bool { 162 | return false 163 | } 164 | 165 | func (d *diskPartition) clean() error { 166 | if err := os.RemoveAll(d.dirPath); err != nil { 167 | return fmt.Errorf("failed to remove all files inside the partition (%d~%d): %w", d.minTimestamp(), d.maxTimestamp(), err) 168 | } 169 | 170 | return nil 171 | } 172 | 173 | func (d *diskPartition) expired() bool { 174 | diff := time.Since(d.meta.CreatedAt) 175 | if diff > d.retention { 176 | return true 177 | } 178 | return false 179 | } 180 | -------------------------------------------------------------------------------- /disk_partition_test.go: -------------------------------------------------------------------------------- 1 | package tstorage 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func TestOpenDiskPartition(t *testing.T) { 11 | tests := []struct { 12 | name string 13 | dirPath string 14 | retention time.Duration 15 | want partition 16 | wantErr bool 17 | }{ 18 | { 19 | name: "empty dir name given", 20 | dirPath: "", 21 | retention: 24 * time.Hour, 22 | wantErr: true, 23 | }, 24 | { 25 | name: "non-existent dir given", 26 | dirPath: "./non-existent-dir", 27 | retention: 24 * time.Hour, 28 | wantErr: true, 29 | }, 30 | } 31 | for _, tt := range tests { 32 | t.Run(tt.name, func(t *testing.T) { 33 | got, err := openDiskPartition(tt.dirPath, tt.retention) 34 | assert.Equal(t, tt.wantErr, err != nil) 35 | assert.Equal(t, tt.want, got) 36 | }) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /disk_wal.go: -------------------------------------------------------------------------------- 1 | package tstorage 2 | 3 | import ( 4 | "bufio" 5 | "encoding/binary" 6 | "errors" 7 | "fmt" 8 | "io" 9 | "io/fs" 10 | "math" 11 | "os" 12 | "path/filepath" 13 | "strconv" 14 | "sync" 15 | "sync/atomic" 16 | ) 17 | 18 | // diskWAL contains multiple segment files. 
One segment is responsible for one partition. 19 | // They can be easily sorted because they are named using the created timestamp. 20 | // Macro layout is like: 21 | /* 22 | .wal/ 23 | ├── 0 24 | └── 1 25 | */ 26 | type diskWAL struct { 27 | dir string 28 | bufferedSize int 29 | // Buffered-writer to the active segment 30 | w *bufio.Writer 31 | // File descriptor to the active segment 32 | fd *os.File 33 | index uint32 34 | mu sync.Mutex 35 | } 36 | 37 | func newDiskWAL(dir string, bufferedSize int) (wal, error) { 38 | if err := os.MkdirAll(dir, fs.ModePerm); err != nil { 39 | return nil, fmt.Errorf("failed to make WAL dir: %w", err) 40 | } 41 | w := &diskWAL{ 42 | dir: dir, 43 | bufferedSize: bufferedSize, 44 | } 45 | f, err := w.createSegmentFile(dir) 46 | if err != nil { 47 | return nil, err 48 | } 49 | w.fd = f 50 | w.w = bufio.NewWriterSize(f, bufferedSize) 51 | 52 | return w, nil 53 | } 54 | 55 | // append appends the given entry to the end of a file via the file descriptor it has. 
// punctuate sets a boundary and creates a new segment.
// removeOldest removes only the oldest segment.
173 | func (w *diskWAL) createSegmentFile(dir string) (*os.File, error) { 174 | name := strconv.Itoa(int(atomic.LoadUint32(&w.index))) 175 | f, err := os.OpenFile(filepath.Join(dir, name), os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) 176 | if err != nil { 177 | return nil, fmt.Errorf("failed to create segment file: %w", err) 178 | } 179 | atomic.AddUint32(&w.index, 1) 180 | return f, nil 181 | } 182 | 183 | type walRecord struct { 184 | op walOperation 185 | row Row 186 | } 187 | 188 | type diskWALReader struct { 189 | dir string 190 | files []os.DirEntry 191 | rowsToInsert []Row 192 | } 193 | 194 | func newDiskWALReader(dir string) (*diskWALReader, error) { 195 | files, err := os.ReadDir(dir) 196 | if err != nil { 197 | return nil, fmt.Errorf("failed to read the WAL dir: %w", err) 198 | } 199 | 200 | return &diskWALReader{ 201 | dir: dir, 202 | files: files, 203 | rowsToInsert: make([]Row, 0), 204 | }, nil 205 | } 206 | 207 | // readAll reads all segment files and caches the result for each operation. 208 | func (f *diskWALReader) readAll() error { 209 | for _, file := range f.files { 210 | if file.IsDir() { 211 | return fmt.Errorf("unexpected directory found under the WAL directory: %s", file.Name()) 212 | } 213 | fd, err := os.Open(filepath.Join(f.dir, file.Name())) 214 | if err != nil { 215 | return fmt.Errorf("failed to open WAL segment file: %w", err) 216 | } 217 | segment := &segment{ 218 | file: fd, 219 | r: bufio.NewReader(fd), 220 | } 221 | for segment.next() { 222 | rec := segment.record() 223 | switch rec.op { 224 | case operationInsert: 225 | f.rowsToInsert = append(f.rowsToInsert, rec.row) 226 | } 227 | } 228 | if err := segment.close(); err != nil { 229 | return err 230 | } 231 | 232 | err = segment.error() 233 | if errors.Is(err, io.ErrUnexpectedEOF) || errors.Is(err, io.EOF) { 234 | // It is not unusual for a line to be invalid, as it may well terminate in the middle of writing to the WAL. 
// error returns the error encountered while reading, if any.
307 | func (f *segment) error() error { 308 | return f.err 309 | } 310 | 311 | func (f *segment) record() *walRecord { 312 | return &f.current 313 | } 314 | 315 | func (f *segment) close() error { 316 | return f.file.Close() 317 | } 318 | -------------------------------------------------------------------------------- /disk_wal_test.go: -------------------------------------------------------------------------------- 1 | package tstorage 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | "strconv" 7 | "testing" 8 | 9 | "github.com/stretchr/testify/assert" 10 | "github.com/stretchr/testify/require" 11 | ) 12 | 13 | func Test_diskWAL_append_read(t *testing.T) { 14 | var ( 15 | op = operationInsert 16 | rows = []Row{ 17 | {Metric: "metric-1", DataPoint: DataPoint{Value: 0.1, Timestamp: 1600000000}}, 18 | {Metric: "metric-2", DataPoint: DataPoint{Value: 0.2, Timestamp: 1600000001}}, 19 | {Metric: "metric-1", DataPoint: DataPoint{Value: 0.1, Timestamp: 1600000001}}, 20 | {Metric: "metric-2", DataPoint: DataPoint{Value: 0.2, Timestamp: 1600000003}}, 21 | } 22 | ) 23 | // Append rows into wal 24 | tmpDir, err := os.MkdirTemp("", "tstorage-test") 25 | defer os.RemoveAll(tmpDir) 26 | require.NoError(t, err) 27 | path := filepath.Join(tmpDir, "wal") 28 | 29 | wal, err := newDiskWAL(path, 4096) 30 | require.NoError(t, err) 31 | 32 | // Append into two segments 33 | err = wal.append(op, rows[:2]) 34 | require.NoError(t, err) 35 | 36 | err = wal.punctuate() 37 | require.NoError(t, err) 38 | 39 | err = wal.append(op, rows[2:]) 40 | require.NoError(t, err) 41 | 42 | err = wal.flush() 43 | require.NoError(t, err) 44 | 45 | // Recover rows. 
46 | reader, err := newDiskWALReader(path) 47 | require.NoError(t, err) 48 | err = reader.readAll() 49 | require.NoError(t, err) 50 | got := reader.rowsToInsert 51 | assert.Equal(t, rows, got) 52 | } 53 | 54 | func Test_diskWAL_removeOldest(t *testing.T) { 55 | tmpDir, err := os.MkdirTemp("", "tstorage-test") 56 | require.NoError(t, err) 57 | for i := 0; i < 3; i++ { 58 | err := os.Mkdir(filepath.Join(tmpDir, strconv.Itoa(i)), os.ModePerm) 59 | require.NoError(t, err) 60 | } 61 | w := &diskWAL{ 62 | dir: tmpDir, 63 | } 64 | err = w.removeOldest() 65 | require.NoError(t, err) 66 | files, err := os.ReadDir(w.dir) 67 | require.NoError(t, err) 68 | want := []string{"1", "2"} 69 | got := []string{} 70 | for _, f := range files { 71 | got = append(got, f.Name()) 72 | } 73 | assert.Equal(t, want, got) 74 | } 75 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | // Package tstorage provides goroutine safe capabilities of insertion into and retrieval 2 | // from the time-series storage. 3 | package tstorage 4 | -------------------------------------------------------------------------------- /encoding.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2015,2016 Damian Gryski 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are met: 6 | // 7 | // * Redistributions of source code must retain the above copyright notice, 8 | // this list of conditions and the following disclaimer. 9 | // 10 | // * Redistributions in binary form must reproduce the above copyright notice, 11 | // this list of conditions and the following disclaimer in the documentation 12 | // and/or other materials provided with the distribution. 
13 | // 14 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | package tstorage 26 | 27 | import ( 28 | "encoding/binary" 29 | "fmt" 30 | "io" 31 | "math" 32 | "math/bits" 33 | ) 34 | 35 | type seriesEncoder interface { 36 | encodePoint(point *DataPoint) error 37 | flush() error 38 | } 39 | 40 | func newSeriesEncoder(w io.Writer) seriesEncoder { 41 | return &gorillaEncoder{ 42 | w: w, 43 | buf: &bstream{stream: make([]byte, 0)}, 44 | } 45 | } 46 | 47 | // gorillaEncoder implements the Gorilla's time-series data compression. 
// encodePoint is not goroutine safe. It's the caller's responsibility to lock it.
105 | tDelta = uint64(point.Timestamp - e.t) 106 | deltaOfDelta := int64(tDelta - e.tDelta) 107 | switch { 108 | case deltaOfDelta == 0: 109 | e.buf.writeBit(zero) 110 | case -63 <= deltaOfDelta && deltaOfDelta <= 64: 111 | e.buf.writeBits(0x02, 2) // '10' 112 | e.buf.writeBits(uint64(deltaOfDelta), 7) 113 | case -255 <= deltaOfDelta && deltaOfDelta <= 256: 114 | e.buf.writeBits(0x06, 3) // '110' 115 | e.buf.writeBits(uint64(deltaOfDelta), 9) 116 | case -2047 <= deltaOfDelta && deltaOfDelta <= 2048: 117 | e.buf.writeBits(0x0e, 4) // '1110' 118 | e.buf.writeBits(uint64(deltaOfDelta), 12) 119 | default: 120 | e.buf.writeBits(0x0f, 4) // '1111' 121 | e.buf.writeBits(uint64(deltaOfDelta), 64) 122 | } 123 | // Write value delta. 124 | e.writeVDelta(point.Value) 125 | } 126 | 127 | e.t = point.Timestamp 128 | e.v = point.Value 129 | e.tDelta = tDelta 130 | return nil 131 | } 132 | 133 | // flush writes the buffered-bytes into the backend io.Writer 134 | // and resets everything used for computation. 135 | func (e *gorillaEncoder) flush() error { 136 | // TODO: Compress with ZStandard 137 | _, err := e.w.Write(e.buf.bytes()) 138 | if err != nil { 139 | return fmt.Errorf("failed to flush buffered bytes: %w", err) 140 | } 141 | 142 | e.buf.reset() 143 | e.t0 = 0 144 | e.t1 = 0 145 | e.t = 0 146 | e.tDelta = 0 147 | e.v = 0 148 | e.v = 0 149 | e.leading = 0 150 | e.trailing = 0 151 | 152 | return nil 153 | } 154 | 155 | func (e *gorillaEncoder) writeVDelta(v float64) { 156 | vDelta := math.Float64bits(v) ^ math.Float64bits(e.v) 157 | 158 | if vDelta == 0 { 159 | e.buf.writeBit(zero) 160 | return 161 | } 162 | e.buf.writeBit(one) 163 | 164 | leading := uint8(bits.LeadingZeros64(vDelta)) 165 | trailing := uint8(bits.TrailingZeros64(vDelta)) 166 | 167 | // Clamp number of leading zeros to avoid overflow when encoding. 
168 | if leading >= 32 { 169 | leading = 31 170 | } 171 | 172 | if e.leading != 0xff && leading >= e.leading && trailing >= e.trailing { 173 | e.buf.writeBit(zero) 174 | e.buf.writeBits(vDelta>>e.trailing, 64-int(e.leading)-int(e.trailing)) 175 | } else { 176 | e.leading, e.trailing = leading, trailing 177 | 178 | e.buf.writeBit(one) 179 | e.buf.writeBits(uint64(leading), 5) 180 | 181 | // Note that if leading == trailing == 0, then sigbits == 64. But that value doesn't actually fit into the 6 bits we have. 182 | // Luckily, we never need to encode 0 significant bits, since that would put us in the other case (vdelta == 0). 183 | // So instead we write out a 0 and adjust it back to 64 on unpacking. 184 | sigbits := 64 - leading - trailing 185 | e.buf.writeBits(uint64(sigbits), 6) 186 | e.buf.writeBits(vDelta>>trailing, int(sigbits)) 187 | } 188 | } 189 | 190 | type seriesDecoder interface { 191 | decodePoint(dst *DataPoint) error 192 | } 193 | 194 | // newSeriesDecoder decompress data from the given Reader, then holds the decompressed data 195 | func newSeriesDecoder(r io.Reader) (seriesDecoder, error) { 196 | // TODO: Stop copying entire bytes, then make it possible to to make bstreamReader from io.Reader 197 | b, err := io.ReadAll(r) 198 | if err != nil { 199 | return nil, fmt.Errorf("failed to read all bytes: %w", err) 200 | } 201 | return &gorillaDecoder{ 202 | br: newBReader(b), 203 | }, nil 204 | } 205 | 206 | type gorillaDecoder struct { 207 | br bstreamReader 208 | numRead uint16 209 | 210 | // timestamp of the Nth data point 211 | t int64 212 | tDelta uint64 213 | 214 | // value of the Nth data point 215 | v float64 216 | leading uint8 217 | trailing uint8 218 | } 219 | 220 | func (d *gorillaDecoder) decodePoint(dst *DataPoint) error { 221 | if d.numRead == 0 { 222 | t, err := binary.ReadVarint(&d.br) 223 | if err != nil { 224 | return fmt.Errorf("failed to read Timestamp of T0: %w", err) 225 | } 226 | v, err := d.br.readBits(64) 227 | if err != nil { 228 
// Do not use the fast path here because it's very unlikely to succeed.
// Sign-extend: values above the positive range of an sz-bit two's-complement
// number encode negative delta-of-deltas.
370 | bits, err = d.br.readBits(mbits) 371 | } 372 | if err != nil { 373 | return err 374 | } 375 | vbits := math.Float64bits(d.v) 376 | vbits ^= bits << d.trailing 377 | d.v = math.Float64frombits(vbits) 378 | } 379 | 380 | return nil 381 | } 382 | 383 | func bitRange(x int64, nbits uint8) bool { 384 | return -((1<<(nbits-1))-1) <= x && x <= 1<<(nbits-1) 385 | } 386 | -------------------------------------------------------------------------------- /encoding_test.go: -------------------------------------------------------------------------------- 1 | package tstorage 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | "github.com/stretchr/testify/require" 9 | ) 10 | 11 | func Test_gorillaEncoder_encodePoint_decodePoint(t *testing.T) { 12 | tests := []struct { 13 | name string 14 | input []*DataPoint // to be encoded 15 | want []*DataPoint 16 | wantEncodedByteSize int 17 | wantErr bool 18 | }{ 19 | { 20 | name: "one data point", 21 | input: []*DataPoint{ 22 | {Timestamp: 1600000000, Value: 0.1}, 23 | }, 24 | want: []*DataPoint{ 25 | {Timestamp: 1600000000, Value: 0.1}, 26 | }, 27 | wantEncodedByteSize: 14, 28 | wantErr: false, 29 | }, 30 | { 31 | name: "data points at regular intervals", 32 | input: []*DataPoint{ 33 | {Timestamp: 1600000000, Value: 0.1}, 34 | {Timestamp: 1600000060, Value: 0.1}, 35 | {Timestamp: 1600000120, Value: 0.1}, 36 | {Timestamp: 1600000180, Value: 0.1}, 37 | }, 38 | want: []*DataPoint{ 39 | {Timestamp: 1600000000, Value: 0.1}, 40 | {Timestamp: 1600000060, Value: 0.1}, 41 | {Timestamp: 1600000120, Value: 0.1}, 42 | {Timestamp: 1600000180, Value: 0.1}, 43 | }, 44 | wantEncodedByteSize: 15, 45 | wantErr: false, 46 | }, 47 | { 48 | name: "data points at random intervals", 49 | input: []*DataPoint{ 50 | {Timestamp: 1600000000, Value: 0.1}, 51 | {Timestamp: 1600000060, Value: 1.1}, 52 | {Timestamp: 1600000182, Value: 15.01}, 53 | {Timestamp: 1600000400, Value: 0.01}, 54 | {Timestamp: 1600002000, Value: 
10.8}, 55 | }, 56 | want: []*DataPoint{ 57 | {Timestamp: 1600000000, Value: 0.1}, 58 | {Timestamp: 1600000060, Value: 1.1}, 59 | {Timestamp: 1600000182, Value: 15.01}, 60 | {Timestamp: 1600000400, Value: 0.01}, 61 | {Timestamp: 1600002000, Value: 10.8}, 62 | }, 63 | wantEncodedByteSize: 52, 64 | wantErr: false, 65 | }, 66 | } 67 | for _, tt := range tests { 68 | t.Run(tt.name, func(t *testing.T) { 69 | // Encode 70 | var buf bytes.Buffer 71 | var num int 72 | encoder := newSeriesEncoder(&buf) 73 | for _, point := range tt.input { 74 | err := encoder.encodePoint(point) 75 | require.NoError(t, err) 76 | num++ 77 | } 78 | err := encoder.flush() 79 | require.NoError(t, err) 80 | 81 | assert.Equal(t, tt.wantEncodedByteSize, buf.Len()) 82 | 83 | // Decode 84 | decoder, err := newSeriesDecoder(&buf) 85 | require.NoError(t, err) 86 | got := make([]*DataPoint, 0, num) 87 | for i := 0; i < num; i++ { 88 | p := &DataPoint{} 89 | err := decoder.decodePoint(p) 90 | require.NoError(t, err) 91 | got = append(got, p) 92 | } 93 | assert.Equal(t, tt.want, got) 94 | }) 95 | } 96 | } 97 | 98 | func Test_bitRange(t *testing.T) { 99 | tests := []struct { 100 | name string 101 | x int64 102 | nbits uint8 103 | want bool 104 | }{ 105 | { 106 | name: "inside the range", 107 | x: 1, 108 | nbits: 1, 109 | want: true, 110 | }, 111 | } 112 | for _, tt := range tests { 113 | t.Run(tt.name, func(t *testing.T) { 114 | got := bitRange(tt.x, tt.nbits) 115 | assert.Equal(t, tt.want, got) 116 | }) 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /fake_encoder.go: -------------------------------------------------------------------------------- 1 | package tstorage 2 | 3 | type fakeEncoder struct { 4 | encodePointFunc func(*DataPoint) error 5 | flushFunc func() error 6 | } 7 | 8 | func (f *fakeEncoder) encodePoint(p *DataPoint) error { 9 | if f.encodePointFunc == nil { 10 | return nil 11 | } 12 | return f.encodePointFunc(p) 13 | } 14 | 15 | func (f 
// fakePartition is a test double for the partition interface.
// It returns canned values (minT, maxT, numPoints, IsActive) from the
// read-only methods, and the configured err from the data-access methods.
type fakePartition struct {
	minT      int64
	maxT      int64
	numPoints int
	IsActive  bool

	err error
}

// insertRows returns no outdated rows and the configured error.
func (f *fakePartition) insertRows(_ []Row) ([]Row, error) {
	return nil, f.err
}

// selectDataPoints returns no points and the configured error.
func (f *fakePartition) selectDataPoints(_ string, _ []Label, _, _ int64) ([]*DataPoint, error) {
	return nil, f.err
}

// minTimestamp returns the canned minimum timestamp.
func (f *fakePartition) minTimestamp() int64 {
	return f.minT
}

// maxTimestamp returns the canned maximum timestamp.
func (f *fakePartition) maxTimestamp() int64 {
	return f.maxT
}

// size returns the canned number of data points.
func (f *fakePartition) size() int {
	return f.numPoints
}

// active reports the canned activity flag.
func (f *fakePartition) active() bool {
	return f.IsActive
}

// clean is a no-op that always succeeds.
func (f *fakePartition) clean() error {
	return nil
}

// expired always reports false.
func (f *fakePartition) expired() bool {
	return false
}
| github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 4 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 5 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 6 | github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= 7 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 8 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 9 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 10 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= 11 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 12 | -------------------------------------------------------------------------------- /internal/cgroup/cpu.go: -------------------------------------------------------------------------------- 1 | package cgroup 2 | 3 | import ( 4 | "os" 5 | "runtime" 6 | "strconv" 7 | "strings" 8 | "sync" 9 | ) 10 | 11 | // AvailableCPUs returns the number of available CPU cores for the app. 12 | func AvailableCPUs() int { 13 | availableCPUsOnce.Do(updateGOMAXPROCSToCPUQuota) 14 | return runtime.GOMAXPROCS(-1) 15 | } 16 | 17 | var availableCPUsOnce sync.Once 18 | 19 | // updateGOMAXPROCSToCPUQuota updates GOMAXPROCS to cgroup CPU quota if GOMAXPROCS isn't set in environment var. 20 | func updateGOMAXPROCSToCPUQuota() { 21 | if v := os.Getenv("GOMAXPROCS"); v != "" { 22 | // Do not override explicitly set GOMAXPROCS. 23 | return 24 | } 25 | q := getCPUQuota() 26 | if q <= 0 { 27 | // Do not change GOMAXPROCS 28 | return 29 | } 30 | gomaxprocs := int(q + 0.5) 31 | numCPU := runtime.NumCPU() 32 | if gomaxprocs > numCPU { 33 | // There is no sense in setting more GOMAXPROCS than the number of available CPU cores. 
// countCPUs parses the contents of /sys/devices/system/cpu/online
// (e.g. "0-3,8,10-11") and returns the number of CPUs it lists.
// It returns -1 when the input cannot be parsed.
func countCPUs(data string) int {
	total := 0
	for _, item := range strings.Split(strings.TrimSpace(data), ",") {
		lo, hi, isRange := strings.Cut(item, "-")
		if !isRange {
			// A bare CPU index contributes exactly one CPU.
			if _, err := strconv.Atoi(item); err != nil {
				return -1
			}
			total++
			continue
		}
		// An inclusive "start-end" range contributes end-start+1 CPUs.
		start, err := strconv.Atoi(lo)
		if err != nil {
			return -1
		}
		end, err := strconv.Atoi(hi)
		if err != nil {
			return -1
		}
		total += end - start + 1
	}
	return total
}
TestCountCPUs(t *testing.T) { 8 | f := func(s string, nExpected int) { 9 | t.Helper() 10 | n := countCPUs(s) 11 | if n != nExpected { 12 | t.Fatalf("unexpected result from countCPUs(%q); got %d; want %d", s, n, nExpected) 13 | } 14 | } 15 | f("", -1) 16 | f("1", 1) 17 | f("234", 1) 18 | f("1,2", 2) 19 | f("0-1", 2) 20 | f("0-0", 1) 21 | f("1-2,3,5-9,200-210", 19) 22 | f("0-3", 4) 23 | f("0-6", 7) 24 | } 25 | -------------------------------------------------------------------------------- /internal/cgroup/mem.go: -------------------------------------------------------------------------------- 1 | package cgroup 2 | 3 | import ( 4 | "strconv" 5 | ) 6 | 7 | // GetMemoryLimit returns cgroup memory limit 8 | func GetMemoryLimit() int64 { 9 | // Try determining the amount of memory inside docker container. 10 | // See https://stackoverflow.com/questions/42187085/check-mem-limit-within-a-docker-container 11 | // 12 | // Read memory limit according to https://unix.stackexchange.com/questions/242718/how-to-find-out-how-much-memory-lxc-container-is-allowed-to-consume 13 | // This should properly determine the limit inside lxc container. 
// getHierarchicalMemoryLimit extracts the "hierarchical_memory_limit"
// value (bytes) from the memory.stat file found under sysfsPrefix,
// resolving the cgroup sub-path via cgroupPath when the direct file
// is missing.
func getHierarchicalMemoryLimit(sysfsPrefix, cgroupPath string) (int64, error) {
	data, err := getFileContents("memory.stat", sysfsPrefix, cgroupPath, "memory")
	if err != nil {
		return 0, err
	}
	// memory.stat lines look like "hierarchical_memory_limit 120";
	// field index 1 is the numeric value.
	memStat, err := grepFirstMatch(data, "hierarchical_memory_limit", 1, " ")
	if err != nil {
		return 0, err
	}
	return strconv.ParseInt(memStat, 10, 64)
}
cgroupPath string) { 24 | t.Helper() 25 | got, err := getHierarchicalMemoryLimit(sysPath, cgroupPath) 26 | if err == nil { 27 | t.Fatalf("expecting non-nil error") 28 | } 29 | if got != 0 { 30 | t.Fatalf("unexpected result, got: %d, want 0", got) 31 | } 32 | } 33 | f("testdata/", "testdata/none_existing_folder") 34 | } 35 | -------------------------------------------------------------------------------- /internal/cgroup/testdata/cgroup/cpu.cfs_period_us: -------------------------------------------------------------------------------- 1 | 500000 -------------------------------------------------------------------------------- /internal/cgroup/testdata/cgroup/cpu.cfs_quota_us: -------------------------------------------------------------------------------- 1 | 10 -------------------------------------------------------------------------------- /internal/cgroup/testdata/cgroup/memory.limit_in_bytes: -------------------------------------------------------------------------------- 1 | 523372036854771712 -------------------------------------------------------------------------------- /internal/cgroup/testdata/cgroup/memory.stat: -------------------------------------------------------------------------------- 1 | rss 2 2 | rss_huge 3 3 | mapped_file 4 4 | dirty 5 5 | writeback 6 6 | pgpgin 7 7 | pgpgout 8 8 | pgfault 9 9 | pgmajfault 10 10 | inactive_anon 11 11 | active_anon 12 12 | inactive_file 13 13 | active_file 14 14 | unevictable 15 15 | hierarchical_memory_limit 120 16 | hierarchical_memsw_limit 17 17 | total_cache 18 18 | total_rss 19 19 | total_rss_huge 20 20 | total_mapped_file 21 21 | total_dirty 22 22 | total_writeback 23 23 | total_pgpgin 24 24 | total_pgpgout 25 25 | total_pgfault 26 26 | total_pgmajfault 27 27 | total_inactive_anon 28 28 | total_active_anon 29 29 | total_inactive_file 30 30 | total_active_file 31 31 | total_unevictable 32 -------------------------------------------------------------------------------- 
/internal/cgroup/testdata/docker/74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db/cpu.cfs_period_us: -------------------------------------------------------------------------------- 1 | 100000 -------------------------------------------------------------------------------- /internal/cgroup/testdata/docker/74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db/cpu.cfs_quota_us: -------------------------------------------------------------------------------- 1 | -1 -------------------------------------------------------------------------------- /internal/cgroup/testdata/docker/74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db/memory.limit_in_bytes: -------------------------------------------------------------------------------- 1 | 9223372036854771712 -------------------------------------------------------------------------------- /internal/cgroup/testdata/docker/74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db/memory.stat: -------------------------------------------------------------------------------- 1 | rss 2 2 | rss_huge 3 3 | mapped_file 4 4 | dirty 5 5 | writeback 6 6 | pgpgin 7 7 | pgpgout 8 8 | pgfault 9 9 | pgmajfault 10 10 | inactive_anon 11 11 | active_anon 12 12 | inactive_file 13 13 | active_file 14 14 | unevictable 15 15 | hierarchical_memory_limit 16 16 | hierarchical_memsw_limit 17 17 | total_cache 18 18 | total_rss 19 19 | total_rss_huge 20 20 | total_mapped_file 21 21 | total_dirty 22 22 | total_writeback 23 23 | total_pgpgin 24 24 | total_pgpgout 25 25 | total_pgfault 26 26 | total_pgmajfault 27 27 | total_inactive_anon 28 28 | total_active_anon 29 29 | total_inactive_file 30 30 | total_active_file 31 31 | total_unevictable 32 -------------------------------------------------------------------------------- /internal/cgroup/testdata/self/cgroup: -------------------------------------------------------------------------------- 1 | 
12:perf_event:/docker/74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db 2 | 11:rdma:/ 3 | 10:pids:/docker/74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db 4 | 9:freezer:/docker/74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db 5 | 8:memory:/docker/74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db 6 | 7:devices:/docker/74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db 7 | 6:cpuset:/docker/74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db 8 | 5:hugetlb:/docker/74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db 9 | 4:net_cls,net_prio:/docker/74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db 10 | 3:blkio:/docker/74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db 11 | 2:cpu,cpuacct:/docker/74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db 12 | 1:name=systemd:/docker/74c9abf42b88b9a35b1b56061b08303e56fd1707fe5c5b4df93324dedb36b5db 13 | 0::/system.slice/containerd.service -------------------------------------------------------------------------------- /internal/cgroup/util.go: -------------------------------------------------------------------------------- 1 | package cgroup 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "path" 7 | "strconv" 8 | "strings" 9 | ) 10 | 11 | func getStatGeneric(statName, sysfsPrefix, cgroupPath, cgroupGrepLine string) (int64, error) { 12 | data, err := getFileContents(statName, sysfsPrefix, cgroupPath, cgroupGrepLine) 13 | if err != nil { 14 | return 0, err 15 | } 16 | n, err := strconv.ParseInt(data, 10, 64) 17 | if err != nil { 18 | return 0, err 19 | } 20 | return n, nil 21 | } 22 | 23 | func getFileContents(statName, sysfsPrefix, cgroupPath, cgroupGrepLine string) (string, error) { 24 | filepath := path.Join(sysfsPrefix, statName) 25 | data, err := os.ReadFile(filepath) 26 | if err == nil { 27 | return string(data), nil 28 | } 29 | cgroupData, err := os.ReadFile(cgroupPath) 30 | if err != nil { 31 | 
// grepFirstMatch searches match line at data and returns item from it by index with given delimiter.
// If no line both contains match and has enough delimited fields, an error
// is returned.
func grepFirstMatch(data string, match string, index int, delimiter string) (string, error) {
	for _, line := range strings.Split(data, "\n") {
		if !strings.Contains(line, match) {
			continue
		}
		fields := strings.Split(line, delimiter)
		if index >= len(fields) {
			// Matching line is too short; keep scanning later lines.
			continue
		}
		return strings.TrimSpace(fields[index]), nil
	}
	return "", fmt.Errorf("cannot find %q in %q", match, data)
}
// UnmarshalUint16 returns unmarshaled uint16 from src.
// src must hold at least 2 bytes in big-endian order (the format
// produced by MarshalUint16).
func UnmarshalUint16(src []byte) uint16 {
	// This is faster than the manual conversion.
	return binary.BigEndian.Uint16(src)
}
4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | // +build windows 15 | 16 | package syscall 17 | 18 | const maxMapSize = 0xFFFFFFFFFFFF // 256TB 19 | -------------------------------------------------------------------------------- /internal/syscall/mmap_arm.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Prometheus Authors 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 
13 | 14 | // +build windows 15 | 16 | package syscall 17 | 18 | const maxMapSize = 0x7FFFFFFF // 2GB 19 | -------------------------------------------------------------------------------- /internal/syscall/mmap_unix.go: -------------------------------------------------------------------------------- 1 | // +build !windows,!plan9 2 | 3 | package syscall 4 | 5 | import "syscall" 6 | 7 | func mmap(fd, length int) ([]byte, error) { 8 | return syscall.Mmap( 9 | fd, 10 | 0, 11 | length, 12 | syscall.PROT_READ, 13 | syscall.MAP_SHARED, 14 | ) 15 | } 16 | -------------------------------------------------------------------------------- /internal/syscall/mmap_windows.go: -------------------------------------------------------------------------------- 1 | // Copyright 2017 The Prometheus Authors 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 
// Get returns a timer for the given duration d from the pool.
//
// Return back the timer to the pool with Put.
func Get(d time.Duration) *time.Timer {
	if v := timerPool.Get(); v != nil {
		t := v.(*time.Timer)
		// Reset reports true if the timer was still active. A pooled timer
		// must always be stopped/drained (Put guarantees this), so an
		// active timer here indicates misuse of the pool.
		if t.Reset(d) {
			panic("active timer trapped to the pool!")
		}
		return t
	}
	// Pool is empty: allocate a fresh timer.
	return time.NewTimer(d)
}
// marshalMetricName builds a unique bytes by encoding labels.
// Layout: uint16(len(metric)) + metric, then for each valid label
// uint16(len(Name)) + Name + uint16(len(Value)) + Value, with labels
// sorted by name so equal label sets always produce the same key.
//
// NOTE: labels is mutated in place — it is sorted, and over-long
// names/values are truncated to maxLabelNameLen/maxLabelValueLen.
// Labels with an empty name or value are skipped.
func marshalMetricName(metric string, labels []Label) string {
	if len(labels) == 0 {
		return metric
	}
	invalid := func(name, value string) bool {
		return name == "" || value == ""
	}

	// Determine the bytes size in advance.
	size := len(metric) + 2
	sort.Slice(labels, func(i, j int) bool {
		return labels[i].Name < labels[j].Name
	})
	for i := range labels {
		label := &labels[i]
		if invalid(label.Name, label.Value) {
			continue
		}
		if len(label.Name) > maxLabelNameLen {
			label.Name = label.Name[:maxLabelNameLen]
		}
		if len(label.Value) > maxLabelValueLen {
			label.Value = label.Value[:maxLabelValueLen]
		}
		size += len(label.Name)
		size += len(label.Value)
		size += 4 // two uint16 length prefixes per label
	}

	// Start building the bytes.
	out := make([]byte, 0, size)
	out = encoding.MarshalUint16(out, uint16(len(metric)))
	out = append(out, metric...)
	for i := range labels {
		label := &labels[i]
		if invalid(label.Name, label.Value) {
			continue
		}
		out = encoding.MarshalUint16(out, uint16(len(label.Name)))
		out = append(out, label.Name...)
		out = encoding.MarshalUint16(out, uint16(len(label.Value)))
		out = append(out, label.Value...)
	}
	return string(out)
}
// newMemoryPartition generates a new in-heap partition.
// A nil wal is replaced with a no-op WAL. partitionDuration is converted
// into units of the given timestamp precision; unknown precisions fall
// back to nanoseconds.
func newMemoryPartition(wal wal, partitionDuration time.Duration, precision TimestampPrecision) partition {
	if wal == nil {
		wal = &nopWAL{}
	}
	var d int64
	switch precision {
	case Nanoseconds:
		d = partitionDuration.Nanoseconds()
	case Microseconds:
		d = partitionDuration.Microseconds()
	case Milliseconds:
		d = partitionDuration.Milliseconds()
	case Seconds:
		d = int64(partitionDuration.Seconds())
	default:
		d = partitionDuration.Nanoseconds()
	}
	return &memoryPartition{
		partitionDuration:  d,
		wal:                wal,
		timestampPrecision: precision,
	}
}
56 | func (m *memoryPartition) insertRows(rows []Row) ([]Row, error) { 57 | if len(rows) == 0 { 58 | return nil, fmt.Errorf("no rows given") 59 | } 60 | // FIXME: Just emitting log is enough 61 | err := m.wal.append(operationInsert, rows) 62 | if err != nil { 63 | return nil, fmt.Errorf("failed to write to WAL: %w", err) 64 | } 65 | 66 | // Set min timestamp at only first. 67 | m.once.Do(func() { 68 | min := rows[0].Timestamp 69 | for i := range rows { 70 | row := rows[i] 71 | if row.Timestamp < min { 72 | min = row.Timestamp 73 | } 74 | } 75 | atomic.StoreInt64(&m.minT, min) 76 | }) 77 | 78 | outdatedRows := make([]Row, 0) 79 | maxTimestamp := rows[0].Timestamp 80 | var rowsNum int64 81 | for i := range rows { 82 | row := rows[i] 83 | if row.Timestamp < m.minTimestamp() { 84 | outdatedRows = append(outdatedRows, row) 85 | continue 86 | } 87 | if row.Timestamp == 0 { 88 | row.Timestamp = toUnix(time.Now(), m.timestampPrecision) 89 | } 90 | if row.Timestamp > maxTimestamp { 91 | maxTimestamp = row.Timestamp 92 | } 93 | name := marshalMetricName(row.Metric, row.Labels) 94 | mt := m.getMetric(name) 95 | mt.insertPoint(&row.DataPoint) 96 | rowsNum++ 97 | } 98 | atomic.AddInt64(&m.numPoints, rowsNum) 99 | 100 | // Make max timestamp up-to-date. 
101 | if atomic.LoadInt64(&m.maxT) < maxTimestamp { 102 | atomic.SwapInt64(&m.maxT, maxTimestamp) 103 | } 104 | 105 | return outdatedRows, nil 106 | } 107 | 108 | func toUnix(t time.Time, precision TimestampPrecision) int64 { 109 | switch precision { 110 | case Nanoseconds: 111 | return t.UnixNano() 112 | case Microseconds: 113 | return t.UnixNano() / 1e3 114 | case Milliseconds: 115 | return t.UnixNano() / 1e6 116 | case Seconds: 117 | return t.Unix() 118 | default: 119 | return t.UnixNano() 120 | } 121 | } 122 | 123 | func (m *memoryPartition) selectDataPoints(metric string, labels []Label, start, end int64) ([]*DataPoint, error) { 124 | name := marshalMetricName(metric, labels) 125 | mt := m.getMetric(name) 126 | return mt.selectPoints(start, end), nil 127 | } 128 | 129 | // getMetric gives back the reference to the metrics list whose name is the given one. 130 | // If none, it creates a new one. 131 | func (m *memoryPartition) getMetric(name string) *memoryMetric { 132 | value, ok := m.metrics.Load(name) 133 | if !ok { 134 | value = &memoryMetric{ 135 | name: name, 136 | points: make([]*DataPoint, 0, 1000), 137 | outOfOrderPoints: make([]*DataPoint, 0), 138 | } 139 | m.metrics.Store(name, value) 140 | } 141 | return value.(*memoryMetric) 142 | } 143 | 144 | func (m *memoryPartition) minTimestamp() int64 { 145 | return atomic.LoadInt64(&m.minT) 146 | } 147 | 148 | func (m *memoryPartition) maxTimestamp() int64 { 149 | return atomic.LoadInt64(&m.maxT) 150 | } 151 | 152 | func (m *memoryPartition) size() int { 153 | return int(atomic.LoadInt64(&m.numPoints)) 154 | } 155 | 156 | func (m *memoryPartition) active() bool { 157 | return m.maxTimestamp()-m.minTimestamp()+1 < m.partitionDuration 158 | } 159 | 160 | func (m *memoryPartition) clean() error { 161 | // What all data managed by memoryPartition is on heap that is automatically removed by GC. 162 | // So do nothing. 
163 | return nil 164 | } 165 | 166 | func (m *memoryPartition) expired() bool { 167 | return false 168 | } 169 | 170 | // memoryMetric has a list of ordered data points that belong to the memoryMetric 171 | type memoryMetric struct { 172 | name string 173 | size int64 174 | minTimestamp int64 175 | maxTimestamp int64 176 | // points must kept in order 177 | points []*DataPoint 178 | outOfOrderPoints []*DataPoint 179 | mu sync.RWMutex 180 | } 181 | 182 | func (m *memoryMetric) insertPoint(point *DataPoint) { 183 | size := atomic.LoadInt64(&m.size) 184 | // TODO: Consider to stop using mutex every time. 185 | // Instead, fix the capacity of points slice, kind of like: 186 | /* 187 | m.points := make([]*DataPoint, 1000) 188 | for i := 0; i < 1000; i++ { 189 | m.points[i] = point 190 | } 191 | */ 192 | m.mu.Lock() 193 | defer m.mu.Unlock() 194 | 195 | // First insertion 196 | if size == 0 { 197 | m.points = append(m.points, point) 198 | atomic.StoreInt64(&m.minTimestamp, point.Timestamp) 199 | atomic.StoreInt64(&m.maxTimestamp, point.Timestamp) 200 | atomic.AddInt64(&m.size, 1) 201 | return 202 | } 203 | // Insert point in order 204 | if m.points[size-1].Timestamp < point.Timestamp { 205 | m.points = append(m.points, point) 206 | atomic.StoreInt64(&m.maxTimestamp, point.Timestamp) 207 | atomic.AddInt64(&m.size, 1) 208 | return 209 | } 210 | 211 | m.outOfOrderPoints = append(m.outOfOrderPoints, point) 212 | } 213 | 214 | // selectPoints returns a new slice by re-slicing with [startIdx:endIdx]. 
215 | func (m *memoryMetric) selectPoints(start, end int64) []*DataPoint { 216 | size := atomic.LoadInt64(&m.size) 217 | minTimestamp := atomic.LoadInt64(&m.minTimestamp) 218 | maxTimestamp := atomic.LoadInt64(&m.maxTimestamp) 219 | var startIdx, endIdx int 220 | 221 | if end <= minTimestamp { 222 | return []*DataPoint{} 223 | } 224 | 225 | m.mu.RLock() 226 | defer m.mu.RUnlock() 227 | if start <= minTimestamp { 228 | startIdx = 0 229 | } else { 230 | // Use binary search because points are in-order. 231 | startIdx = sort.Search(int(size), func(i int) bool { 232 | return m.points[i].Timestamp >= start 233 | }) 234 | } 235 | 236 | if end > maxTimestamp { 237 | endIdx = int(size) 238 | } else { 239 | // Use binary search because points are in-order. 240 | endIdx = sort.Search(int(size), func(i int) bool { 241 | return m.points[i].Timestamp >= end 242 | }) 243 | } 244 | return m.points[startIdx:endIdx] 245 | } 246 | 247 | // encodeAllPoints uses the given seriesEncoder to encode all metric data points in order by timestamp, 248 | // including outOfOrderPoints. 
249 | func (m *memoryMetric) encodeAllPoints(encoder seriesEncoder) error { 250 | sort.Slice(m.outOfOrderPoints, func(i, j int) bool { 251 | return m.outOfOrderPoints[i].Timestamp < m.outOfOrderPoints[j].Timestamp 252 | }) 253 | 254 | var oi, pi int 255 | for oi < len(m.outOfOrderPoints) && pi < len(m.points) { 256 | if m.outOfOrderPoints[oi].Timestamp < m.points[pi].Timestamp { 257 | if err := encoder.encodePoint(m.outOfOrderPoints[oi]); err != nil { 258 | return err 259 | } 260 | oi++ 261 | } else { 262 | if err := encoder.encodePoint(m.points[pi]); err != nil { 263 | return err 264 | } 265 | pi++ 266 | } 267 | } 268 | for oi < len(m.outOfOrderPoints) { 269 | if err := encoder.encodePoint(m.outOfOrderPoints[oi]); err != nil { 270 | return err 271 | } 272 | oi++ 273 | } 274 | for pi < len(m.points) { 275 | if err := encoder.encodePoint(m.points[pi]); err != nil { 276 | return err 277 | } 278 | pi++ 279 | } 280 | 281 | return nil 282 | } 283 | -------------------------------------------------------------------------------- /memory_partition_test.go: -------------------------------------------------------------------------------- 1 | package tstorage 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | "time" 7 | 8 | "github.com/stretchr/testify/assert" 9 | "github.com/stretchr/testify/require" 10 | ) 11 | 12 | func Test_memoryPartition_InsertRows(t *testing.T) { 13 | tests := []struct { 14 | name string 15 | memoryPartition *memoryPartition 16 | rows []Row 17 | wantErr bool 18 | wantDataPoints []*DataPoint 19 | wantOutOfOrderRows []Row 20 | }{ 21 | { 22 | name: "insert in-order rows", 23 | memoryPartition: newMemoryPartition(nil, 0, "").(*memoryPartition), 24 | rows: []Row{ 25 | {Metric: "metric1", DataPoint: DataPoint{Timestamp: 1, Value: 0.1}}, 26 | {Metric: "metric1", DataPoint: DataPoint{Timestamp: 2, Value: 0.1}}, 27 | {Metric: "metric1", DataPoint: DataPoint{Timestamp: 3, Value: 0.1}}, 28 | }, 29 | wantDataPoints: []*DataPoint{ 30 | {Timestamp: 1, Value: 0.1}, 31 | 
{Timestamp: 2, Value: 0.1}, 32 | {Timestamp: 3, Value: 0.1}, 33 | }, 34 | wantOutOfOrderRows: []Row{}, 35 | }, 36 | { 37 | name: "insert out-of-order rows", 38 | memoryPartition: func() *memoryPartition { 39 | m := newMemoryPartition(nil, 0, "").(*memoryPartition) 40 | m.insertRows([]Row{ 41 | {Metric: "metric1", DataPoint: DataPoint{Timestamp: 2, Value: 0.1}}, 42 | }) 43 | return m 44 | }(), 45 | rows: []Row{ 46 | {Metric: "metric1", DataPoint: DataPoint{Timestamp: 1, Value: 0.1}}, 47 | }, 48 | wantDataPoints: []*DataPoint{ 49 | {Timestamp: 2, Value: 0.1}, 50 | }, 51 | wantOutOfOrderRows: []Row{ 52 | {Metric: "metric1", DataPoint: DataPoint{Timestamp: 1, Value: 0.1}}, 53 | }, 54 | }, 55 | } 56 | for _, tt := range tests { 57 | t.Run(tt.name, func(t *testing.T) { 58 | gotOutOfOrder, err := tt.memoryPartition.insertRows(tt.rows) 59 | assert.Equal(t, tt.wantErr, err != nil) 60 | assert.Equal(t, tt.wantOutOfOrderRows, gotOutOfOrder) 61 | 62 | got, _ := tt.memoryPartition.selectDataPoints("metric1", nil, 0, 4) 63 | assert.Equal(t, tt.wantDataPoints, got) 64 | }) 65 | } 66 | } 67 | 68 | func Test_memoryPartition_SelectDataPoints(t *testing.T) { 69 | tests := []struct { 70 | name string 71 | metric string 72 | labels []Label 73 | start int64 74 | end int64 75 | memoryPartition *memoryPartition 76 | want []*DataPoint 77 | }{ 78 | { 79 | name: "given non-exist metric name", 80 | metric: "unknown", 81 | start: 1, 82 | end: 2, 83 | memoryPartition: newMemoryPartition(nil, 0, "").(*memoryPartition), 84 | want: []*DataPoint{}, 85 | }, 86 | { 87 | name: "select some points", 88 | metric: "metric1", 89 | start: 2, 90 | end: 5, 91 | memoryPartition: func() *memoryPartition { 92 | m := newMemoryPartition(nil, 0, "").(*memoryPartition) 93 | m.insertRows([]Row{ 94 | { 95 | Metric: "metric1", 96 | DataPoint: DataPoint{Timestamp: 1, Value: 0.1}, 97 | }, 98 | { 99 | Metric: "metric1", 100 | DataPoint: DataPoint{Timestamp: 2, Value: 0.1}, 101 | }, 102 | { 103 | Metric: "metric1", 104 | 
DataPoint: DataPoint{Timestamp: 3, Value: 0.1}, 105 | }, 106 | { 107 | Metric: "metric1", 108 | DataPoint: DataPoint{Timestamp: 4, Value: 0.1}, 109 | }, 110 | { 111 | Metric: "metric1", 112 | DataPoint: DataPoint{Timestamp: 5, Value: 0.1}, 113 | }, 114 | }) 115 | return m 116 | }(), 117 | want: []*DataPoint{ 118 | {Timestamp: 2, Value: 0.1}, 119 | {Timestamp: 3, Value: 0.1}, 120 | {Timestamp: 4, Value: 0.1}, 121 | }, 122 | }, 123 | { 124 | name: "select all points", 125 | metric: "metric1", 126 | start: 1, 127 | end: 4, 128 | memoryPartition: func() *memoryPartition { 129 | m := newMemoryPartition(nil, 0, "").(*memoryPartition) 130 | m.insertRows([]Row{ 131 | { 132 | Metric: "metric1", 133 | DataPoint: DataPoint{Timestamp: 1, Value: 0.1}, 134 | }, 135 | { 136 | Metric: "metric1", 137 | DataPoint: DataPoint{Timestamp: 2, Value: 0.1}, 138 | }, 139 | { 140 | Metric: "metric1", 141 | DataPoint: DataPoint{Timestamp: 3, Value: 0.1}, 142 | }, 143 | }) 144 | return m 145 | }(), 146 | want: []*DataPoint{ 147 | {Timestamp: 1, Value: 0.1}, 148 | {Timestamp: 2, Value: 0.1}, 149 | {Timestamp: 3, Value: 0.1}, 150 | }, 151 | }, 152 | } 153 | for _, tt := range tests { 154 | t.Run(tt.name, func(t *testing.T) { 155 | got, _ := tt.memoryPartition.selectDataPoints(tt.metric, tt.labels, tt.start, tt.end) 156 | assert.Equal(t, tt.want, got) 157 | }) 158 | } 159 | } 160 | 161 | func Test_memoryMetric_EncodeAllPoints_sorted(t *testing.T) { 162 | mt := memoryMetric{ 163 | points: []*DataPoint{ 164 | {Timestamp: 1, Value: 0.1}, 165 | {Timestamp: 3, Value: 0.1}, 166 | }, 167 | outOfOrderPoints: []*DataPoint{ 168 | {Timestamp: 4, Value: 0.1}, 169 | {Timestamp: 2, Value: 0.1}, 170 | }, 171 | } 172 | allTimestamps := make([]int64, 0, 4) 173 | encoder := fakeEncoder{ 174 | encodePointFunc: func(p *DataPoint) error { 175 | allTimestamps = append(allTimestamps, p.Timestamp) 176 | return nil 177 | }, 178 | } 179 | err := mt.encodeAllPoints(&encoder) 180 | require.NoError(t, err) 181 | 
assert.Equal(t, []int64{1, 2, 3, 4}, allTimestamps) 182 | } 183 | 184 | func Test_memoryMetric_EncodeAllPoints_error(t *testing.T) { 185 | mt := memoryMetric{ 186 | points: []*DataPoint{{Timestamp: 1, Value: 0.1}}, 187 | } 188 | encoder := fakeEncoder{ 189 | encodePointFunc: func(p *DataPoint) error { 190 | return fmt.Errorf("some error") 191 | }, 192 | } 193 | err := mt.encodeAllPoints(&encoder) 194 | assert.Error(t, err) 195 | } 196 | 197 | func Test_toUnix(t *testing.T) { 198 | tests := []struct { 199 | name string 200 | t time.Time 201 | precision TimestampPrecision 202 | want int64 203 | }{ 204 | { 205 | name: "to nanosecond", 206 | t: time.Unix(1600000000, 0), 207 | precision: Nanoseconds, 208 | want: 1600000000000000000, 209 | }, 210 | { 211 | name: "to microsecond", 212 | t: time.Unix(1600000000, 0), 213 | precision: Microseconds, 214 | want: 1600000000000000, 215 | }, 216 | { 217 | name: "to millisecond", 218 | t: time.Unix(1600000000, 0), 219 | precision: Milliseconds, 220 | want: 1600000000000, 221 | }, 222 | { 223 | name: "to second", 224 | t: time.Unix(1600000000, 0), 225 | precision: Seconds, 226 | want: 1600000000, 227 | }, 228 | } 229 | for _, tt := range tests { 230 | t.Run(tt.name, func(t *testing.T) { 231 | got := toUnix(tt.t, tt.precision) 232 | assert.Equal(t, tt.want, got) 233 | }) 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /partition.go: -------------------------------------------------------------------------------- 1 | package tstorage 2 | 3 | // partition is a chunk of time-series data with the timestamp range. 4 | // A partition acts as a fully independent database containing all data 5 | // points for its time range. 6 | // 7 | // The partition's lifecycle is: Writable -> ReadOnly. 8 | // *Writable*: 9 | // it can be written. Only one partition can be writable within a partition list. 10 | // *ReadOnly*: 11 | // it can't be written. 
Partitions will be ReadOnly if it exceeds the partition range. 12 | type partition interface { 13 | // Write operations 14 | // 15 | // insertRows is a goroutine safe way to insert data points into itself. 16 | // If data points older than its min timestamp were given, they won't be 17 | // ingested, instead, gave back as a first returned value. 18 | insertRows(rows []Row) (outdatedRows []Row, err error) 19 | // clean removes everything managed by this partition. 20 | clean() error 21 | 22 | // Read operations 23 | // 24 | // selectDataPoints gives back certain metric's data points within the given range. 25 | selectDataPoints(metric string, labels []Label, start, end int64) ([]*DataPoint, error) 26 | // minTimestamp returns the minimum Unix timestamp in milliseconds. 27 | minTimestamp() int64 28 | // maxTimestamp returns the maximum Unix timestamp in milliseconds. 29 | maxTimestamp() int64 30 | // size returns the number of data points the partition holds. 31 | size() int 32 | // active means not only writable but having the qualities to be the head partition. 33 | active() bool 34 | // expired means it should get removed. 35 | expired() bool 36 | } 37 | -------------------------------------------------------------------------------- /partition_list.go: -------------------------------------------------------------------------------- 1 | package tstorage 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | "sync" 7 | "sync/atomic" 8 | ) 9 | 10 | // partitionList represents a linked list for partitions. 11 | // Each partition is arranged in order order of newest to oldest. 12 | // That is, the head node is always the newest, the tail node is the oldest. 13 | // 14 | // Head and its next partitions must be writable to accept out-of-order data points 15 | // even if it's inactive. 16 | type partitionList interface { 17 | // insert appends a new node to the head. 18 | insert(partition partition) 19 | // remove eliminates the given partition from the list. 
20 | remove(partition partition) error 21 | // swap replaces the old partition with the new one. 22 | swap(old, new partition) error 23 | // getHead gives back the head node which is the newest one. 24 | getHead() partition 25 | // size returns the number of partitions of itself. 26 | size() int 27 | // newIterator gives back the iterator object fot this list. 28 | // If you need to inspect all nodes within the list, use this one. 29 | newIterator() partitionIterator 30 | 31 | String() string 32 | } 33 | 34 | // Iterator represents an iterator for partition list. The basic usage is: 35 | /* 36 | for iterator.next() { 37 | partition, err := iterator.value() 38 | // Do something with partition 39 | } 40 | */ 41 | type partitionIterator interface { 42 | // next positions the iterator at the next node in the list. 43 | // It will be positioned at the head on the first call. 44 | // The return value will be true if a value can be read from the list. 45 | next() bool 46 | // value gives back the current partition in the iterator. 47 | // If it was called even though next() returns false, it will return nil. 
48 | value() partition 49 | 50 | currentNode() *partitionNode 51 | } 52 | 53 | type partitionListImpl struct { 54 | numPartitions int64 55 | head *partitionNode 56 | tail *partitionNode 57 | mu sync.RWMutex 58 | } 59 | 60 | func newPartitionList() partitionList { 61 | return &partitionListImpl{} 62 | } 63 | 64 | func (p *partitionListImpl) getHead() partition { 65 | if p.size() <= 0 { 66 | return nil 67 | } 68 | p.mu.RLock() 69 | defer p.mu.RUnlock() 70 | return p.head.value() 71 | } 72 | 73 | func (p *partitionListImpl) insert(partition partition) { 74 | node := &partitionNode{ 75 | val: partition, 76 | } 77 | p.mu.RLock() 78 | head := p.head 79 | p.mu.RUnlock() 80 | if head != nil { 81 | node.next = head 82 | } 83 | 84 | p.setHead(node) 85 | atomic.AddInt64(&p.numPartitions, 1) 86 | } 87 | 88 | func (p *partitionListImpl) remove(target partition) error { 89 | if p.size() <= 0 { 90 | return fmt.Errorf("empty partition") 91 | } 92 | 93 | // Iterate over itself from the head. 94 | var prev, next *partitionNode 95 | iterator := p.newIterator() 96 | for iterator.next() { 97 | current := iterator.currentNode() 98 | if !samePartitions(current.value(), target) { 99 | prev = current 100 | continue 101 | } 102 | 103 | // remove the current node. 
104 | 105 | iterator.next() 106 | next = iterator.currentNode() 107 | switch { 108 | case prev == nil: 109 | // removing the head node 110 | p.setHead(next) 111 | case next == nil: 112 | // removing the tail node 113 | prev.setNext(nil) 114 | p.setTail(prev) 115 | default: 116 | // removing the middle node 117 | prev.setNext(next) 118 | } 119 | atomic.AddInt64(&p.numPartitions, -1) 120 | 121 | if err := current.value().clean(); err != nil { 122 | return fmt.Errorf("failed to clean resources managed by partition to be removed: %w", err) 123 | } 124 | return nil 125 | } 126 | 127 | return fmt.Errorf("the given partition was not found") 128 | } 129 | 130 | func (p *partitionListImpl) swap(old, new partition) error { 131 | if p.size() <= 0 { 132 | return fmt.Errorf("empty partition") 133 | } 134 | 135 | // Iterate over itself from the head. 136 | var prev, next *partitionNode 137 | iterator := p.newIterator() 138 | for iterator.next() { 139 | current := iterator.currentNode() 140 | if !samePartitions(current.value(), old) { 141 | prev = current 142 | continue 143 | } 144 | 145 | // swap the current node. 
146 | 147 | newNode := &partitionNode{ 148 | val: new, 149 | next: current.getNext(), 150 | } 151 | iterator.next() 152 | next = iterator.currentNode() 153 | switch { 154 | case prev == nil: 155 | // swapping the head node 156 | p.setHead(newNode) 157 | case next == nil: 158 | // swapping the tail node 159 | prev.setNext(newNode) 160 | p.setTail(newNode) 161 | default: 162 | // swapping the middle node 163 | prev.setNext(newNode) 164 | } 165 | return nil 166 | } 167 | 168 | return fmt.Errorf("the given partition was not found") 169 | } 170 | 171 | func samePartitions(x, y partition) bool { 172 | return x.minTimestamp() == y.minTimestamp() 173 | } 174 | 175 | func (p *partitionListImpl) size() int { 176 | return int(atomic.LoadInt64(&p.numPartitions)) 177 | } 178 | 179 | func (p *partitionListImpl) newIterator() partitionIterator { 180 | p.mu.RLock() 181 | head := p.head 182 | p.mu.RUnlock() 183 | // Put a dummy node so that it positions the head on the first next() call. 184 | dummy := &partitionNode{ 185 | next: head, 186 | } 187 | return &partitionIteratorImpl{ 188 | current: dummy, 189 | } 190 | } 191 | 192 | func (p *partitionListImpl) setHead(node *partitionNode) { 193 | p.mu.Lock() 194 | defer p.mu.Unlock() 195 | p.head = node 196 | } 197 | 198 | func (p *partitionListImpl) setTail(node *partitionNode) { 199 | p.mu.Lock() 200 | defer p.mu.Unlock() 201 | p.tail = node 202 | } 203 | 204 | func (p *partitionListImpl) String() string { 205 | b := &strings.Builder{} 206 | iterator := p.newIterator() 207 | for iterator.next() { 208 | p := iterator.value() 209 | if _, ok := p.(*memoryPartition); ok { 210 | b.WriteString("[Memory Partition]") 211 | } else if _, ok := p.(*diskPartition); ok { 212 | b.WriteString("[Disk Partition]") 213 | } else { 214 | b.WriteString("[Unknown Partition]") 215 | } 216 | b.WriteString("->") 217 | } 218 | return strings.TrimSuffix(b.String(), "->") 219 | } 220 | 221 | // partitionNode wraps a partition to hold the pointer to the next 
one. 222 | type partitionNode struct { 223 | // val is immutable 224 | val partition 225 | next *partitionNode 226 | mu sync.RWMutex 227 | } 228 | 229 | // value gives back the actual partition of the node. 230 | func (p *partitionNode) value() partition { 231 | return p.val 232 | } 233 | 234 | func (p *partitionNode) setNext(node *partitionNode) { 235 | p.mu.Lock() 236 | defer p.mu.Unlock() 237 | p.next = node 238 | } 239 | 240 | func (p *partitionNode) getNext() *partitionNode { 241 | p.mu.RLock() 242 | defer p.mu.RUnlock() 243 | return p.next 244 | } 245 | 246 | type partitionIteratorImpl struct { 247 | current *partitionNode 248 | } 249 | 250 | func (i *partitionIteratorImpl) next() bool { 251 | if i.current == nil { 252 | return false 253 | } 254 | next := i.current.getNext() 255 | i.current = next 256 | return i.current != nil 257 | } 258 | 259 | func (i *partitionIteratorImpl) value() partition { 260 | if i.current == nil { 261 | return nil 262 | } 263 | return i.current.value() 264 | } 265 | 266 | func (i *partitionIteratorImpl) currentNode() *partitionNode { 267 | return i.current 268 | } 269 | -------------------------------------------------------------------------------- /partition_list_test.go: -------------------------------------------------------------------------------- 1 | package tstorage 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func Test_partitionList_Remove(t *testing.T) { 10 | tests := []struct { 11 | name string 12 | partitionList partitionListImpl 13 | target partition 14 | wantErr bool 15 | wantPartitionList partitionListImpl 16 | }{ 17 | { 18 | name: "empty partition", 19 | partitionList: partitionListImpl{}, 20 | wantErr: true, 21 | }, 22 | { 23 | name: "remove the head node", 24 | partitionList: func() partitionListImpl { 25 | second := &partitionNode{ 26 | val: &fakePartition{ 27 | minT: 2, 28 | }, 29 | } 30 | 31 | first := &partitionNode{ 32 | val: &fakePartition{ 33 | minT: 1, 34 | }, 
35 | next: second, 36 | } 37 | return partitionListImpl{ 38 | numPartitions: 2, 39 | head: first, 40 | tail: second, 41 | } 42 | }(), 43 | target: &fakePartition{ 44 | minT: 1, 45 | }, 46 | wantPartitionList: partitionListImpl{ 47 | numPartitions: 1, 48 | head: &partitionNode{ 49 | val: &fakePartition{ 50 | minT: 2, 51 | }, 52 | }, 53 | tail: &partitionNode{ 54 | val: &fakePartition{ 55 | minT: 2, 56 | }, 57 | }, 58 | }, 59 | }, 60 | { 61 | name: "remove the tail node", 62 | partitionList: func() partitionListImpl { 63 | second := &partitionNode{ 64 | val: &fakePartition{ 65 | minT: 2, 66 | }, 67 | } 68 | 69 | first := &partitionNode{ 70 | val: &fakePartition{ 71 | minT: 1, 72 | }, 73 | next: second, 74 | } 75 | return partitionListImpl{ 76 | numPartitions: 2, 77 | head: first, 78 | tail: second, 79 | } 80 | }(), 81 | target: &fakePartition{ 82 | minT: 2, 83 | }, 84 | wantPartitionList: partitionListImpl{ 85 | numPartitions: 1, 86 | head: &partitionNode{ 87 | val: &fakePartition{ 88 | minT: 1, 89 | }, 90 | }, 91 | tail: &partitionNode{ 92 | val: &fakePartition{ 93 | minT: 1, 94 | }, 95 | }, 96 | }, 97 | }, 98 | { 99 | name: "remove the middle node", 100 | partitionList: func() partitionListImpl { 101 | third := &partitionNode{ 102 | val: &fakePartition{ 103 | minT: 3, 104 | }, 105 | } 106 | second := &partitionNode{ 107 | val: &fakePartition{ 108 | minT: 2, 109 | }, 110 | next: third, 111 | } 112 | first := &partitionNode{ 113 | val: &fakePartition{ 114 | minT: 1, 115 | }, 116 | next: second, 117 | } 118 | return partitionListImpl{ 119 | numPartitions: 3, 120 | head: first, 121 | tail: third, 122 | } 123 | }(), 124 | target: &fakePartition{ 125 | minT: 2, 126 | }, 127 | wantPartitionList: partitionListImpl{ 128 | numPartitions: 2, 129 | head: &partitionNode{ 130 | val: &fakePartition{ 131 | minT: 1, 132 | }, 133 | next: &partitionNode{ 134 | val: &fakePartition{ 135 | minT: 3, 136 | }, 137 | }, 138 | }, 139 | tail: &partitionNode{ 140 | val: &fakePartition{ 141 | 
minT: 3, 142 | }, 143 | }, 144 | }, 145 | }, 146 | { 147 | name: "given node not found", 148 | partitionList: func() partitionListImpl { 149 | second := &partitionNode{ 150 | val: &fakePartition{ 151 | minT: 2, 152 | }, 153 | } 154 | 155 | first := &partitionNode{ 156 | val: &fakePartition{ 157 | minT: 1, 158 | }, 159 | next: second, 160 | } 161 | return partitionListImpl{ 162 | numPartitions: 2, 163 | head: first, 164 | tail: second, 165 | } 166 | }(), 167 | target: &fakePartition{ 168 | minT: 3, 169 | }, 170 | wantPartitionList: func() partitionListImpl { 171 | second := &partitionNode{ 172 | val: &fakePartition{ 173 | minT: 2, 174 | }, 175 | } 176 | 177 | first := &partitionNode{ 178 | val: &fakePartition{ 179 | minT: 1, 180 | }, 181 | next: second, 182 | } 183 | return partitionListImpl{ 184 | numPartitions: 2, 185 | head: first, 186 | tail: second, 187 | } 188 | }(), 189 | wantErr: true, 190 | }, 191 | } 192 | for _, tt := range tests { 193 | t.Run(tt.name, func(t *testing.T) { 194 | err := tt.partitionList.remove(tt.target) 195 | assert.Equal(t, tt.wantErr, err != nil) 196 | assert.Equal(t, tt.wantPartitionList, tt.partitionList) 197 | }) 198 | } 199 | } 200 | 201 | func Test_partitionList_Swap(t *testing.T) { 202 | tests := []struct { 203 | name string 204 | partitionList partitionListImpl 205 | old partition 206 | new partition 207 | wantErr bool 208 | wantPartitionList partitionListImpl 209 | }{ 210 | { 211 | name: "empty partition", 212 | partitionList: partitionListImpl{}, 213 | wantErr: true, 214 | }, 215 | { 216 | name: "swap the head node", 217 | partitionList: func() partitionListImpl { 218 | second := &partitionNode{ 219 | val: &fakePartition{ 220 | minT: 2, 221 | }, 222 | } 223 | 224 | first := &partitionNode{ 225 | val: &fakePartition{ 226 | minT: 1, 227 | }, 228 | next: second, 229 | } 230 | return partitionListImpl{ 231 | numPartitions: 2, 232 | head: first, 233 | tail: second, 234 | } 235 | }(), 236 | old: &fakePartition{ 237 | minT: 1, 238 | 
}, 239 | new: &fakePartition{ 240 | minT: 100, 241 | }, 242 | wantPartitionList: partitionListImpl{ 243 | numPartitions: 2, 244 | head: &partitionNode{ 245 | val: &fakePartition{ 246 | minT: 100, 247 | }, 248 | next: &partitionNode{ 249 | val: &fakePartition{ 250 | minT: 2, 251 | }, 252 | }, 253 | }, 254 | tail: &partitionNode{ 255 | val: &fakePartition{ 256 | minT: 2, 257 | }, 258 | }, 259 | }, 260 | }, 261 | { 262 | name: "swap the tail node", 263 | partitionList: func() partitionListImpl { 264 | second := &partitionNode{ 265 | val: &fakePartition{ 266 | minT: 2, 267 | }, 268 | } 269 | 270 | first := &partitionNode{ 271 | val: &fakePartition{ 272 | minT: 1, 273 | }, 274 | next: second, 275 | } 276 | return partitionListImpl{ 277 | numPartitions: 2, 278 | head: first, 279 | tail: second, 280 | } 281 | }(), 282 | old: &fakePartition{ 283 | minT: 2, 284 | }, 285 | new: &fakePartition{ 286 | minT: 100, 287 | }, 288 | wantPartitionList: partitionListImpl{ 289 | numPartitions: 2, 290 | head: &partitionNode{ 291 | val: &fakePartition{ 292 | minT: 1, 293 | }, 294 | next: &partitionNode{ 295 | val: &fakePartition{ 296 | minT: 100, 297 | }, 298 | }, 299 | }, 300 | tail: &partitionNode{ 301 | val: &fakePartition{ 302 | minT: 100, 303 | }, 304 | }, 305 | }, 306 | }, 307 | { 308 | name: "swap the middle node", 309 | partitionList: func() partitionListImpl { 310 | third := &partitionNode{ 311 | val: &fakePartition{ 312 | minT: 3, 313 | }, 314 | } 315 | second := &partitionNode{ 316 | val: &fakePartition{ 317 | minT: 2, 318 | }, 319 | next: third, 320 | } 321 | 322 | first := &partitionNode{ 323 | val: &fakePartition{ 324 | minT: 1, 325 | }, 326 | next: second, 327 | } 328 | return partitionListImpl{ 329 | numPartitions: 3, 330 | head: first, 331 | tail: third, 332 | } 333 | }(), 334 | old: &fakePartition{ 335 | minT: 2, 336 | }, 337 | new: &fakePartition{ 338 | minT: 100, 339 | }, 340 | wantPartitionList: partitionListImpl{ 341 | numPartitions: 3, 342 | head: &partitionNode{ 
343 | val: &fakePartition{ 344 | minT: 1, 345 | }, 346 | next: &partitionNode{ 347 | val: &fakePartition{ 348 | minT: 100, 349 | }, 350 | next: &partitionNode{ 351 | val: &fakePartition{ 352 | minT: 3, 353 | }, 354 | }, 355 | }, 356 | }, 357 | tail: &partitionNode{ 358 | val: &fakePartition{ 359 | minT: 3, 360 | }, 361 | }, 362 | }, 363 | }, 364 | { 365 | name: "given node not found", 366 | partitionList: func() partitionListImpl { 367 | second := &partitionNode{ 368 | val: &fakePartition{ 369 | minT: 2, 370 | }, 371 | } 372 | 373 | first := &partitionNode{ 374 | val: &fakePartition{ 375 | minT: 1, 376 | }, 377 | next: second, 378 | } 379 | return partitionListImpl{ 380 | numPartitions: 2, 381 | head: first, 382 | tail: second, 383 | } 384 | }(), 385 | old: &fakePartition{ 386 | minT: 100, 387 | }, 388 | wantPartitionList: partitionListImpl{ 389 | numPartitions: 2, 390 | head: &partitionNode{ 391 | val: &fakePartition{ 392 | minT: 1, 393 | }, 394 | next: &partitionNode{ 395 | val: &fakePartition{ 396 | minT: 2, 397 | }, 398 | }, 399 | }, 400 | tail: &partitionNode{ 401 | val: &fakePartition{ 402 | minT: 2, 403 | }, 404 | }, 405 | }, 406 | wantErr: true, 407 | }, 408 | } 409 | for _, tt := range tests { 410 | t.Run(tt.name, func(t *testing.T) { 411 | err := tt.partitionList.swap(tt.old, tt.new) 412 | assert.Equal(t, tt.wantErr, err != nil) 413 | assert.Equal(t, tt.wantPartitionList, tt.partitionList) 414 | }) 415 | } 416 | } 417 | -------------------------------------------------------------------------------- /storage.go: -------------------------------------------------------------------------------- 1 | package tstorage 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "fmt" 7 | "io" 8 | "io/fs" 9 | "os" 10 | "path/filepath" 11 | "regexp" 12 | "sort" 13 | "sync" 14 | "time" 15 | 16 | "github.com/nakabonne/tstorage/internal/cgroup" 17 | "github.com/nakabonne/tstorage/internal/timerpool" 18 | ) 19 | 20 | var ( 21 | ErrNoDataPoints = errors.New("no data points 
found")

	// Limit the concurrency for data ingestion to GOMAXPROCS, since this operation
	// is CPU bound, so there is no sense in running more than GOMAXPROCS concurrent
	// goroutines on data ingestion path.
	defaultWorkersLimit = cgroup.AvailableCPUs()

	// Matches on-disk partition directory names, which flushPartitions creates
	// as "p-<minTimestamp>-<maxTimestamp>".
	partitionDirRegex = regexp.MustCompile(`^p-.+`)
)

// TimestampPrecision represents precision of timestamps. See WithTimestampPrecision.
type TimestampPrecision string

const (
	Nanoseconds  TimestampPrecision = "ns"
	Microseconds TimestampPrecision = "us"
	Milliseconds TimestampPrecision = "ms"
	Seconds      TimestampPrecision = "s"

	defaultPartitionDuration  = 1 * time.Hour
	defaultRetention          = 336 * time.Hour
	defaultTimestampPrecision = Nanoseconds
	defaultWriteTimeout       = 30 * time.Second
	defaultWALBufferedSize    = 4096

	// Number of partitions at the head of the list that stay writable,
	// so slightly out-of-order points can still land in the previous partition.
	writablePartitionsNum = 2
	// How often the background goroutine scans for expired partitions.
	checkExpiredInterval = time.Hour

	// Sub-directory of the data path that holds WAL segment files.
	walDirName = "wal"
)

// Storage provides goroutine safe capabilities of insertion into and retrieval from the time-series storage.
type Storage interface {
	Reader
	// InsertRows ingests the given rows to the time-series storage.
	// If the timestamp is empty, it uses the machine's local timestamp in UTC.
	// The precision of timestamps is nanoseconds by default. It can be changed using WithTimestampPrecision.
	InsertRows(rows []Row) error
	// Close gracefully shuts down by flushing any unwritten data to the underlying disk partition.
	Close() error
}

// Reader provides reading access to time series data.
type Reader interface {
	// Select gives back a list of data points that matches a set of the given metric and
	// labels within the given start-end range. Keep in mind that start is inclusive, end is exclusive,
	// and both must be Unix timestamp. ErrNoDataPoints will be returned if no data points found.
	Select(metric string, labels []Label, start, end int64) (points []*DataPoint, err error)
}

// Row includes a data point along with properties to identify a kind of metrics.
type Row struct {
	// The unique name of metric.
	// This field must be set.
	Metric string
	// An optional key-value properties to further detailed identification.
	Labels []Label
	// This field must be set.
	DataPoint
}

// DataPoint represents a data point, the smallest unit of time series data.
type DataPoint struct {
	// The actual value. This field must be set.
	Value float64
	// Unix timestamp.
	Timestamp int64
}

// Option is an optional setting for NewStorage.
type Option func(*storage)

// WithDataPath specifies the path to directory that stores time-series data.
// Use this to make time-series data persistent on disk.
//
// Defaults to empty string which means no data will get persisted.
func WithDataPath(dataPath string) Option {
	return func(s *storage) {
		s.dataPath = dataPath
	}
}

// WithPartitionDuration specifies the timestamp range of partitions.
// Once it exceeds the given time range, the new partition gets inserted.
//
// A partition is a chunk of time-series data with the timestamp range.
// It acts as a fully independent database containing all data
// points for its time range.
//
// Defaults to 1h.
func WithPartitionDuration(duration time.Duration) Option {
	return func(s *storage) {
		s.partitionDuration = duration
	}
}

// WithRetention specifies when to remove old data.
// Data points will get automatically removed from the disk after a
// specified period of time after a disk partition was created.
// Defaults to 14d.
func WithRetention(retention time.Duration) Option {
	return func(s *storage) {
		s.retention = retention
	}
}

// WithTimestampPrecision specifies the precision of timestamps to be used by all operations.
//
// Defaults to Nanoseconds.
func WithTimestampPrecision(precision TimestampPrecision) Option {
	return func(s *storage) {
		s.timestampPrecision = precision
	}
}

// WithWriteTimeout specifies the timeout to wait when workers are busy.
//
// The storage limits the number of concurrent goroutines to prevent from out of memory
// errors and CPU trashing even if too many goroutines attempt to write.
//
// Defaults to 30s.
func WithWriteTimeout(timeout time.Duration) Option {
	return func(s *storage) {
		s.writeTimeout = timeout
	}
}

// WithLogger specifies the logger to emit verbose output.
//
// Defaults to a logger implementation that does nothing.
func WithLogger(logger Logger) Option {
	return func(s *storage) {
		s.logger = logger
	}
}

// WithWAL specifies the buffered byte size before flushing a WAL file.
// The larger the size, the less frequently the file is written and more write performance at the expense of durability.
// Giving 0 means it writes to a file whenever data point comes in.
// Giving -1 disables using WAL.
//
// Defaults to 4096.
func WithWALBufferedSize(size int) Option {
	return func(s *storage) {
		s.walBufferedSize = size
	}
}

// NewStorage gives back a new storage, which stores time-series data in the process memory by default.
//
// Give the WithDataPath option for running as a on-disk storage. Specify a directory with data already exists,
// then it will be read as the initial data.
173 | func NewStorage(opts ...Option) (Storage, error) { 174 | s := &storage{ 175 | partitionList: newPartitionList(), 176 | workersLimitCh: make(chan struct{}, defaultWorkersLimit), 177 | partitionDuration: defaultPartitionDuration, 178 | retention: defaultRetention, 179 | timestampPrecision: defaultTimestampPrecision, 180 | writeTimeout: defaultWriteTimeout, 181 | walBufferedSize: defaultWALBufferedSize, 182 | wal: &nopWAL{}, 183 | logger: &nopLogger{}, 184 | doneCh: make(chan struct{}, 0), 185 | } 186 | for _, opt := range opts { 187 | opt(s) 188 | } 189 | 190 | if s.inMemoryMode() { 191 | s.newPartition(nil, false) 192 | return s, nil 193 | } 194 | 195 | if err := os.MkdirAll(s.dataPath, fs.ModePerm); err != nil { 196 | return nil, fmt.Errorf("failed to make data directory %s: %w", s.dataPath, err) 197 | } 198 | 199 | walDir := filepath.Join(s.dataPath, walDirName) 200 | if s.walBufferedSize >= 0 { 201 | wal, err := newDiskWAL(walDir, s.walBufferedSize) 202 | if err != nil { 203 | return nil, err 204 | } 205 | s.wal = wal 206 | } 207 | 208 | // Read existent partitions from the disk. 
209 | dirs, err := os.ReadDir(s.dataPath) 210 | if err != nil { 211 | return nil, fmt.Errorf("failed to open data directory: %w", err) 212 | } 213 | if len(dirs) == 0 { 214 | s.newPartition(nil, false) 215 | return s, nil 216 | } 217 | isPartitionDir := func(f fs.DirEntry) bool { 218 | return f.IsDir() && partitionDirRegex.MatchString(f.Name()) 219 | } 220 | partitions := make([]partition, 0, len(dirs)) 221 | for _, e := range dirs { 222 | if !isPartitionDir(e) { 223 | continue 224 | } 225 | path := filepath.Join(s.dataPath, e.Name()) 226 | part, err := openDiskPartition(path, s.retention) 227 | if errors.Is(err, ErrNoDataPoints) { 228 | continue 229 | } 230 | if errors.Is(err, errInvalidPartition) { 231 | // It should be recovered by WAL 232 | continue 233 | } 234 | if err != nil { 235 | return nil, fmt.Errorf("failed to open disk partition for %s: %w", path, err) 236 | } 237 | partitions = append(partitions, part) 238 | } 239 | sort.Slice(partitions, func(i, j int) bool { 240 | return partitions[i].minTimestamp() < partitions[j].minTimestamp() 241 | }) 242 | for _, p := range partitions { 243 | s.newPartition(p, false) 244 | } 245 | // Start WAL recovery if there is. 246 | if err := s.recoverWAL(walDir); err != nil { 247 | return nil, fmt.Errorf("failed to recover WAL: %w", err) 248 | } 249 | s.newPartition(nil, false) 250 | 251 | // periodically check and permanently remove expired partitions. 
252 | go func() { 253 | ticker := time.NewTicker(checkExpiredInterval) 254 | defer ticker.Stop() 255 | for { 256 | select { 257 | case <-s.doneCh: 258 | return 259 | case <-ticker.C: 260 | err := s.removeExpiredPartitions() 261 | if err != nil { 262 | s.logger.Printf("%v\n", err) 263 | } 264 | } 265 | } 266 | }() 267 | return s, nil 268 | } 269 | 270 | type storage struct { 271 | partitionList partitionList 272 | 273 | walBufferedSize int 274 | wal wal 275 | partitionDuration time.Duration 276 | retention time.Duration 277 | timestampPrecision TimestampPrecision 278 | dataPath string 279 | writeTimeout time.Duration 280 | 281 | logger Logger 282 | workersLimitCh chan struct{} 283 | // wg must be incremented to guarantee all writes are done gracefully. 284 | wg sync.WaitGroup 285 | 286 | doneCh chan struct{} 287 | } 288 | 289 | func (s *storage) InsertRows(rows []Row) error { 290 | s.wg.Add(1) 291 | defer s.wg.Done() 292 | 293 | insert := func() error { 294 | defer func() { <-s.workersLimitCh }() 295 | if err := s.ensureActiveHead(); err != nil { 296 | return err 297 | } 298 | iterator := s.partitionList.newIterator() 299 | n := s.partitionList.size() 300 | rowsToInsert := rows 301 | // Starting at the head partition, try to insert rows, and loop to insert outdated rows 302 | // into older partitions. Any rows more than `writablePartitionsNum` partitions out 303 | // of date are dropped. 304 | for i := 0; i < n && i < writablePartitionsNum; i++ { 305 | if len(rowsToInsert) == 0 { 306 | break 307 | } 308 | if !iterator.next() { 309 | break 310 | } 311 | outdatedRows, err := iterator.value().insertRows(rowsToInsert) 312 | if err != nil { 313 | return fmt.Errorf("failed to insert rows: %w", err) 314 | } 315 | rowsToInsert = outdatedRows 316 | } 317 | return nil 318 | } 319 | 320 | // Limit the number of concurrent goroutines to prevent from out of memory 321 | // errors and CPU trashing even if too many goroutines attempt to write. 
322 | select { 323 | case s.workersLimitCh <- struct{}{}: 324 | return insert() 325 | default: 326 | } 327 | 328 | // Seems like all workers are busy; wait for up to writeTimeout 329 | 330 | t := timerpool.Get(s.writeTimeout) 331 | select { 332 | case s.workersLimitCh <- struct{}{}: 333 | timerpool.Put(t) 334 | return insert() 335 | case <-t.C: 336 | timerpool.Put(t) 337 | return fmt.Errorf("failed to write a data point in %s, since it is overloaded with %d concurrent writers", 338 | s.writeTimeout, defaultWorkersLimit) 339 | } 340 | } 341 | 342 | // ensureActiveHead ensures the head of partitionList is an active partition. 343 | // If none, it creates a new one. 344 | func (s *storage) ensureActiveHead() error { 345 | head := s.partitionList.getHead() 346 | if head != nil && head.active() { 347 | return nil 348 | } 349 | 350 | // All partitions seems to be inactive so add a new partition to the list. 351 | if err := s.newPartition(nil, true); err != nil { 352 | return err 353 | } 354 | go func() { 355 | if err := s.flushPartitions(); err != nil { 356 | s.logger.Printf("failed to flush in-memory partitions: %v", err) 357 | } 358 | }() 359 | return nil 360 | } 361 | 362 | func (s *storage) Select(metric string, labels []Label, start, end int64) ([]*DataPoint, error) { 363 | if metric == "" { 364 | return nil, fmt.Errorf("metric must be set") 365 | } 366 | if start >= end { 367 | return nil, fmt.Errorf("the given start is greater than end") 368 | } 369 | points := make([]*DataPoint, 0) 370 | 371 | // Iterate over all partitions from the newest one. 372 | iterator := s.partitionList.newIterator() 373 | for iterator.next() { 374 | part := iterator.value() 375 | if part == nil { 376 | return nil, fmt.Errorf("unexpected empty partition found") 377 | } 378 | if part.minTimestamp() == 0 { 379 | // Skip the partition that has no points. 
380 | continue 381 | } 382 | if part.maxTimestamp() < start { 383 | // No need to keep going anymore 384 | break 385 | } 386 | if part.minTimestamp() > end { 387 | continue 388 | } 389 | ps, err := part.selectDataPoints(metric, labels, start, end) 390 | if errors.Is(err, ErrNoDataPoints) { 391 | continue 392 | } 393 | if err != nil { 394 | return nil, fmt.Errorf("failed to select data points: %w", err) 395 | } 396 | // in order to keep the order in ascending. 397 | points = append(ps, points...) 398 | } 399 | if len(points) == 0 { 400 | return nil, ErrNoDataPoints 401 | } 402 | return points, nil 403 | } 404 | 405 | func (s *storage) Close() error { 406 | s.wg.Wait() 407 | close(s.doneCh) 408 | if err := s.wal.flush(); err != nil { 409 | return fmt.Errorf("failed to flush buffered WAL: %w", err) 410 | } 411 | 412 | // TODO: Prevent from new goroutines calling InsertRows(), for graceful shutdown. 413 | 414 | // Make all writable partitions read-only by inserting as same number of those. 415 | for i := 0; i < writablePartitionsNum; i++ { 416 | if err := s.newPartition(nil, true); err != nil { 417 | return err 418 | } 419 | } 420 | if err := s.flushPartitions(); err != nil { 421 | return fmt.Errorf("failed to close storage: %w", err) 422 | } 423 | if err := s.removeExpiredPartitions(); err != nil { 424 | return fmt.Errorf("failed to remove expired partitions: %w", err) 425 | } 426 | // All partitions have been flushed, so WAL isn't needed anymore. 
	if err := s.wal.removeAll(); err != nil {
		return fmt.Errorf("failed to remove WAL: %w", err)
	}
	return nil
}

// newPartition prepends the given partition to the list; a nil p means a fresh
// in-memory partition. When punctuateWal is true, the WAL is punctuated so the
// new partition writes into a fresh segment; that is the only error source.
func (s *storage) newPartition(p partition, punctuateWal bool) error {
	if p == nil {
		p = newMemoryPartition(s.wal, s.partitionDuration, s.timestampPrecision)
	}
	s.partitionList.insert(p)
	if punctuateWal {
		return s.wal.punctuate()
	}
	return nil
}

// flushPartitions persists all in-memory partitions ready to persisted.
// For the in-memory mode, just removes it from the partition list.
func (s *storage) flushPartitions() error {
	// Keep the first two partitions as is even if they are inactive,
	// to accept out-of-order data points.
	i := 0
	iterator := s.partitionList.newIterator()
	for iterator.next() {
		if i < writablePartitionsNum {
			i++
			continue
		}
		part := iterator.value()
		if part == nil {
			return fmt.Errorf("unexpected empty partition found")
		}
		// Disk partitions are already persisted; only memory partitions need work.
		memPart, ok := part.(*memoryPartition)
		if !ok {
			continue
		}

		if s.inMemoryMode() {
			// Nothing to persist: dropping the partition is the whole flush.
			if err := s.partitionList.remove(part); err != nil {
				return fmt.Errorf("failed to remove partition: %w", err)
			}
			continue
		}

		// Start swapping in-memory partition for disk one.
		// The disk partition will place at where in-memory one existed.

		dir := filepath.Join(s.dataPath, fmt.Sprintf("p-%d-%d", memPart.minTimestamp(), memPart.maxTimestamp()))
		if err := s.flush(dir, memPart); err != nil {
			return fmt.Errorf("failed to compact memory partition into %s: %w", dir, err)
		}
		newPart, err := openDiskPartition(dir, s.retention)
		if errors.Is(err, ErrNoDataPoints) {
			// The partition held nothing worth persisting; just drop it.
			if err := s.partitionList.remove(part); err != nil {
				return fmt.Errorf("failed to remove partition: %w", err)
			}
			continue
		}
		if err != nil {
			return fmt.Errorf("failed to generate disk partition for %s: %w", dir, err)
		}
		if err := s.partitionList.swap(part, newPart); err != nil {
			return fmt.Errorf("failed to swap partitions: %w", err)
		}

		// The flushed partition's WAL segment is no longer needed.
		// NOTE(review): this assumes the oldest WAL segment corresponds to the
		// partition just flushed — confirm against the WAL punctuation order.
		if err := s.wal.removeOldest(); err != nil {
			return fmt.Errorf("failed to remove oldest WAL segment: %w", err)
		}
	}
	return nil
}

// flush compacts the data points in the given partition and flushes them to the given directory.
501 | func (s *storage) flush(dirPath string, m *memoryPartition) error { 502 | if dirPath == "" { 503 | return fmt.Errorf("dir path is required") 504 | } 505 | 506 | if err := os.MkdirAll(dirPath, fs.ModePerm); err != nil { 507 | return fmt.Errorf("failed to make directory %q: %w", dirPath, err) 508 | } 509 | 510 | f, err := os.Create(filepath.Join(dirPath, dataFileName)) 511 | if err != nil { 512 | return fmt.Errorf("failed to create file %q: %w", dirPath, err) 513 | } 514 | defer f.Close() 515 | encoder := newSeriesEncoder(f) 516 | 517 | metrics := map[string]diskMetric{} 518 | m.metrics.Range(func(key, value interface{}) bool { 519 | mt, ok := value.(*memoryMetric) 520 | if !ok { 521 | s.logger.Printf("unknown value found\n") 522 | return false 523 | } 524 | offset, err := f.Seek(0, io.SeekCurrent) 525 | if err != nil { 526 | s.logger.Printf("failed to set file offset of metric %q: %v\n", mt.name, err) 527 | return false 528 | } 529 | 530 | if err := mt.encodeAllPoints(encoder); err != nil { 531 | s.logger.Printf("failed to encode a data point that metric is %q: %v\n", mt.name, err) 532 | return false 533 | } 534 | 535 | if err := encoder.flush(); err != nil { 536 | s.logger.Printf("failed to flush data points that metric is %q: %v\n", mt.name, err) 537 | return false 538 | } 539 | 540 | totalNumPoints := mt.size + int64(len(mt.outOfOrderPoints)) 541 | metrics[mt.name] = diskMetric{ 542 | Name: mt.name, 543 | Offset: offset, 544 | MinTimestamp: mt.minTimestamp, 545 | MaxTimestamp: mt.maxTimestamp, 546 | NumDataPoints: totalNumPoints, 547 | } 548 | return true 549 | }) 550 | 551 | b, err := json.Marshal(&meta{ 552 | MinTimestamp: m.minTimestamp(), 553 | MaxTimestamp: m.maxTimestamp(), 554 | NumDataPoints: m.size(), 555 | Metrics: metrics, 556 | CreatedAt: time.Now(), 557 | }) 558 | if err != nil { 559 | return fmt.Errorf("failed to encode metadata: %w", err) 560 | } 561 | 562 | // It should write the meta file at last because what valid meta file exists proves 
the disk partition is valid. 563 | metaPath := filepath.Join(dirPath, metaFileName) 564 | if err := os.WriteFile(metaPath, b, fs.ModePerm); err != nil { 565 | return fmt.Errorf("failed to write metadata to %s: %w", metaPath, err) 566 | } 567 | return nil 568 | } 569 | 570 | func (s *storage) removeExpiredPartitions() error { 571 | expiredList := make([]partition, 0) 572 | iterator := s.partitionList.newIterator() 573 | for iterator.next() { 574 | part := iterator.value() 575 | if part == nil { 576 | return fmt.Errorf("unexpected nil partition found") 577 | } 578 | if part.expired() { 579 | expiredList = append(expiredList, part) 580 | } 581 | } 582 | 583 | for i := range expiredList { 584 | if err := s.partitionList.remove(expiredList[i]); err != nil { 585 | return fmt.Errorf("failed to remove expired partition") 586 | } 587 | } 588 | return nil 589 | } 590 | 591 | // recoverWAL inserts all records within the given wal, and then removes all WAL segment files. 592 | func (s *storage) recoverWAL(walDir string) error { 593 | reader, err := newDiskWALReader(walDir) 594 | if errors.Is(err, os.ErrNotExist) { 595 | return nil 596 | } 597 | if err != nil { 598 | return err 599 | } 600 | 601 | if err := reader.readAll(); err != nil { 602 | return fmt.Errorf("failed to read WAL: %w", err) 603 | } 604 | 605 | if len(reader.rowsToInsert) == 0 { 606 | return nil 607 | } 608 | if err := s.InsertRows(reader.rowsToInsert); err != nil { 609 | return fmt.Errorf("failed to insert rows recovered from WAL: %w", err) 610 | } 611 | return s.wal.refresh() 612 | } 613 | 614 | func (s *storage) inMemoryMode() bool { 615 | return s.dataPath == "" 616 | } 617 | -------------------------------------------------------------------------------- /storage_benchmark_test.go: -------------------------------------------------------------------------------- 1 | package tstorage 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/require" 7 | ) 8 | 9 | func BenchmarkStorage_InsertRows(b 
*testing.B) { 10 | storage, err := NewStorage() 11 | require.NoError(b, err) 12 | b.ResetTimer() 13 | for i := 1; i < b.N; i++ { 14 | storage.InsertRows([]Row{ 15 | {Metric: "metric1", DataPoint: DataPoint{Timestamp: int64(i), Value: 0.1}}, 16 | }) 17 | } 18 | } 19 | 20 | // Select data points among a thousand data in memory 21 | func BenchmarkStorage_SelectAmongThousandPoints(b *testing.B) { 22 | storage, err := NewStorage() 23 | require.NoError(b, err) 24 | for i := 1; i < 1000; i++ { 25 | storage.InsertRows([]Row{ 26 | {Metric: "metric1", DataPoint: DataPoint{Timestamp: int64(i), Value: 0.1}}, 27 | }) 28 | } 29 | b.ResetTimer() 30 | for i := 1; i < b.N; i++ { 31 | _, _ = storage.Select("metric1", nil, 10, 100) 32 | } 33 | } 34 | 35 | // Select data points among a million data in memory 36 | func BenchmarkStorage_SelectAmongMillionPoints(b *testing.B) { 37 | storage, err := NewStorage() 38 | require.NoError(b, err) 39 | for i := 1; i < 1000000; i++ { 40 | storage.InsertRows([]Row{ 41 | {Metric: "metric1", DataPoint: DataPoint{Timestamp: int64(i), Value: 0.1}}, 42 | }) 43 | } 44 | b.ResetTimer() 45 | for i := 1; i < b.N; i++ { 46 | _, _ = storage.Select("metric1", nil, 10, 100) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /storage_examples_test.go: -------------------------------------------------------------------------------- 1 | package tstorage_test 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "os" 7 | "sync" 8 | "time" 9 | 10 | "github.com/nakabonne/tstorage" 11 | ) 12 | 13 | func ExampleNewStorage_withDataPath() { 14 | // It will make time-series data persistent under "./data". 
	storage, err := tstorage.NewStorage(
		tstorage.WithDataPath("./data"),
	)
	if err != nil {
		panic(err)
	}
	storage.Close()
}

func ExampleNewStorage_withPartitionDuration() {
	storage, err := tstorage.NewStorage(
		tstorage.WithPartitionDuration(30*time.Minute),
		tstorage.WithTimestampPrecision(tstorage.Seconds),
	)
	if err != nil {
		panic(err)
	}
	defer storage.Close()
}

func ExampleStorage_InsertRows() {
	storage, err := tstorage.NewStorage(
		tstorage.WithTimestampPrecision(tstorage.Seconds),
	)
	if err != nil {
		panic(err)
	}
	defer func() {
		if err := storage.Close(); err != nil {
			panic(err)
		}
	}()
	err = storage.InsertRows([]tstorage.Row{
		{Metric: "metric1", DataPoint: tstorage.DataPoint{Timestamp: 1600000000, Value: 0.1}},
	})
	if err != nil {
		panic(err)
	}
	// Select is end-exclusive, so the range must extend one past the target timestamp.
	points, err := storage.Select("metric1", nil, 1600000000, 1600000001)
	if err != nil {
		panic(err)
	}
	for _, p := range points {
		fmt.Printf("timestamp: %v, value: %v\n", p.Timestamp, p.Value)
	}
	// Output:
	// timestamp: 1600000000, value: 0.1
}

// simulates writing and reading in concurrent.
func ExampleStorage_InsertRows_Select_concurrent() {
	storage, err := tstorage.NewStorage(
		tstorage.WithPartitionDuration(5*time.Hour),
		tstorage.WithTimestampPrecision(tstorage.Seconds),
	)
	if err != nil {
		panic(err)
	}
	defer func() {
		if err := storage.Close(); err != nil {
			panic(err)
		}
	}()

	var wg sync.WaitGroup

	// Start write workers that insert 10000 times in concurrent, as fast as possible.
	wg.Add(1)
	go func() {
		defer wg.Done()
		for i := int64(1600000000); i < 1600010000; i++ {
			wg.Add(1)
			go func(timestamp int64) {
				defer wg.Done()
				if err := storage.InsertRows([]tstorage.Row{
					{Metric: "metric1", DataPoint: tstorage.DataPoint{Timestamp: timestamp}},
				}); err != nil {
					panic(err)
				}
			}(i)
		}
	}()

	// Start read workers that read 100 times in concurrent, as fast as possible.
	wg.Add(1)
	go func() {
		defer wg.Done()
		for i := 0; i < 100; i++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				points, err := storage.Select("metric1", nil, 1600000000, 1600010000)
				// Readers may race ahead of the writers, so "no data yet" is expected.
				if errors.Is(err, tstorage.ErrNoDataPoints) {
					return
				}
				if err != nil {
					panic(err)
				}
				for _, p := range points {
					_ = p.Timestamp
					_ = p.Value
				}
			}()
		}
	}()
	wg.Wait()
}

// NOTE(review): despite the name, this example configures a data path (a temp
// dir) — the reads happen before any flush, i.e. while data is still in memory.
func ExampleStorage_Select_from_memory() {
	tmpDir, err := os.MkdirTemp("", "tstorage-example")
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(tmpDir)

	storage, err := tstorage.NewStorage(
		tstorage.WithDataPath(tmpDir),
		tstorage.WithPartitionDuration(2*time.Hour),
		tstorage.WithTimestampPrecision(tstorage.Seconds),
	)
	if err != nil {
		panic(err)
	}
	defer func() {
		if err := storage.Close(); err != nil {
			panic(err)
		}
	}()

	// Ingest data points of metric1
	for timestamp := int64(1600000000); timestamp < 1600000050; timestamp++ {
		err := storage.InsertRows([]tstorage.Row{
			{Metric: "metric1", DataPoint: tstorage.DataPoint{Timestamp: timestamp, Value: 0.1}},
		})
		if err != nil {
			panic(err)
		}
	}
	// Ingest data points of metric2
	for timestamp := int64(1600000050); timestamp < 1600000100; timestamp++ {
		err := storage.InsertRows([]tstorage.Row{
			{Metric: "metric2", DataPoint: tstorage.DataPoint{Timestamp: timestamp, Value: 0.2}},
		})
		if err != nil {
			panic(err)
		}
	}

	points, err := storage.Select("metric1", nil, 1600000000, 1600000050)
	if errors.Is(err, tstorage.ErrNoDataPoints) {
		return
	}
	if err != nil {
		panic(err)
	}
	fmt.Println("Data points of metric1:")
	for _, p := range points {
		fmt.Printf("Timestamp: %v, Value: %v\n", p.Timestamp, p.Value)
	}

	points2, err := storage.Select("metric2", nil, 1600000050, 1600000100)
	if errors.Is(err, tstorage.ErrNoDataPoints) {
		return
	}
	if err != nil {
		panic(err)
	}
	fmt.Println("Data points of metric2:")
	for _, p := range points2 {
		fmt.Printf("Timestamp: %v, Value: %v\n", p.Timestamp, p.Value)
	}
	// Output:
	//Data points of metric1:
	//Timestamp: 1600000000, Value: 0.1
	//Timestamp: 1600000001, Value: 0.1
	//Timestamp: 1600000002, Value: 0.1
	//Timestamp: 1600000003, Value: 0.1
	//Timestamp: 1600000004, Value: 0.1
	//Timestamp: 1600000005, Value: 0.1
	//Timestamp: 1600000006, Value: 0.1
	//Timestamp: 1600000007, Value: 0.1
	//Timestamp: 1600000008, Value: 0.1
	//Timestamp: 1600000009, Value: 0.1
	//Timestamp: 1600000010, Value: 0.1
	//Timestamp: 1600000011, Value: 0.1
	//Timestamp: 1600000012, Value: 0.1
	//Timestamp: 1600000013, Value: 0.1
	//Timestamp: 1600000014, Value: 0.1
	//Timestamp: 1600000015, Value: 0.1
	//Timestamp: 1600000016, Value: 0.1
	//Timestamp: 1600000017, Value: 0.1
	//Timestamp: 1600000018, Value: 0.1
	//Timestamp: 1600000019, Value: 0.1
	//Timestamp: 1600000020, Value: 0.1
	//Timestamp: 1600000021, Value: 0.1
	//Timestamp: 1600000022, Value: 0.1
	//Timestamp: 1600000023, Value: 0.1
	//Timestamp: 1600000024, Value: 0.1
	//Timestamp: 1600000025, Value: 0.1
	//Timestamp: 1600000026, Value: 0.1
	//Timestamp: 1600000027, Value: 0.1
	//Timestamp: 1600000028, Value: 0.1
	//Timestamp: 1600000029, Value: 0.1
	//Timestamp: 1600000030, Value: 0.1
	//Timestamp: 1600000031, Value: 0.1
	//Timestamp: 1600000032, Value: 0.1
	//Timestamp: 1600000033, Value: 0.1
	//Timestamp: 1600000034, Value: 0.1
	//Timestamp: 1600000035, Value: 0.1
	//Timestamp: 1600000036, Value: 0.1
	//Timestamp: 1600000037, Value: 0.1
	//Timestamp: 1600000038, Value: 0.1
	//Timestamp: 1600000039, Value: 0.1
	//Timestamp: 1600000040, Value: 0.1
	//Timestamp: 1600000041, Value: 0.1
	//Timestamp: 1600000042, Value: 0.1
	//Timestamp: 1600000043, Value: 0.1
	//Timestamp: 1600000044, Value: 0.1
	//Timestamp: 1600000045, Value: 0.1
	//Timestamp: 1600000046, Value: 0.1
	//Timestamp: 1600000047, Value: 0.1
	//Timestamp: 1600000048, Value: 0.1
	//Timestamp: 1600000049, Value: 0.1
	//Data points of metric2:
	//Timestamp: 1600000050, Value: 0.2
	//Timestamp: 1600000051, Value: 0.2
	//Timestamp: 1600000052, Value: 0.2
	//Timestamp: 1600000053, Value: 0.2
	//Timestamp: 1600000054, Value: 0.2
	//Timestamp: 1600000055, Value: 0.2
	//Timestamp: 1600000056, Value: 0.2
	//Timestamp: 1600000057, Value: 0.2
	//Timestamp: 1600000058, Value: 0.2
	//Timestamp: 1600000059, Value: 0.2
	//Timestamp: 1600000060, Value: 0.2
	//Timestamp: 1600000061, Value: 0.2
	//Timestamp: 1600000062, Value: 0.2
	//Timestamp: 1600000063, Value: 0.2
	//Timestamp: 1600000064, Value: 0.2
	//Timestamp: 1600000065, Value: 0.2
	//Timestamp: 1600000066, Value: 0.2
	//Timestamp: 1600000067, Value: 0.2
	//Timestamp: 1600000068, Value: 0.2
	//Timestamp: 1600000069, Value: 0.2
	//Timestamp: 1600000070, Value: 0.2
	//Timestamp: 1600000071, Value: 0.2
	//Timestamp: 1600000072, Value: 0.2
	//Timestamp: 1600000073, Value: 0.2
	//Timestamp: 1600000074, Value: 0.2
	//Timestamp: 1600000075, Value: 0.2
	//Timestamp: 1600000076, Value: 0.2
	//Timestamp: 1600000077, Value: 0.2
	//Timestamp: 1600000078, Value: 0.2
	//Timestamp: 1600000079, Value: 0.2
	//Timestamp: 1600000080, Value: 0.2
	//Timestamp: 1600000081, Value: 0.2
	//Timestamp: 1600000082, Value: 0.2
	//Timestamp: 1600000083, Value: 0.2
	//Timestamp: 1600000084, Value: 0.2
	//Timestamp: 1600000085, Value: 0.2
	//Timestamp: 1600000086, Value: 0.2
	//Timestamp: 1600000087, Value: 0.2
	//Timestamp: 1600000088, Value: 0.2
	//Timestamp: 1600000089, Value: 0.2
	//Timestamp: 1600000090, Value: 0.2
	//Timestamp: 1600000091, Value: 0.2
	//Timestamp: 1600000092, Value: 0.2
	//Timestamp: 1600000093, Value: 0.2
	//Timestamp: 1600000094, Value: 0.2
	//Timestamp: 1600000095, Value: 0.2
	//Timestamp: 1600000096, Value: 0.2
	//Timestamp: 1600000097, Value: 0.2
	//Timestamp: 1600000098, Value: 0.2
	//Timestamp: 1600000099, Value: 0.2
}

// simulates writing and reading on disk.
// ExampleStorage_Select_from_disk ingests 50 data points each for two metrics,
// flushes everything to disk by closing the storage, then re-opens the storage
// from the persisted data and reads both series back with Select.
func ExampleStorage_Select_from_disk() {
	tmpDir, err := os.MkdirTemp("", "tstorage-example")
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(tmpDir)

	storage, err := tstorage.NewStorage(
		tstorage.WithDataPath(tmpDir),
		tstorage.WithPartitionDuration(100*time.Second),
		tstorage.WithTimestampPrecision(tstorage.Seconds),
	)
	if err != nil {
		panic(err)
	}

	// Ingest data points
	for timestamp := int64(1600000000); timestamp < 1600000050; timestamp++ {
		err := storage.InsertRows([]tstorage.Row{
			{Metric: "metric1", DataPoint: tstorage.DataPoint{Timestamp: timestamp, Value: 0.1}},
		})
		if err != nil {
			panic(err)
		}
		err = storage.InsertRows([]tstorage.Row{
			{Metric: "metric2", DataPoint: tstorage.DataPoint{Timestamp: timestamp, Value: 0.2}},
		})
		if err != nil {
			panic(err)
		}
	}
	// Flush all data points
	if err := storage.Close(); err != nil {
		panic(err)
	}

	// Re-open storage from the persisted data
	// NOTE(review): the partition duration here (10s) differs from the one used
	// at write time (100s) — presumably to show that reading persisted
	// partitions is unaffected by this option; confirm this is intentional.
	storage, err = tstorage.NewStorage(
		tstorage.WithDataPath(tmpDir),
		tstorage.WithPartitionDuration(10*time.Second),
		tstorage.WithTimestampPrecision(tstorage.Seconds),
	)
	if err != nil {
		panic(err)
	}
	defer func() {
		if err := storage.Close(); err != nil {
			panic(err)
		}
	}()

	points, err := storage.Select("metric1", nil, 1600000000, 1600000050)
	if errors.Is(err, tstorage.ErrNoDataPoints) {
		return
	}
	if err != nil {
		panic(err)
	}
	fmt.Println("Data points of metric1:")
	for _, p := range points {
		fmt.Printf("Timestamp: %v, Value: %v\n", p.Timestamp, p.Value)
	}

	points2, err := storage.Select("metric2", nil, 1600000000, 1600000050)
	if errors.Is(err, tstorage.ErrNoDataPoints) {
		return
	}
	if err != nil {
		panic(err)
	}
	fmt.Println("Data points of metric2:")
	for _, p := range points2 {
		fmt.Printf("Timestamp: %v, Value: %v\n", p.Timestamp, p.Value)
	}
	// Output:
	//Data points of metric1:
	//Timestamp: 1600000000, Value: 0.1
	//Timestamp: 1600000001, Value: 0.1
	//Timestamp: 1600000002, Value: 0.1
	//Timestamp: 1600000003, Value: 0.1
	//Timestamp: 1600000004, Value: 0.1
	//Timestamp: 1600000005, Value: 0.1
	//Timestamp: 1600000006, Value: 0.1
	//Timestamp: 1600000007, Value: 0.1
	//Timestamp: 1600000008, Value: 0.1
	//Timestamp: 1600000009, Value: 0.1
	//Timestamp: 1600000010, Value: 0.1
	//Timestamp: 1600000011, Value: 0.1
	//Timestamp: 1600000012, Value: 0.1
	//Timestamp: 1600000013, Value: 0.1
	//Timestamp: 1600000014, Value: 0.1
	//Timestamp: 1600000015, Value: 0.1
	//Timestamp: 1600000016, Value: 0.1
	//Timestamp: 1600000017, Value: 0.1
	//Timestamp: 1600000018, Value: 0.1
	//Timestamp: 1600000019, Value: 0.1
	//Timestamp: 1600000020, Value: 0.1
	//Timestamp: 1600000021, Value: 0.1
	//Timestamp: 1600000022, Value: 0.1
	//Timestamp: 1600000023, Value: 0.1
	//Timestamp: 1600000024, Value: 0.1
	//Timestamp: 1600000025, Value: 0.1
	//Timestamp: 1600000026, Value: 0.1
	//Timestamp: 1600000027, Value: 0.1
	//Timestamp: 1600000028, Value: 0.1
	//Timestamp: 1600000029, Value: 0.1
	//Timestamp: 1600000030, Value: 0.1
	//Timestamp: 1600000031, Value: 0.1
	//Timestamp: 1600000032, Value: 0.1
	//Timestamp: 1600000033, Value: 0.1
	//Timestamp: 1600000034, Value: 0.1
	//Timestamp: 1600000035, Value: 0.1
	//Timestamp: 1600000036, Value: 0.1
	//Timestamp: 1600000037, Value: 0.1
	//Timestamp: 1600000038, Value: 0.1
	//Timestamp: 1600000039, Value: 0.1
	//Timestamp: 1600000040, Value: 0.1
	//Timestamp: 1600000041, Value: 0.1
	//Timestamp: 1600000042, Value: 0.1
	//Timestamp: 1600000043, Value: 0.1
	//Timestamp: 1600000044, Value: 0.1
	//Timestamp: 1600000045, Value: 0.1
	//Timestamp: 1600000046, Value: 0.1
	//Timestamp: 1600000047, Value: 0.1
	//Timestamp: 1600000048, Value: 0.1
	//Timestamp: 1600000049, Value: 0.1
	//Data points of metric2:
	//Timestamp: 1600000000, Value: 0.2
	//Timestamp: 1600000001, Value: 0.2
	//Timestamp: 1600000002, Value: 0.2
	//Timestamp: 1600000003, Value: 0.2
	//Timestamp: 1600000004, Value: 0.2
	//Timestamp: 1600000005, Value: 0.2
	//Timestamp: 1600000006, Value: 0.2
	//Timestamp: 1600000007, Value: 0.2
	//Timestamp: 1600000008, Value: 0.2
	//Timestamp: 1600000009, Value: 0.2
	//Timestamp: 1600000010, Value: 0.2
	//Timestamp: 1600000011, Value: 0.2
	//Timestamp: 1600000012, Value: 0.2
	//Timestamp: 1600000013, Value: 0.2
	//Timestamp: 1600000014, Value: 0.2
	//Timestamp: 1600000015, Value: 0.2
	//Timestamp: 1600000016, Value: 0.2
	//Timestamp: 1600000017, Value: 0.2
	//Timestamp: 1600000018, Value: 0.2
	//Timestamp: 1600000019, Value: 0.2
	//Timestamp: 1600000020, Value: 0.2
	//Timestamp: 1600000021, Value: 0.2
	//Timestamp: 1600000022, Value: 0.2
	//Timestamp: 1600000023, Value: 0.2
	//Timestamp: 1600000024, Value: 0.2
	//Timestamp: 1600000025, Value: 0.2
	//Timestamp: 1600000026, Value: 0.2
	//Timestamp: 1600000027, Value: 0.2
	//Timestamp: 1600000028, Value: 0.2
	//Timestamp: 1600000029, Value: 0.2
	//Timestamp: 1600000030, Value: 0.2
	//Timestamp: 1600000031, Value: 0.2
	//Timestamp: 1600000032, Value: 0.2
	//Timestamp: 1600000033, Value: 0.2
	//Timestamp: 1600000034, Value: 0.2
	//Timestamp: 1600000035, Value: 0.2
	//Timestamp: 1600000036, Value: 0.2
	//Timestamp: 1600000037, Value: 0.2
	//Timestamp: 1600000038, Value: 0.2
	//Timestamp: 1600000039, Value: 0.2
	//Timestamp: 1600000040, Value: 0.2
	//Timestamp: 1600000041, Value: 0.2
	//Timestamp: 1600000042, Value: 0.2
	//Timestamp: 1600000043, Value: 0.2
	//Timestamp: 1600000044, Value: 0.2
	//Timestamp: 1600000045, Value: 0.2
	//Timestamp: 1600000046, Value: 0.2
	//Timestamp: 1600000047, Value: 0.2
	//Timestamp: 1600000048, Value: 0.2
	//Timestamp: 1600000049, Value: 0.2
}

// Out of order data points that are not yet flushed are in the buffer
// but do not appear in select.
//
// ExampleStorage_Select_from_memory_out_of_order inserts four points with one
// timestamp (1600000001) delivered out of order; the expected output shows it
// is absent from Select until a flush merges the out-of-order buffer.
func ExampleStorage_Select_from_memory_out_of_order() {
	storage, err := tstorage.NewStorage(
		tstorage.WithTimestampPrecision(tstorage.Seconds),
	)
	if err != nil {
		panic(err)
	}
	defer func() {
		if err := storage.Close(); err != nil {
			panic(err)
		}
	}()
	err = storage.InsertRows([]tstorage.Row{
		{Metric: "metric1", DataPoint: tstorage.DataPoint{Timestamp: 1600000000, Value: 0.1}},
		{Metric: "metric1", DataPoint: tstorage.DataPoint{Timestamp: 1600000002, Value: 0.1}},
		{Metric: "metric1", DataPoint: tstorage.DataPoint{Timestamp: 1600000001, Value: 0.1}},
		{Metric: "metric1", DataPoint: tstorage.DataPoint{Timestamp: 1600000003, Value: 0.1}},
	})
	if err != nil {
		panic(err)
	}
	points, err := storage.Select("metric1", nil, 1600000000, 1600000003)
	if err != nil {
		panic(err)
	}
	for _, p := range points {
		fmt.Printf("Timestamp: %v, Value: %v\n", p.Timestamp, p.Value)
	}

	// Out-of-order data points are ignored because they will get merged when flushing.

	// Output:
	// Timestamp: 1600000000, Value: 0.1
	// Timestamp: 1600000002, Value: 0.1
	// Timestamp: 1600000003, Value: 0.1
}

// Out of order data points that are flushed should appear in select.
511 | func ExampleStorage_Select_from_disk_out_of_order() { 512 | tmpDir, err := os.MkdirTemp("", "tstorage-example") 513 | if err != nil { 514 | panic(err) 515 | } 516 | defer os.RemoveAll(tmpDir) 517 | 518 | storage, err := tstorage.NewStorage( 519 | tstorage.WithDataPath(tmpDir), 520 | tstorage.WithPartitionDuration(100*time.Second), 521 | tstorage.WithTimestampPrecision(tstorage.Seconds), 522 | ) 523 | if err != nil { 524 | panic(err) 525 | } 526 | 527 | err = storage.InsertRows([]tstorage.Row{ 528 | {Metric: "metric1", DataPoint: tstorage.DataPoint{Timestamp: 1600000000, Value: 0.1}}, 529 | {Metric: "metric1", DataPoint: tstorage.DataPoint{Timestamp: 1600000002, Value: 0.1}}, 530 | {Metric: "metric1", DataPoint: tstorage.DataPoint{Timestamp: 1600000001, Value: 0.1}}, 531 | {Metric: "metric1", DataPoint: tstorage.DataPoint{Timestamp: 1600000003, Value: 0.1}}, 532 | }) 533 | if err != nil { 534 | panic(err) 535 | } 536 | 537 | // Flush all data points 538 | if err := storage.Close(); err != nil { 539 | panic(err) 540 | } 541 | 542 | // Re-open storage from the persisted data 543 | storage, err = tstorage.NewStorage( 544 | tstorage.WithDataPath(tmpDir), 545 | tstorage.WithPartitionDuration(100*time.Second), 546 | tstorage.WithTimestampPrecision(tstorage.Seconds), 547 | ) 548 | if err != nil { 549 | panic(err) 550 | } 551 | defer func() { 552 | if err := storage.Close(); err != nil { 553 | panic(err) 554 | } 555 | }() 556 | 557 | points, err := storage.Select("metric1", nil, 1600000000, 1600000004) 558 | if errors.Is(err, tstorage.ErrNoDataPoints) { 559 | return 560 | } 561 | if err != nil { 562 | panic(err) 563 | } 564 | for _, p := range points { 565 | fmt.Printf("timestamp: %v, value: %v\n", p.Timestamp, p.Value) 566 | } 567 | // Output: 568 | // timestamp: 1600000000, value: 0.1 569 | // timestamp: 1600000001, value: 0.1 570 | // timestamp: 1600000002, value: 0.1 571 | // timestamp: 1600000003, value: 0.1 572 | } 573 | 574 | // Simulates inserting an outdated 
row that forces inserting into a non-head partition. 575 | func ExampleStorage_InsertRows_outdated() { 576 | tmpDir, err := os.MkdirTemp("", "tstorage-example") 577 | if err != nil { 578 | panic(err) 579 | } 580 | defer os.RemoveAll(tmpDir) 581 | 582 | storage, err := tstorage.NewStorage( 583 | tstorage.WithDataPath(tmpDir), 584 | tstorage.WithTimestampPrecision(tstorage.Seconds), 585 | tstorage.WithPartitionDuration(3*time.Second), 586 | ) 587 | if err != nil { 588 | panic(err) 589 | } 590 | 591 | // Force two partitions with timestamps: (min: 1, max: 3), (min: 4, max: 5) 592 | err = storage.InsertRows([]tstorage.Row{ 593 | {DataPoint: tstorage.DataPoint{Timestamp: 1600000001, Value: 0.1}, Metric: "metric1"}, 594 | {DataPoint: tstorage.DataPoint{Timestamp: 1600000003, Value: 0.1}, Metric: "metric1"}, 595 | }) 596 | if err != nil { 597 | panic(err) 598 | } 599 | err = storage.InsertRows([]tstorage.Row{ 600 | {DataPoint: tstorage.DataPoint{Timestamp: 1600000004, Value: 0.1}, Metric: "metric1"}, 601 | {DataPoint: tstorage.DataPoint{Timestamp: 1600000005, Value: 0.1}, Metric: "metric1"}, 602 | }) 603 | if err != nil { 604 | panic(err) 605 | } 606 | 607 | // Insert a data point that doesn't belong to the head partition. This will be inserted 608 | // into the next partition out of order. 
609 | err = storage.InsertRows([]tstorage.Row{ 610 | {DataPoint: tstorage.DataPoint{Timestamp: 1600000002, Value: 0.1}, Metric: "metric1"}, 611 | }) 612 | if err != nil { 613 | panic(err) 614 | } 615 | 616 | // Flush all data points 617 | if err := storage.Close(); err != nil { 618 | panic(err) 619 | } 620 | 621 | // Re-open storage from the persisted data 622 | storage, err = tstorage.NewStorage( 623 | tstorage.WithDataPath(tmpDir), 624 | tstorage.WithTimestampPrecision(tstorage.Seconds), 625 | tstorage.WithPartitionDuration(3*time.Second), 626 | ) 627 | if err != nil { 628 | panic(err) 629 | } 630 | defer func() { 631 | if err := storage.Close(); err != nil { 632 | panic(err) 633 | } 634 | }() 635 | 636 | points, err := storage.Select("metric1", nil, 1600000001, 1600000006) 637 | if err != nil { 638 | panic(err) 639 | } 640 | for _, p := range points { 641 | fmt.Printf("Timestamp: %v, Value: %v\n", p.Timestamp, p.Value) 642 | } 643 | // Output: 644 | // Timestamp: 1600000001, Value: 0.1 645 | // Timestamp: 1600000002, Value: 0.1 646 | // Timestamp: 1600000003, Value: 0.1 647 | // Timestamp: 1600000004, Value: 0.1 648 | // Timestamp: 1600000005, Value: 0.1 649 | } 650 | 651 | // Simulates inserting a row that's outside of the writable time window. 652 | func ExampleStorage_InsertRows_expired() { 653 | tmpDir, err := os.MkdirTemp("", "tstorage-example") 654 | if err != nil { 655 | panic(err) 656 | } 657 | defer os.RemoveAll(tmpDir) 658 | 659 | storage, err := tstorage.NewStorage( 660 | tstorage.WithDataPath(tmpDir), 661 | tstorage.WithTimestampPrecision(tstorage.Seconds), 662 | tstorage.WithPartitionDuration(3*time.Second), 663 | ) 664 | if err != nil { 665 | panic(err) 666 | } 667 | 668 | // Force three partitions with timestamps: (min: 1, max: 3), (min: 4, max: 6), (min: 7, max: 8). 669 | // Inserting the third partition will force the first one to be flushed to disk and become unwritable. 
670 | err = storage.InsertRows([]tstorage.Row{ 671 | {DataPoint: tstorage.DataPoint{Timestamp: 1600000001, Value: 0.1}, Metric: "metric1"}, 672 | {DataPoint: tstorage.DataPoint{Timestamp: 1600000003, Value: 0.1}, Metric: "metric1"}, 673 | }) 674 | if err != nil { 675 | panic(err) 676 | } 677 | err = storage.InsertRows([]tstorage.Row{ 678 | {DataPoint: tstorage.DataPoint{Timestamp: 1600000004, Value: 0.1}, Metric: "metric1"}, 679 | {DataPoint: tstorage.DataPoint{Timestamp: 1600000005, Value: 0.1}, Metric: "metric1"}, 680 | {DataPoint: tstorage.DataPoint{Timestamp: 1600000006, Value: 0.1}, Metric: "metric1"}, 681 | }) 682 | if err != nil { 683 | panic(err) 684 | } 685 | err = storage.InsertRows([]tstorage.Row{ 686 | {DataPoint: tstorage.DataPoint{Timestamp: 1600000007, Value: 0.1}, Metric: "metric1"}, 687 | {DataPoint: tstorage.DataPoint{Timestamp: 1600000008, Value: 0.1}, Metric: "metric1"}, 688 | }) 689 | if err != nil { 690 | panic(err) 691 | } 692 | 693 | // Try to insert a data point into an already flushed partition. 
694 | err = storage.InsertRows([]tstorage.Row{ 695 | {DataPoint: tstorage.DataPoint{Timestamp: 1600000002, Value: 0.1}, Metric: "metric1"}, 696 | }) 697 | if err != nil { 698 | panic(err) 699 | } 700 | 701 | // Flush all data points 702 | if err := storage.Close(); err != nil { 703 | panic(err) 704 | } 705 | 706 | // Re-open storage from the persisted data 707 | storage, err = tstorage.NewStorage( 708 | tstorage.WithDataPath(tmpDir), 709 | tstorage.WithTimestampPrecision(tstorage.Seconds), 710 | tstorage.WithPartitionDuration(3*time.Second), 711 | ) 712 | if err != nil { 713 | panic(err) 714 | } 715 | defer func() { 716 | if err := storage.Close(); err != nil { 717 | panic(err) 718 | } 719 | }() 720 | 721 | points, err := storage.Select("metric1", nil, 1600000001, 1600000009) 722 | if err != nil { 723 | panic(err) 724 | } 725 | for _, p := range points { 726 | fmt.Printf("Timestamp: %v, Value: %v\n", p.Timestamp, p.Value) 727 | } 728 | 729 | // Missing data point at 1600000002 because it was dropped. 
730 | 731 | // Output: 732 | // Timestamp: 1600000001, Value: 0.1 733 | // Timestamp: 1600000003, Value: 0.1 734 | // Timestamp: 1600000004, Value: 0.1 735 | // Timestamp: 1600000005, Value: 0.1 736 | // Timestamp: 1600000006, Value: 0.1 737 | // Timestamp: 1600000007, Value: 0.1 738 | // Timestamp: 1600000008, Value: 0.1 739 | } 740 | 741 | func ExampleStorage_InsertRows_concurrent() { 742 | storage, err := tstorage.NewStorage( 743 | tstorage.WithTimestampPrecision(tstorage.Seconds), 744 | ) 745 | if err != nil { 746 | panic(err) 747 | } 748 | defer storage.Close() 749 | 750 | // First insert in order to ensure min timestamp 751 | if err := storage.InsertRows([]tstorage.Row{ 752 | {Metric: "metric1", DataPoint: tstorage.DataPoint{Timestamp: 1600000000}}, 753 | }); err != nil { 754 | panic(err) 755 | } 756 | 757 | var wg sync.WaitGroup 758 | for i := int64(1600000001); i < 1600000100; i++ { 759 | wg.Add(1) 760 | go func(timestamp int64) { 761 | if err := storage.InsertRows([]tstorage.Row{ 762 | {Metric: "metric1", DataPoint: tstorage.DataPoint{Timestamp: timestamp}}, 763 | }); err != nil { 764 | panic(err) 765 | } 766 | wg.Done() 767 | }(i) 768 | } 769 | wg.Wait() 770 | 771 | points, err := storage.Select("metric1", nil, 1600000000, 1600000100) 772 | if err != nil { 773 | panic(err) 774 | } 775 | for _, p := range points { 776 | fmt.Printf("timestamp: %v, value: %v\n", p.Timestamp, p.Value) 777 | } 778 | } 779 | -------------------------------------------------------------------------------- /storage_test.go: -------------------------------------------------------------------------------- 1 | package tstorage 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func Test_storage_Select(t *testing.T) { 11 | tests := []struct { 12 | name string 13 | storage storage 14 | metric string 15 | labels []Label 16 | start int64 17 | end int64 18 | want []*DataPoint 19 | wantErr bool 20 | }{ 21 | { 22 | name: "select from 
single partition", 23 | metric: "metric1", 24 | start: 1, 25 | end: 4, 26 | storage: func() storage { 27 | part1 := newMemoryPartition(nil, 1*time.Hour, Seconds) 28 | _, err := part1.insertRows([]Row{ 29 | {DataPoint: DataPoint{Timestamp: 1}, Metric: "metric1"}, 30 | {DataPoint: DataPoint{Timestamp: 2}, Metric: "metric1"}, 31 | {DataPoint: DataPoint{Timestamp: 3}, Metric: "metric1"}, 32 | }) 33 | if err != nil { 34 | panic(err) 35 | } 36 | list := newPartitionList() 37 | list.insert(part1) 38 | return storage{ 39 | partitionList: list, 40 | workersLimitCh: make(chan struct{}, defaultWorkersLimit), 41 | } 42 | }(), 43 | want: []*DataPoint{ 44 | {Timestamp: 1}, 45 | {Timestamp: 2}, 46 | {Timestamp: 3}, 47 | }, 48 | }, 49 | { 50 | name: "select from three partitions", 51 | metric: "metric1", 52 | start: 1, 53 | end: 10, 54 | storage: func() storage { 55 | part1 := newMemoryPartition(nil, 1*time.Hour, Seconds) 56 | _, err := part1.insertRows([]Row{ 57 | {DataPoint: DataPoint{Timestamp: 1}, Metric: "metric1"}, 58 | {DataPoint: DataPoint{Timestamp: 2}, Metric: "metric1"}, 59 | {DataPoint: DataPoint{Timestamp: 3}, Metric: "metric1"}, 60 | }) 61 | if err != nil { 62 | panic(err) 63 | } 64 | part2 := newMemoryPartition(nil, 1*time.Hour, Seconds) 65 | _, err = part2.insertRows([]Row{ 66 | {DataPoint: DataPoint{Timestamp: 4}, Metric: "metric1"}, 67 | {DataPoint: DataPoint{Timestamp: 5}, Metric: "metric1"}, 68 | {DataPoint: DataPoint{Timestamp: 6}, Metric: "metric1"}, 69 | }) 70 | if err != nil { 71 | panic(err) 72 | } 73 | part3 := newMemoryPartition(nil, 1*time.Hour, Seconds) 74 | _, err = part3.insertRows([]Row{ 75 | {DataPoint: DataPoint{Timestamp: 7}, Metric: "metric1"}, 76 | {DataPoint: DataPoint{Timestamp: 8}, Metric: "metric1"}, 77 | {DataPoint: DataPoint{Timestamp: 9}, Metric: "metric1"}, 78 | }) 79 | if err != nil { 80 | panic(err) 81 | } 82 | list := newPartitionList() 83 | list.insert(part1) 84 | list.insert(part2) 85 | list.insert(part3) 86 | 87 | return storage{ 
88 | partitionList: list, 89 | workersLimitCh: make(chan struct{}, defaultWorkersLimit), 90 | } 91 | }(), 92 | want: []*DataPoint{ 93 | {Timestamp: 1}, 94 | {Timestamp: 2}, 95 | {Timestamp: 3}, 96 | {Timestamp: 4}, 97 | {Timestamp: 5}, 98 | {Timestamp: 6}, 99 | {Timestamp: 7}, 100 | {Timestamp: 8}, 101 | {Timestamp: 9}, 102 | }, 103 | }, 104 | } 105 | for _, tt := range tests { 106 | t.Run(tt.name, func(t *testing.T) { 107 | got, err := tt.storage.Select(tt.metric, tt.labels, tt.start, tt.end) 108 | assert.Equal(t, tt.wantErr, err != nil) 109 | assert.Equal(t, tt.want, got) 110 | assert.Equal(t, tt.want, got) 111 | }) 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /testdata/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "minTimestamp": 1600000000, 3 | "maxTimestamp": 1600000001, 4 | "numDatapoints": 2 5 | } 6 | -------------------------------------------------------------------------------- /wal.go: -------------------------------------------------------------------------------- 1 | package tstorage 2 | 3 | import ( 4 | "os" 5 | "sync" 6 | ) 7 | 8 | type walOperation byte 9 | 10 | const ( 11 | // The record format for operateInsert is as shown below: 12 | /* 13 | +--------+---------------------+--------+--------------------+----------------+ 14 | | op(1b) | len metric(varints) | metric | timestamp(varints) | value(varints) | 15 | +--------+---------------------+--------+--------------------+----------------+ 16 | */ 17 | operationInsert walOperation = iota 18 | ) 19 | 20 | // wal represents a write-ahead log, which offers durability guarantees. 
21 | type wal interface { 22 | append(op walOperation, rows []Row) error 23 | flush() error 24 | punctuate() error 25 | removeOldest() error 26 | removeAll() error 27 | refresh() error 28 | } 29 | 30 | type nopWAL struct { 31 | filename string 32 | f *os.File 33 | mu sync.Mutex 34 | } 35 | 36 | func (f *nopWAL) append(_ walOperation, _ []Row) error { 37 | return nil 38 | } 39 | 40 | func (f *nopWAL) flush() error { 41 | return nil 42 | } 43 | 44 | func (f *nopWAL) punctuate() error { 45 | return nil 46 | } 47 | 48 | func (f *nopWAL) removeOldest() error { 49 | return nil 50 | } 51 | 52 | func (f *nopWAL) removeAll() error { 53 | return nil 54 | } 55 | 56 | func (f *nopWAL) refresh() error { 57 | return nil 58 | } 59 | --------------------------------------------------------------------------------