├── .github
│   └── workflows
│       └── go.yaml
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── api
│   ├── grpc
│   │   └── thanos.go
│   └── http
│       ├── api.go
│       └── query.go
├── cmd
│   ├── cmd.go
│   ├── config.go
│   ├── convert.go
│   └── serve.go
├── convert
│   ├── chunks.go
│   ├── convert.go
│   ├── sort.go
│   └── tsdb.go
├── db
│   ├── block.go
│   ├── db.go
│   ├── db_test.go
│   ├── discover.go
│   ├── iterator.go
│   ├── metrics.go
│   ├── seriesset.go
│   ├── shard.go
│   ├── syncer.go
│   └── util.go
├── go.mod
├── go.sum
├── internal
│   ├── encoding
│   │   └── zigzag.go
│   ├── tracing
│   │   └── tracer.go
│   └── util
│       ├── date.go
│       └── interval.go
├── proto
│   └── metapb
│       ├── meta.pb.go
│       └── meta.proto
├── revive.toml
├── schema
│   ├── block.go
│   ├── block_test.go
│   └── schema.go
├── search
│   ├── constraint.go
│   ├── metrics.go
│   ├── rowrange.go
│   ├── rowrange_test.go
│   ├── search.go
│   └── search_test.go
└── shell.nix

/.github/workflows/go.yaml:
--------------------------------------------------------------------------------
1 | name: go
2 |
3 | on:
4 |   push:
5 |     branches:
6 |       - main
7 |   pull_request:
8 |
9 | permissions:
10 |   contents: read
11 |
12 | jobs:
13 |   unittests:
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       - uses: actions/checkout@v2
17 |       - uses: cachix/install-nix-action@v14.1
18 |         with:
19 |           nix_path: nixpkgs=channel:nixos-unstable
20 |       - uses: dx-oss/nix-shell-action@v11
21 |         with:
22 |           file: shell.nix
23 |           script: make test
24 |
25 |   build:
26 |     runs-on: ubuntu-latest
27 |     steps:
28 |       - uses: actions/checkout@v2
29 |       - uses: cachix/install-nix-action@v14.1
30 |         with:
31 |           nix_path: nixpkgs=channel:nixos-unstable
32 |       - uses: dx-oss/nix-shell-action@v11
33 |         with:
34 |           file: shell.nix
35 |           script: make build
36 |
37 |   lint:
38 |     runs-on: ubuntu-latest
39 |     steps:
40 |       - uses: actions/checkout@v2
41 |       - uses: cachix/install-nix-action@v14.1
42 |         with:
43 |           nix_path: nixpkgs=channel:nixos-unstable
44 |       - uses: dx-oss/nix-shell-action@v11
45 |         with:
46 |           file: shell.nix
47 |           script: make lint
48 |
49 |   codespell:
50 |     runs-on: ubuntu-latest
51 |     name: Check misspelled words
52 |     steps:
53 |       - name: Checkout code
54 |         uses: actions/checkout@v4
55 |       - name: Run codespell
56 |         uses: codespell-project/actions-codespell@v2
57 |         with:
58 |           check_filenames: false
59 |           check_hidden: true
60 |           skip: ./pkg/ui/*,./pkg/store/6545postingsrepro,./internal/*,./mixin/vendor/*,./.bingo/*,go.mod,go.sum
61 |           ignore_words_list: intrumentation,mmaped,nd,ot,re-use,ser,serie,sme,sudu,tast,te,ans
62 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | parquet-gateway
2 | .cover
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 |                                  Apache License
3 |                            Version 2.0, January 2004
4 |                         http://www.apache.org/licenses/
5 |
6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 |    1. Definitions.
9 |
10 |       "License" shall mean the terms and conditions for use, reproduction,
11 |       and distribution as defined by Sections 1 through 9 of this document.
12 |
13 |       "Licensor" shall mean the copyright owner or entity authorized by
14 |       the copyright owner that is granting the License.
15 |
16 |       "Legal Entity" shall mean the union of the acting entity and all
17 |       other entities that control, are controlled by, or are under common
18 |       control with that entity.
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 |    You may obtain a copy of the License at
195 |
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 |
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: all build proto lint test
2 |
3 | all: build
4 |
5 | build: proto parquet-gateway
6 |
7 | GO = go
8 | GOIMPORTS = goimports
9 | REVIVE = revive
10 | PROTOC = protoc
11 |
12 | lint: $(wildcard **/*.go)
13 | 	@echo ">> running lint..."
14 | 	@$(REVIVE) -config revive.toml ./...
15 | 	find . -name '*.go' ! -path './proto/*' | xargs $(GOIMPORTS) -l -w -local $$(head -n 1 go.mod | cut -d ' ' -f 2)
16 |
17 | test: $(wildcard **/*.go)
18 | 	@echo ">> running tests..."
19 | 	@mkdir -p .cover
20 | 	$(GO) test -v -race -count=1 ./... -coverprofile .cover/cover.out
21 |
22 | parquet-gateway: $(wildcard **/*.go)
23 | 	@echo ">> building binaries..."
24 | 	@$(GO) build -o parquet-gateway github.com/cloudflare/parquet-tsdb-poc/cmd
25 |
26 | proto: proto/metapb/meta.pb.go
27 |
28 | proto/metapb/meta.pb.go: proto/metapb/meta.proto
29 | 	@echo ">> compiling protos..."
30 | 	@$(PROTOC) -I=proto/metapb/ --go_out=paths=source_relative:./proto/metapb/ proto/metapb/meta.proto
31 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # parquet-tsdb-poc
2 |
3 | POC for a parquet based TSDB in object storage.
4 |
5 | ## Why
6 |
7 | This project was inspired by [this excellent talk](https://www.youtube.com/watch?v=V8Y4VuUwg8I) by Shopify's Filip Petkovski. It is an attempt to build a service that can convert Prometheus TSDB blocks into [parquet](https://parquet.apache.org/) files and serve [PromQL](https://prometheus.io) queries to backfill a [Thanos deployment](https://thanos.io/).
8 |
9 | ## Developing
10 |
11 | We recommend using `nix` to fulfill all development dependencies. Visit [Nix Download](https://nixos.org/download/) to get started. To activate the development environment, simply run `nix-shell` in the project root.
12 |
13 | - to build the binary run `nix-shell --run 'make build'`
14 | - to run tests run `nix-shell --run 'make test'`
15 |
16 | ## Running
17 |
18 | ### Server
19 |
20 | Once built, you can run the server using something like:
21 |
22 | ```bash
23 | parquet-gateway serve \
24 |     --storage.prefix my-prefix \
25 |     --http.internal.port=6060 \
26 |     --http.prometheus.port=9090 \
27 |     --http.thanos.port=9091 \
28 |     --block.syncer.interval=30m \
29 |     --block.syncer.concurrency=32 \
30 |     --block.discovery.interval=30m \
31 |     --block.discovery.concurrency=32 \
32 |     --query.external-label=prometheus=my-prometheus \
33 |     --query.external-label=replica=ha-1
34 | ```
35 |
36 | This will:
37 |
38 | - load blocks from the `.data/my-prefix` directory
39 | - expose internal metrics and readiness handlers on port 6060
40 | - expose a subset of the Prometheus HTTP API on port 9090
41 | - expose a Thanos Info and Query gRPC service on port 9091
42 |
43 | You can now query it by pointing a Thanos Querier at it or through curl:
44 |
45 | ```bash
46 | curl 'http://0.0.0.0:9090/api/v1/query' \
47 |     -sq \
48 |     -H 'content-type: application/x-www-form-urlencoded' \
49 |     --data-urlencode 'query=vector(1)' | jq
50 | {
51 |   "status": "success",
52 |   "data": {
53 |     "resultType": "vector",
54 |     "result": [
55 |       {
56 |         "metric": {},
57 |         "value": [
58 |           1741267893.103,
59 |           "1"
60 |         ]
61 |       }
62 |     ]
63 |   }
64 | }
65 |
66 | ```
67 |
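You can also exercise the gRPC endpoint directly. Below is a minimal Go client sketch against the Thanos Query API; the address matches the flags above, while `grpc.NewClient` (older grpc-go versions use `grpc.Dial`) and the bare-bones response handling are illustrative assumptions, not part of this repo:

```go
package main

import (
	"context"
	"fmt"
	"io"
	"time"

	"github.com/thanos-io/thanos/pkg/api/query/querypb"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

func main() {
	// Port taken from --http.thanos.port above; plaintext credentials
	// and panics keep the example short.
	conn, err := grpc.NewClient("0.0.0.0:9091", grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		panic(err)
	}
	defer conn.Close()

	stream, err := querypb.NewQueryClient(conn).Query(context.Background(), &querypb.QueryRequest{
		Query:          "vector(1)",
		TimeSeconds:    time.Now().Unix(),
		TimeoutSeconds: 30,
	})
	if err != nil {
		panic(err)
	}
	for {
		msg, err := stream.Recv()
		if err == io.EOF {
			break
		}
		if err != nil {
			panic(err)
		}
		fmt.Println(msg.String())
	}
}
```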
68 | ### Converter
69 |
70 | To convert TSDB blocks in the `.data/source` directory that overlap `09/2021` and write the resulting parquet files into the `.data/destination` directory, run:
71 |
72 | ```bash
73 | parquet-gateway convert \
74 |     --tsdb.storage.prefix source \
75 |     --parquet.storage.prefix destination \
76 |     --convert.start=2021-09-01T00:00:00Z \
77 |     --convert.end=2021-10-01T00:00:00Z
78 | ```
79 |
--------------------------------------------------------------------------------
/api/grpc/thanos.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2025 Cloudflare, Inc.
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package grpc 6 | 7 | import ( 8 | "context" 9 | "fmt" 10 | "time" 11 | 12 | "github.com/prometheus/prometheus/model/labels" 13 | "github.com/prometheus/prometheus/promql" 14 | "github.com/prometheus/prometheus/util/stats" 15 | "google.golang.org/grpc/codes" 16 | "google.golang.org/grpc/status" 17 | 18 | "github.com/thanos-io/thanos/pkg/api/query/querypb" 19 | "github.com/thanos-io/thanos/pkg/component" 20 | "github.com/thanos-io/thanos/pkg/info/infopb" 21 | "github.com/thanos-io/thanos/pkg/store/labelpb" 22 | "github.com/thanos-io/thanos/pkg/store/storepb/prompb" 23 | 24 | cfdb "github.com/cloudflare/parquet-tsdb-poc/db" 25 | ) 26 | 27 | // Taken from https://github.com/thanos-community/thanos-promql-connector/blob/main/main.go 28 | 29 | type infoServer struct { 30 | infopb.UnimplementedInfoServer 31 | 32 | db *cfdb.DB 33 | } 34 | 35 | func NewInfoServer(db *cfdb.DB) infopb.InfoServer { 36 | return &infoServer{db: db} 37 | } 38 | 39 | func (info *infoServer) Info(_ context.Context, _ *infopb.InfoRequest) (*infopb.InfoResponse, error) { 40 | mint, maxt := info.db.Timerange() 41 | extlabels := info.db.Extlabels() 42 | return &infopb.InfoResponse{ 43 | ComponentType: component.Query.String(), 44 | LabelSets: labelpb.ZLabelSetsFromPromLabels(extlabels), 45 | Store: &infopb.StoreInfo{ 46 | MinTime: mint, 47 | MaxTime: maxt, 48 | TsdbInfos: []infopb.TSDBInfo{ 49 | { 50 | MinTime: mint, 51 | MaxTime: maxt, 52 | Labels: labelpb.ZLabelSet{Labels: labelpb.ZLabelsFromPromLabels(extlabels)}, 53 | }, 54 | }, 55 | }, 56 | Query: &infopb.QueryAPIInfo{}, 57 | }, nil 58 | } 59 | 60 | type queryServer struct { 61 | querypb.UnimplementedQueryServer 62 | 63 | db *cfdb.DB 64 | engine promql.QueryEngine 65 | } 66 | 67 | func NewQueryServer(db *cfdb.DB, engine promql.QueryEngine) querypb.QueryServer { 68 | return &queryServer{db: db, engine: engine} 69 | } 70 | 71 | func (qs *queryServer) Query(req *querypb.QueryRequest, srv querypb.Query_QueryServer) error { 72 | ts := time.Unix(req.TimeSeconds, 0) 73 | timeout := time.Duration(req.TimeoutSeconds) * time.Second 74 | 75 | ctx, cancel := context.WithTimeout(srv.Context(), timeout) 76 | defer cancel() 77 | 78 | opts := promql.NewPrometheusQueryOpts(false, time.Duration(req.LookbackDeltaSeconds)) 79 | 80 | qryable := qs.db.ReplicaQueryable(req.ReplicaLabels) 81 | 82 | qry, err := qs.engine.NewInstantQuery(ctx, qryable, opts, req.Query, ts) 83 | if err != nil { 84 | return status.Error(codes.Aborted, fmt.Sprintf("unable to create query: %s", err)) 85 | } 86 | defer qry.Close() 87 | 88 | res := qry.Exec(ctx) 89 | if err := res.Err; err != nil { 90 | return status.Error(codes.Internal, fmt.Sprintf("query eval error: %s", err)) 91 | } 92 | if warnings := res.Warnings.AsErrors(); len(warnings) > 0 { 93 | errs := make([]error, 0, len(warnings)) 94 | for _, warning := range warnings { 95 | errs = append(errs, warning) 96 | } 97 | if err = srv.SendMsg(querypb.NewQueryWarningsResponse(errs...)); err != nil { 98 | return err 99 | } 100 | } 101 | switch results := res.Value.(type) { 102 | case promql.Vector: 103 | for _, result := range results { 104 | series := &prompb.TimeSeries{ 105 | Samples: []prompb.Sample{{Value: float64(result.F), Timestamp: int64(result.T)}}, 106 | Labels: zLabelsFromMetric(result.Metric), 107 | } 108 | if err := srv.Send(querypb.NewQueryResponse(series)); err != nil { 109 | return err 110 | } 111 | } 112 | case 
promql.Scalar: 113 | series := &prompb.TimeSeries{Samples: []prompb.Sample{{Value: float64(results.V), Timestamp: int64(results.T)}}} 114 | if err := srv.Send(querypb.NewQueryResponse(series)); err != nil { 115 | return err 116 | } 117 | } 118 | if stats := qry.Stats(); stats != nil { 119 | if err := srv.Send(querypb.NewQueryStatsResponse(toQueryStats(stats))); err != nil { 120 | return err 121 | } 122 | } 123 | return nil 124 | } 125 | 126 | func (qs *queryServer) QueryRange(req *querypb.QueryRangeRequest, srv querypb.Query_QueryRangeServer) error { 127 | start := time.Unix(req.StartTimeSeconds, 0) 128 | end := time.Unix(req.EndTimeSeconds, 0) 129 | step := time.Duration(req.IntervalSeconds) * time.Second 130 | timeout := time.Duration(req.TimeoutSeconds) * time.Second 131 | 132 | ctx, cancel := context.WithTimeout(srv.Context(), timeout) 133 | defer cancel() 134 | 135 | qryable := qs.db.ReplicaQueryable(req.ReplicaLabels) 136 | 137 | opts := promql.NewPrometheusQueryOpts(false, time.Duration(req.LookbackDeltaSeconds)) 138 | qry, err := qs.engine.NewRangeQuery(ctx, qryable, opts, req.Query, start, end, step) 139 | if err != nil { 140 | return status.Error(codes.Aborted, fmt.Sprintf("unable to create query: %s", err)) 141 | } 142 | defer qry.Close() 143 | 144 | res := qry.Exec(ctx) 145 | if err := res.Err; err != nil { 146 | return status.Error(codes.Internal, fmt.Sprintf("query eval error: %s", err)) 147 | } 148 | if warnings := res.Warnings.AsErrors(); len(warnings) > 0 { 149 | errs := make([]error, 0, len(warnings)) 150 | for _, warning := range warnings { 151 | errs = append(errs, warning) 152 | } 153 | if err = srv.SendMsg(querypb.NewQueryWarningsResponse(errs...)); err != nil { 154 | return err 155 | } 156 | } 157 | switch results := res.Value.(type) { 158 | case promql.Matrix: 159 | for _, result := range results { 160 | series := &prompb.TimeSeries{ 161 | Samples: samplesFromModel(result.Floats), 162 | Labels: zLabelsFromMetric(result.Metric), 163 | } 164 | if err := srv.Send(querypb.NewQueryRangeResponse(series)); err != nil { 165 | return err 166 | } 167 | } 168 | case promql.Vector: 169 | for _, result := range results { 170 | series := &prompb.TimeSeries{ 171 | Samples: []prompb.Sample{{Value: float64(result.F), Timestamp: int64(result.T)}}, 172 | Labels: zLabelsFromMetric(result.Metric), 173 | } 174 | if err := srv.Send(querypb.NewQueryRangeResponse(series)); err != nil { 175 | return err 176 | } 177 | } 178 | case promql.Scalar: 179 | series := &prompb.TimeSeries{Samples: []prompb.Sample{{Value: float64(results.V), Timestamp: int64(results.T)}}} 180 | if err := srv.Send(querypb.NewQueryRangeResponse(series)); err != nil { 181 | return err 182 | } 183 | } 184 | 185 | if stats := qry.Stats(); stats != nil { 186 | if err := srv.Send(querypb.NewQueryRangeStatsResponse(toQueryStats(stats))); err != nil { 187 | return err 188 | } 189 | } 190 | 191 | return nil 192 | } 193 | 194 | func zLabelsFromMetric(metric labels.Labels) []labelpb.ZLabel { 195 | zlabels := make([]labelpb.ZLabel, 0, metric.Len()) 196 | metric.Range(func(lbl labels.Label) { 197 | zlabel := labelpb.ZLabel{Name: lbl.Name, Value: lbl.Value} 198 | zlabels = append(zlabels, zlabel) 199 | }) 200 | return zlabels 201 | } 202 | 203 | func samplesFromModel(samples []promql.FPoint) []prompb.Sample { 204 | result := make([]prompb.Sample, 0, len(samples)) 205 | for _, s := range samples { 206 | result = append(result, prompb.Sample{ 207 | Value: float64(s.F), 208 | Timestamp: int64(s.T), 209 | }) 210 | } 211 | return result 
212 | }
213 |
214 | func toQueryStats(stats *stats.Statistics) *querypb.QueryStats {
215 | 	return &querypb.QueryStats{
216 | 		SamplesTotal: stats.Samples.TotalSamples,
217 | 		PeakSamples:  int64(stats.Samples.PeakSamples),
218 | 	}
219 | }
220 |
--------------------------------------------------------------------------------
/api/http/api.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2025 Cloudflare, Inc.
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at:
3 | // https://opensource.org/licenses/Apache-2.0
4 |
5 | package http
6 |
7 | import (
8 | 	"net/http"
9 |
10 | 	"github.com/prometheus/common/route"
11 | 	"github.com/prometheus/prometheus/promql"
12 | 	"github.com/prometheus/prometheus/storage"
13 | )
14 |
15 | type apiConfig struct {
16 | 	queryAPIOpts []QueryAPIOption
17 | }
18 |
19 | type APIOption func(*apiConfig)
20 |
21 | func QueryOptions(opts ...QueryAPIOption) APIOption {
22 | 	return func(cfg *apiConfig) {
23 | 		cfg.queryAPIOpts = opts
24 | 	}
25 | }
26 |
27 | func NewAPI(queryable storage.Queryable, engine promql.QueryEngine, opts ...APIOption) http.Handler {
28 | 	cfg := &apiConfig{}
29 | 	for i := range opts {
30 | 		opts[i](cfg)
31 | 	}
32 |
33 | 	r := route.New()
34 |
35 | 	api := r.WithPrefix("/api/v1")
36 | 	RegisterQueryV1(api, queryable, engine, cfg.queryAPIOpts...)
37 |
38 | 	return r
39 | }
40 |
--------------------------------------------------------------------------------
/api/http/query.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2025 Cloudflare, Inc.
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at:
3 | // https://opensource.org/licenses/Apache-2.0
4 |
5 | package http
6 |
7 | import (
8 | 	"context"
9 | 	"encoding/json"
10 | 	"errors"
11 | 	"fmt"
12 | 	"math"
13 | 	"net/http"
14 | 	"strconv"
15 | 	"time"
16 |
17 | 	"github.com/prometheus/common/model"
18 | 	"github.com/prometheus/common/route"
19 | 	"github.com/prometheus/prometheus/model/labels"
20 | 	"github.com/prometheus/prometheus/model/timestamp"
21 | 	"github.com/prometheus/prometheus/promql"
22 | 	"github.com/prometheus/prometheus/promql/parser"
23 | 	"github.com/prometheus/prometheus/storage"
24 | 	"github.com/prometheus/prometheus/util/annotations"
25 |
26 | 	"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
27 | 	"go.opentelemetry.io/otel/attribute"
28 |
29 | 	"github.com/cloudflare/parquet-tsdb-poc/internal/tracing"
30 | )
31 |
32 | type queryAPI struct {
33 | 	engine    promql.QueryEngine
34 | 	queryable storage.Queryable
35 |
36 | 	defaultLookback time.Duration
37 | 	defaultStep     time.Duration
38 | 	defaultTimeout  time.Duration
39 | }
40 |
41 | type QueryAPIOption func(*queryAPI)
42 |
43 | func DefaultLookback(d time.Duration) QueryAPIOption {
44 | 	return func(qapi *queryAPI) {
45 | 		qapi.defaultLookback = d
46 | 	}
47 | }
48 |
49 | func DefaultStep(s time.Duration) QueryAPIOption {
50 | 	return func(qapi *queryAPI) {
51 | 		qapi.defaultStep = s
52 | 	}
53 | }
54 |
55 | func DefaultTimeout(s time.Duration) QueryAPIOption {
56 | 	return func(qapi *queryAPI) {
57 | 		qapi.defaultTimeout = s
58 | 	}
59 | }
60 |
61 | func withTracing(r *route.Router, path string, h http.HandlerFunc) {
62 | 	tracedHandler := otelhttp.NewMiddleware(path)(h)
63 |
64 | 	r.Get(path, tracedHandler.ServeHTTP)
65 | 	r.Post(path, tracedHandler.ServeHTTP)
66 | }
67 |
68 | func RegisterQueryV1(r *route.Router, queryable storage.Queryable, engine promql.QueryEngine, opts ...QueryAPIOption) {
69 | 	qapi :=
&queryAPI{ 70 | engine: engine, 71 | queryable: queryable, 72 | defaultLookback: 5 * time.Minute, 73 | defaultStep: 30 * time.Second, 74 | defaultTimeout: 30 * time.Second, 75 | } 76 | for i := range opts { 77 | opts[i](qapi) 78 | } 79 | 80 | withTracing(r, "/query", qapi.query) 81 | withTracing(r, "/query_range", qapi.queryRange) 82 | withTracing(r, "/series", qapi.series) 83 | withTracing(r, "/labels", qapi.labelNames) 84 | withTracing(r, "/label/:name/values", qapi.labelValues) 85 | } 86 | 87 | const ( 88 | errBadRequest = "bad_request" 89 | errInternal = "internal" 90 | errCanceled = "canceled" 91 | errTimeout = "timeout" 92 | errUnimplemented = "unimplemented" 93 | 94 | statusSuccess = "success" 95 | statusError = "error" 96 | ) 97 | 98 | type apiResponse struct { 99 | Status string `json:"status"` 100 | Data any `json:"data,omitempty"` 101 | Warnings []string `json:"warnings,omitempty"` 102 | Infos []string `json:"infos,omitempty"` 103 | ErrorType string `json:"errorType,omitempty"` 104 | Error string `json:"error,omitempty"` 105 | } 106 | 107 | type errorResponse struct { 108 | Typ string 109 | Err error 110 | } 111 | 112 | func writeErrorResponse(w http.ResponseWriter, r errorResponse) { 113 | switch r.Typ { 114 | case errUnimplemented: 115 | w.WriteHeader(http.StatusNotFound) 116 | case errBadRequest: 117 | w.WriteHeader(http.StatusBadRequest) 118 | case errInternal: 119 | w.WriteHeader(http.StatusInternalServerError) 120 | case errCanceled, errTimeout: 121 | w.WriteHeader(http.StatusRequestTimeout) 122 | } 123 | json.NewEncoder(w).Encode(apiResponse{ 124 | Status: statusError, 125 | ErrorType: r.Typ, 126 | Error: r.Err.Error(), 127 | }) 128 | } 129 | 130 | type queryResponse struct { 131 | ResultType parser.ValueType `json:"resultType"` 132 | Result parser.Value `json:"result"` 133 | } 134 | 135 | func writeQueryResponse(w http.ResponseWriter, r *promql.Result) { 136 | w.WriteHeader(http.StatusOK) 137 | warns, infos := r.Warnings.AsStrings("", 0, 0) 138 | json.NewEncoder(w).Encode(apiResponse{ 139 | Status: statusSuccess, 140 | Data: queryResponse{ 141 | ResultType: r.Value.Type(), 142 | Result: r.Value, 143 | }, 144 | Warnings: warns, 145 | Infos: infos, 146 | }) 147 | } 148 | 149 | func writeSeriesResponse(w http.ResponseWriter, series []labels.Labels, annos annotations.Annotations) { 150 | w.WriteHeader(http.StatusOK) 151 | warns, infos := annos.AsStrings("", 0, 0) 152 | json.NewEncoder(w).Encode(apiResponse{ 153 | Status: statusSuccess, 154 | Data: series, 155 | Warnings: warns, 156 | Infos: infos, 157 | }) 158 | } 159 | 160 | func writeLabelsResponse(w http.ResponseWriter, values []string, annos annotations.Annotations) { 161 | w.WriteHeader(http.StatusOK) 162 | warns, infos := annos.AsStrings("", 0, 0) 163 | json.NewEncoder(w).Encode(apiResponse{ 164 | Status: statusSuccess, 165 | Data: values, 166 | Warnings: warns, 167 | Infos: infos, 168 | }) 169 | } 170 | 171 | func parseTime(s string) (time.Time, error) { 172 | if t, err := strconv.ParseFloat(s, 64); err == nil { 173 | s, ns := math.Modf(t) 174 | ns = math.Round(ns*1000) / 1000 175 | return time.Unix(int64(s), int64(ns*float64(time.Second))), nil 176 | } 177 | if t, err := time.Parse(time.RFC3339Nano, s); err == nil { 178 | return t, nil 179 | } 180 | return time.Time{}, fmt.Errorf("cannot parse %q to a valid timestamp", s) 181 | } 182 | 183 | func parseTimeParam(r *http.Request, param string, defaultValue time.Time) (time.Time, error) { 184 | val := r.FormValue(param) 185 | if val == "" { 186 | return defaultValue, 
nil 187 | } 188 | result, err := parseTime(val) 189 | if err != nil { 190 | return time.Time{}, fmt.Errorf("invalid time value for '%s': %s", param, err) 191 | } 192 | return result, nil 193 | } 194 | 195 | func parseDurationParam(r *http.Request, param string, defaultValue time.Duration) (time.Duration, error) { 196 | val := r.FormValue(param) 197 | if val == "" { 198 | return defaultValue, nil 199 | } 200 | result, err := parseDuration(val) 201 | if err != nil { 202 | return 0, fmt.Errorf("invalid duration value for '%s': %s", param, err) 203 | } 204 | return result, nil 205 | } 206 | 207 | func parseDuration(s string) (time.Duration, error) { 208 | if d, err := strconv.ParseFloat(s, 64); err == nil { 209 | ts := d * float64(time.Second) 210 | if ts > float64(math.MaxInt64) || ts < float64(math.MinInt64) { 211 | return 0, fmt.Errorf("cannot parse %q to a valid duration. It overflows int64", s) 212 | } 213 | return time.Duration(ts), nil 214 | } 215 | if d, err := model.ParseDuration(s); err == nil { 216 | return time.Duration(d), nil 217 | } 218 | return 0, fmt.Errorf("cannot parse %q to a valid duration", s) 219 | } 220 | 221 | func parseQueryParam(r *http.Request) string { 222 | return r.FormValue("query") 223 | } 224 | 225 | func parseMatchersParam(r *http.Request) ([][]*labels.Matcher, error) { 226 | matchers := r.Form["match[]"] 227 | 228 | if len(matchers) == 0 { 229 | return nil, errors.New("no match[] parameter provided") 230 | } 231 | matcherSets, err := parser.ParseMetricSelectors(matchers) 232 | if err != nil { 233 | return nil, err 234 | } 235 | 236 | OUTER: 237 | for _, ms := range matcherSets { 238 | for _, lm := range ms { 239 | if lm != nil && !lm.Matches("") { 240 | continue OUTER 241 | } 242 | } 243 | return nil, errors.New("match[] must contain at least one non-empty matcher") 244 | } 245 | return matcherSets, nil 246 | } 247 | 248 | func parseLimitParam(r *http.Request) (int, error) { 249 | s := r.FormValue("limit") 250 | if s == "" { 251 | return 0, nil 252 | } 253 | 254 | limit, err := strconv.Atoi(s) 255 | if err != nil { 256 | return 0, fmt.Errorf("cannot parse %q to a valid limit", s) 257 | } 258 | if limit < 0 { 259 | return 0, errors.New("limit must be non-negative") 260 | } 261 | 262 | return limit, nil 263 | } 264 | 265 | func (qapi *queryAPI) queryOpts() promql.QueryOpts { 266 | return promql.NewPrometheusQueryOpts(false, qapi.defaultLookback) 267 | } 268 | 269 | func (qapi *queryAPI) query(w http.ResponseWriter, r *http.Request) { 270 | ctx := r.Context() 271 | span := tracing.SpanFromContext(ctx) 272 | 273 | if err := r.ParseForm(); err != nil { 274 | writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to parse form data: %s", err)}) 275 | return 276 | } 277 | 278 | t, err := parseTimeParam(r, "time", time.Now()) 279 | if err != nil { 280 | writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to get timestamp: %s", err)}) 281 | return 282 | } 283 | timeout, err := parseDurationParam(r, "timeout", qapi.defaultTimeout) 284 | if err != nil { 285 | writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to get timeout: %s", err)}) 286 | return 287 | } 288 | q := parseQueryParam(r) 289 | 290 | span.SetAttributes(attribute.String("query.expr", q)) 291 | span.SetAttributes(attribute.String("query.time", t.String())) 292 | span.SetAttributes(attribute.String("query.timeout", timeout.String())) 293 | 294 | ctx, cancel := context.WithTimeout(ctx, timeout) 295 | defer cancel() 296 | 297 | 
query, err := qapi.engine.NewInstantQuery(ctx, qapi.queryable, qapi.queryOpts(), q, t) 298 | if err != nil { 299 | writeErrorResponse(w, errorResponse{Typ: errInternal, Err: fmt.Errorf("unable to create query: %s", err)}) 300 | return 301 | } 302 | defer query.Close() 303 | 304 | res := query.Exec(ctx) 305 | if res.Err != nil { 306 | switch res.Err.(type) { 307 | case promql.ErrQueryCanceled: 308 | writeErrorResponse(w, errorResponse{Typ: errCanceled, Err: res.Err}) 309 | case promql.ErrQueryTimeout: 310 | writeErrorResponse(w, errorResponse{Typ: errTimeout, Err: res.Err}) 311 | case promql.ErrStorage: 312 | writeErrorResponse(w, errorResponse{Typ: errInternal, Err: res.Err}) 313 | default: 314 | writeErrorResponse(w, errorResponse{Typ: errInternal, Err: res.Err}) 315 | } 316 | return 317 | } 318 | writeQueryResponse(w, res) 319 | } 320 | 321 | func (qapi *queryAPI) queryRange(w http.ResponseWriter, r *http.Request) { 322 | ctx := r.Context() 323 | span := tracing.SpanFromContext(ctx) 324 | 325 | if err := r.ParseForm(); err != nil { 326 | writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to parse form data: %s", err)}) 327 | return 328 | } 329 | 330 | start, err := parseTimeParam(r, "start", time.Now()) 331 | if err != nil { 332 | writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to get start: %s", err)}) 333 | return 334 | } 335 | end, err := parseTimeParam(r, "end", time.Now()) 336 | if err != nil { 337 | writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to get end: %s", err)}) 338 | return 339 | } 340 | step, err := parseDurationParam(r, "step", qapi.defaultStep) 341 | if err != nil { 342 | writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to get step: %s", err)}) 343 | return 344 | } 345 | timeout, err := parseDurationParam(r, "timeout", qapi.defaultTimeout) 346 | if err != nil { 347 | writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to get timeout: %s", err)}) 348 | return 349 | } 350 | q := parseQueryParam(r) 351 | 352 | span.SetAttributes(attribute.String("query.expr", q)) 353 | span.SetAttributes(attribute.String("query.start", start.String())) 354 | span.SetAttributes(attribute.String("query.end", end.String())) 355 | span.SetAttributes(attribute.String("query.range", end.Sub(start).String())) 356 | span.SetAttributes(attribute.String("query.step", step.String())) 357 | span.SetAttributes(attribute.String("query.timeout", timeout.String())) 358 | 359 | ctx, cancel := context.WithTimeout(ctx, timeout) 360 | defer cancel() 361 | 362 | query, err := qapi.engine.NewRangeQuery(ctx, qapi.queryable, qapi.queryOpts(), q, start, end, step) 363 | if err != nil { 364 | writeErrorResponse(w, errorResponse{Typ: errInternal, Err: fmt.Errorf("unable to create query: %s", err)}) 365 | return 366 | } 367 | defer query.Close() 368 | 369 | res := query.Exec(ctx) 370 | if res.Err != nil { 371 | switch res.Err.(type) { 372 | case promql.ErrQueryCanceled: 373 | writeErrorResponse(w, errorResponse{Typ: errCanceled, Err: res.Err}) 374 | case promql.ErrQueryTimeout: 375 | writeErrorResponse(w, errorResponse{Typ: errTimeout, Err: res.Err}) 376 | case promql.ErrStorage: 377 | writeErrorResponse(w, errorResponse{Typ: errInternal, Err: res.Err}) 378 | default: 379 | writeErrorResponse(w, errorResponse{Typ: errInternal, Err: res.Err}) 380 | } 381 | return 382 | } 383 | writeQueryResponse(w, res) 384 | } 385 | 386 | func (qapi *queryAPI) series(w 
http.ResponseWriter, r *http.Request) {
387 | 	ctx := r.Context()
388 | 	span := tracing.SpanFromContext(ctx)
389 |
390 | 	if err := r.ParseForm(); err != nil {
391 | 		writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to parse form data: %s", err)})
392 | 		return
393 | 	}
394 |
395 | 	start, err := parseTimeParam(r, "start", time.Now())
396 | 	if err != nil {
397 | 		writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to get start: %s", err)})
398 | 		return
399 | 	}
400 | 	end, err := parseTimeParam(r, "end", time.Now())
401 | 	if err != nil {
402 | 		writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to get end: %s", err)})
403 | 		return
404 | 	}
405 | 	limit, err := parseLimitParam(r)
406 | 	if err != nil {
407 | 		writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to get limit: %s", err)})
408 | 		return
409 | 	}
410 | 	ms, err := parseMatchersParam(r)
411 | 	if err != nil {
412 | 		writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to get labelmatchers: %s", err)})
413 | 		return
414 | 	}
415 |
416 | 	span.SetAttributes(attribute.String("series.start", start.String()))
417 | 	span.SetAttributes(attribute.String("series.end", end.String()))
418 | 	span.SetAttributes(attribute.StringSlice("series.matchers", r.Form["match[]"]))
419 | 	span.SetAttributes(attribute.Int("series.limit", limit))
420 |
421 | 	ctx, cancel := context.WithTimeout(ctx, qapi.defaultTimeout)
422 | 	defer cancel()
423 |
424 | 	q, err := qapi.queryable.Querier(timestamp.FromTime(start), timestamp.FromTime(end))
425 | 	if err != nil {
426 | 		writeErrorResponse(w, errorResponse{Typ: errInternal, Err: fmt.Errorf("unable to create querier: %s", err)})
427 | 		return
428 | 	}
429 | 	defer q.Close()
430 |
431 | 	var (
432 | 		series []labels.Labels
433 | 		sets   []storage.SeriesSet
434 | 	)
435 |
436 | 	hints := &storage.SelectHints{
437 | 		Limit: limit,
438 | 		Start: start.UnixMilli(),
439 | 		End:   end.UnixMilli(),
440 | 		Func:  "series",
441 | 	}
442 |
443 | 	for _, mset := range ms {
444 | 		sets = append(sets, q.Select(ctx, false, hints, mset...))
445 | 	}
446 |
447 | 	set := storage.NewMergeSeriesSet(sets, storage.ChainedSeriesMerge)
448 | 	warnings := set.Warnings()
449 | 	for set.Next() {
450 | 		series = append(series, set.At().Labels())
451 | 		if limit > 0 && len(series) > limit {
452 | 			series = series[:limit]
453 | 			warnings.Add(errors.New("results truncated due to limit"))
454 | 			break
455 | 		}
456 | 	}
457 | 	if err := set.Err(); err != nil {
458 | 		writeErrorResponse(w, errorResponse{Typ: errInternal, Err: fmt.Errorf("unable to merge series: %s", err)})
459 | 		return
460 | 	}
461 |
462 | 	writeSeriesResponse(w, series, warnings)
463 | }
464 |
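The series handler above accepts standard URL-encoded form parameters. A small client-side sketch follows; host, selector, and time range are made up for illustration:

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
)

func main() {
	// The handler parses start/end (unix seconds or RFC3339), an optional
	// limit, and one or more match[] selectors.
	resp, err := http.PostForm("http://localhost:9090/api/v1/series", url.Values{
		"match[]": {`{__name__="up"}`},
		"start":   {"2021-09-01T00:00:00Z"},
		"end":     {"2021-09-02T00:00:00Z"},
		"limit":   {"100"},
	})
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body))
}
```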
465 | func (qapi *queryAPI) labelValues(w http.ResponseWriter, r *http.Request) {
466 | 	ctx := r.Context()
467 | 	span := tracing.SpanFromContext(ctx)
468 |
469 | 	if err := r.ParseForm(); err != nil {
470 | 		writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to parse form data: %s", err)})
471 | 		return
472 | 	}
473 |
474 | 	name := route.Param(ctx, "name")
475 | 	if !model.LabelNameRE.MatchString(name) {
476 | 		writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("invalid label name: %q", name)})
477 | 		return
478 | 	}
479 |
480 | 	// TODO: support more labels
481 | 	if name != model.MetricNameLabel {
482 | 		writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: errors.New("label values is only supported for the __name__ label")})
483 | 		return
484 | 	}
485 |
486 | 	start, err := parseTimeParam(r, "start", time.Now())
487 | 	if err != nil {
488 | 		writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to get start: %s", err)})
489 | 		return
490 | 	}
491 | 	end, err := parseTimeParam(r, "end", time.Now())
492 | 	if err != nil {
493 | 		writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to get end: %s", err)})
494 | 		return
495 | 	}
496 | 	limit, err := parseLimitParam(r)
497 | 	if err != nil {
498 | 		writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: fmt.Errorf("unable to get limit: %s", err)})
499 | 		return
500 | 	}
501 | 	if len(r.Form["match[]"]) != 0 {
502 | 		writeErrorResponse(w, errorResponse{Typ: errBadRequest, Err: errors.New("label values with matchers is not supported")})
503 | 		return
504 | 	}
505 |
506 | 	span.SetAttributes(attribute.String("label_values.start", start.String()))
507 | 	span.SetAttributes(attribute.String("label_values.end", end.String()))
508 | 	span.SetAttributes(attribute.StringSlice("label_values.matchers", r.Form["match[]"]))
509 | 	span.SetAttributes(attribute.Int("label_values.limit", limit))
510 |
511 | 	ctx, cancel := context.WithTimeout(ctx, qapi.defaultTimeout)
512 | 	defer cancel()
513 |
514 | 	q, err := qapi.queryable.Querier(timestamp.FromTime(start), timestamp.FromTime(end))
515 | 	if err != nil {
516 | 		writeErrorResponse(w, errorResponse{Typ: errInternal, Err: fmt.Errorf("unable to create querier: %s", err)})
517 | 		return
518 | 	}
519 | 	defer q.Close()
520 |
521 | 	hints := &storage.LabelHints{
522 | 		Limit: limit,
523 | 	}
524 |
525 | 	labelValues, annos, err := q.LabelValues(ctx, name, hints)
526 | 	if err != nil {
527 | 		writeErrorResponse(w, errorResponse{Typ: errInternal, Err: fmt.Errorf("unable to query label values: %w", err)})
528 | 		return
529 | 	}
530 |
531 | 	writeLabelsResponse(w, labelValues, annos)
532 | }
533 |
534 | func (qapi *queryAPI) labelNames(w http.ResponseWriter, _ *http.Request) {
535 | 	writeErrorResponse(w, errorResponse{Typ: errUnimplemented, Err: errors.New("unimplemented")})
536 | }
537 |
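For embedding the API elsewhere, the handlers in this file are wired up through `NewAPI` from `api/http/api.go`. A minimal wiring sketch, assuming a no-op queryable in place of the real `*db.DB` and placeholder engine settings:

```go
package main

import (
	"net/http"
	"time"

	"github.com/prometheus/prometheus/promql"
	"github.com/prometheus/prometheus/storage"

	cfhttp "github.com/cloudflare/parquet-tsdb-poc/api/http"
)

func main() {
	engine := promql.NewEngine(promql.EngineOpts{
		MaxSamples: 50_000_000,
		Timeout:    time.Minute,
	})
	// A queryable that returns no series; stands in for the real database.
	queryable := storage.QueryableFunc(func(_, _ int64) (storage.Querier, error) {
		return storage.NoopQuerier(), nil
	})
	handler := cfhttp.NewAPI(queryable, engine,
		cfhttp.QueryOptions(
			cfhttp.DefaultStep(30*time.Second),
			cfhttp.DefaultTimeout(time.Minute),
		))
	if err := http.ListenAndServe(":9090", handler); err != nil {
		panic(err)
	}
}
```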
--------------------------------------------------------------------------------
/cmd/cmd.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2025 Cloudflare, Inc.
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at:
3 | // https://opensource.org/licenses/Apache-2.0
4 |
5 | package main
6 |
7 | import (
8 | 	"context"
9 | 	"fmt"
10 | 	"log/slog"
11 | 	"os"
12 | 	"os/signal"
13 | 	"syscall"
14 |
15 | 	"github.com/KimMachineGun/automemlimit/memlimit"
16 | 	"github.com/hashicorp/go-multierror"
17 | 	"github.com/prometheus/client_golang/prometheus"
18 | 	"gopkg.in/alecthomas/kingpin.v2"
19 |
20 | 	"github.com/cloudflare/parquet-tsdb-poc/db"
21 | 	"github.com/cloudflare/parquet-tsdb-poc/search"
22 | )
23 |
24 | var logLevelMap = map[string]slog.Level{
25 | 	"DEBUG": slog.LevelDebug,
26 | 	"INFO":  slog.LevelInfo,
27 | 	"WARN":  slog.LevelWarn,
28 | 	"ERROR": slog.LevelError,
29 | }
30 |
31 | func main() {
32 | 	app := kingpin.New("parquet-tsdb-poc", "A POC for a TSDB in parquet.")
33 | 	memratio := app.Flag("memlimit.ratio", "gomemlimit ratio").Default("0.9").Float()
34 | 	logLevel := app.Flag("logger.level", "log level").Default("INFO").Enum("DEBUG", "INFO", "WARN", "ERROR")
35 |
36 | 	tsdbConvert, tsdbConvertF := registerConvertApp(app)
37 | 	serve, serveF := registerServeApp(app)
38 | 	parsed := kingpin.MustParse(app.Parse(os.Args[1:]))
39 |
40 | 	// Construct the logger only after flags were parsed: before MustParse
41 | 	// runs, *logLevel is still empty and the configured level would be ignored.
42 | 	log := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
43 | 		Level: logLevelMap[*logLevel],
44 | 	}))
45 |
46 | 	memlimit.SetGoMemLimitWithOpts(
47 | 		memlimit.WithRatio(*memratio),
48 | 		memlimit.WithProvider(
49 | 			memlimit.ApplyFallback(
50 | 				memlimit.FromCgroup,
51 | 				memlimit.FromSystem,
52 | 			),
53 | 		),
54 | 	)
55 |
56 | 	reg, err := setupPrometheusRegistry()
57 | 	if err != nil {
58 | 		log.Error("Could not setup prometheus", slog.Any("err", err))
59 | 		return
60 | 	}
61 |
62 | 	sigC := make(chan os.Signal, 1)
63 | 	signal.Notify(sigC, syscall.SIGTERM, syscall.SIGINT)
64 |
65 | 	ctx, cancel := context.WithCancel(context.Background())
66 | 	go func() {
67 | 		s := <-sigC
68 | 		log.Warn("Caught signal, canceling context", slog.String("signal", s.String()))
69 | 		cancel()
70 | 	}()
71 |
72 | 	switch parsed {
73 | 	case tsdbConvert.FullCommand():
74 | 		log.Info("Running convert")
75 | 		if err := tsdbConvertF(ctx, log); err != nil {
76 | 			log.Error("Error converting tsdb block", slog.Any("err", err))
77 | 			os.Exit(1)
78 | 		}
79 | 	case serve.FullCommand():
80 | 		log.Info("Running serve")
81 | 		if err := serveF(ctx, log, reg); err != nil {
82 | 			log.Error("Error running serve", slog.Any("err", err))
83 | 			os.Exit(1)
84 | 		}
85 | 	}
86 | 	log.Info("Done")
87 | }
88 |
89 | func setupPrometheusRegistry() (*prometheus.Registry, error) {
90 | 	reg := prometheus.NewRegistry()
91 | 	registerer := prometheus.WrapRegistererWithPrefix("cf_metrics_", reg)
92 |
93 | 	if err := multierror.Append(
94 | 		nil,
95 | 		db.RegisterMetrics(prometheus.WrapRegistererWithPrefix("db_", registerer)),
96 | 		search.RegisterMetrics(prometheus.WrapRegistererWithPrefix("search_", registerer)),
97 | 	); err.ErrorOrNil() != nil {
98 | 		return nil, fmt.Errorf("unable to register metrics: %w", err)
99 | 	}
100 | 	return reg, nil
101 | }
102 |
--------------------------------------------------------------------------------
/cmd/config.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2025 Cloudflare, Inc.
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package main 6 | 7 | import ( 8 | "context" 9 | "fmt" 10 | "log/slog" 11 | "strings" 12 | 13 | "gopkg.in/alecthomas/kingpin.v2" 14 | "gopkg.in/yaml.v3" 15 | 16 | "go.opentelemetry.io/otel" 17 | "go.opentelemetry.io/otel/exporters/jaeger" 18 | "go.opentelemetry.io/otel/exporters/stdout/stdouttrace" 19 | "go.opentelemetry.io/otel/sdk/resource" 20 | "go.opentelemetry.io/otel/sdk/trace" 21 | semconv "go.opentelemetry.io/otel/semconv/v1.21.0" 22 | 23 | "github.com/thanos-io/objstore" 24 | "github.com/thanos-io/objstore/client" 25 | ) 26 | 27 | type bucketOpts struct { 28 | storage string 29 | prefix string 30 | 31 | // filesystem options 32 | filesystemDirectory string 33 | 34 | // s3 options 35 | s3Bucket string 36 | s3Endpoint string 37 | s3AccessKey string 38 | s3SecretKey string 39 | s3Insecure bool 40 | 41 | retries int 42 | } 43 | 44 | func (opts *bucketOpts) registerFlags(cmd *kingpin.CmdClause) { 45 | cmd.Flag("storage.type", "type of storage").Default("filesystem").EnumVar(&opts.storage, "filesystem", "s3") 46 | cmd.Flag("storage.prefix", "prefix for the storage").Default("").StringVar(&opts.prefix) 47 | cmd.Flag("storage.filesystem.directory", "directory for filesystem").Default(".data").StringVar(&opts.filesystemDirectory) 48 | cmd.Flag("storage.s3.bucket", "bucket for s3").Default("").StringVar(&opts.s3Bucket) 49 | cmd.Flag("storage.s3.endpoint", "endpoint for s3").Default("").StringVar(&opts.s3Endpoint) 50 | cmd.Flag("storage.s3.access_key", "access key for s3").Default("").Envar("STORAGE_S3_ACCESS_KEY").StringVar(&opts.s3AccessKey) 51 | cmd.Flag("storage.s3.secret_key", "secret key for s3").Default("").Envar("STORAGE_S3_SECRET_KEY").StringVar(&opts.s3SecretKey) 52 | cmd.Flag("storage.s3.insecure", "use http").Default("false").BoolVar(&opts.s3Insecure) 53 | cmd.Flag("storage.retries", "how many retries to perform").Default("2").IntVar(&opts.retries) 54 | } 55 | 56 | func (opts *bucketOpts) registerParquetFlags(cmd *kingpin.CmdClause) { 57 | cmd.Flag("parquet.storage.type", "type of storage").Default("filesystem").EnumVar(&opts.storage, "filesystem", "s3") 58 | cmd.Flag("parquet.storage.prefix", "prefix for the storage").Default("").StringVar(&opts.prefix) 59 | cmd.Flag("parquet.storage.filesystem.directory", "directory for filesystem").Default(".data").StringVar(&opts.filesystemDirectory) 60 | cmd.Flag("parquet.storage.s3.bucket", "bucket for s3").Default("").StringVar(&opts.s3Bucket) 61 | cmd.Flag("parquet.storage.s3.endpoint", "endpoint for s3").Default("").StringVar(&opts.s3Endpoint) 62 | cmd.Flag("parquet.storage.s3.access_key", "access key for s3").Default("").Envar("PARQUET_STORAGE_S3_ACCESS_KEY").StringVar(&opts.s3AccessKey) 63 | cmd.Flag("parquet.storage.s3.secret_key", "secret key for s3").Default("").Envar("PARQUET_STORAGE_S3_SECRET_KEY").StringVar(&opts.s3SecretKey) 64 | cmd.Flag("parquet.storage.s3.insecure", "use http").Default("false").BoolVar(&opts.s3Insecure) 65 | cmd.Flag("parquet.storage.retries", "how many retries to perform").Default("2").IntVar(&opts.retries) 66 | } 67 | 68 | func (opts *bucketOpts) registerTSDBFlags(cmd *kingpin.CmdClause) { 69 | cmd.Flag("tsdb.storage.type", "type of storage").Default("filesystem").EnumVar(&opts.storage, "filesystem", "s3") 70 | cmd.Flag("tsdb.storage.prefix", "prefix for the storage").Default("").StringVar(&opts.prefix) 71 | cmd.Flag("tsdb.storage.filesystem.directory", "directory for 
filesystem").Default(".data").StringVar(&opts.filesystemDirectory) 72 | cmd.Flag("tsdb.storage.s3.bucket", "bucket for s3").Default("").StringVar(&opts.s3Bucket) 73 | cmd.Flag("tsdb.storage.s3.endpoint", "endpoint for s3").Default("").StringVar(&opts.s3Endpoint) 74 | cmd.Flag("tsdb.storage.s3.access_key", "access key for s3").Default("").Envar("TSDB_STORAGE_S3_ACCESS_KEY").StringVar(&opts.s3AccessKey) 75 | cmd.Flag("tsdb.storage.s3.secret_key", "secret key for s3").Default("").Envar("TSDB_STORAGE_S3_SECRET_KEY").StringVar(&opts.s3SecretKey) 76 | cmd.Flag("tsdb.storage.s3.insecure", "use http").Default("false").BoolVar(&opts.s3Insecure) 77 | cmd.Flag("tsdb.storage.retries", "how many retries to perform").Default("2").IntVar(&opts.retries) 78 | } 79 | 80 | func setupBucket(log *slog.Logger, opts bucketOpts) (objstore.Bucket, error) { 81 | prov := client.ObjProvider(strings.ToUpper(opts.storage)) 82 | cfg := client.BucketConfig{ 83 | Type: prov, 84 | Prefix: opts.prefix, 85 | } 86 | var subCfg any 87 | switch prov { 88 | case client.FILESYSTEM: 89 | subCfg = struct { 90 | Directory string `yaml:"directory"` 91 | }{ 92 | Directory: opts.filesystemDirectory, 93 | } 94 | case client.S3: 95 | subCfg = struct { 96 | Bucket string `yaml:"bucket"` 97 | Endpoint string `yaml:"endpoint"` 98 | AccessKey string `yaml:"access_key"` 99 | SecretKey string `yaml:"secret_key"` 100 | MaxRetries int `yaml:"max_retries"` 101 | Insecure bool `yaml:"insecure"` 102 | }{ 103 | Bucket: opts.s3Bucket, 104 | Endpoint: opts.s3Endpoint, 105 | AccessKey: opts.s3AccessKey, 106 | SecretKey: opts.s3SecretKey, 107 | Insecure: opts.s3Insecure, 108 | MaxRetries: opts.retries, 109 | } 110 | default: 111 | return nil, fmt.Errorf("unknown bucket type: %s", prov) 112 | } 113 | 114 | cfg.Config = subCfg 115 | bytes, err := yaml.Marshal(cfg) 116 | if err != nil { 117 | return nil, fmt.Errorf("unable to marshal bucket config yaml: %w", err) 118 | } 119 | 120 | bkt, err := client.NewBucket(slogAdapter{log}, bytes, "parquet-gateway", nil) 121 | if err != nil { 122 | return nil, fmt.Errorf("unable to create bucket client: %w", err) 123 | } 124 | 125 | return bkt, nil 126 | } 127 | 128 | type slogAdapter struct { 129 | log *slog.Logger 130 | } 131 | 132 | func (s slogAdapter) Log(args ...interface{}) error { 133 | s.log.Debug("", args...) 134 | return nil 135 | } 136 | 137 | type tracingOpts struct { 138 | exporterType string 139 | 140 | // jaeger opts 141 | jaegerEndpoint string 142 | 143 | samplingParam float64 144 | samplingType string 145 | } 146 | 147 | func (opts *tracingOpts) registerFlags(cmd *kingpin.CmdClause) { 148 | cmd.Flag("tracing.exporter.type", "type of tracing exporter").Default("STDOUT").EnumVar(&opts.exporterType, "JAEGER", "STDOUT") 149 | cmd.Flag("tracing.jaeger.endpoint", "endpoint to send traces, eg. 
https://example.com:4318/v1/traces").StringVar(&opts.jaegerEndpoint) 150 | cmd.Flag("tracing.sampling.param", "sample of traces to send").Default("0.1").Float64Var(&opts.samplingParam) 151 | cmd.Flag("tracing.sampling.type", "type of sampling").Default("PROBABILISTIC").EnumVar(&opts.samplingType, "PROBABILISTIC", "ALWAYS", "NEVER") 152 | } 153 | 154 | func setupTracing(ctx context.Context, opts tracingOpts) error { 155 | var ( 156 | exporter trace.SpanExporter 157 | err error 158 | ) 159 | switch opts.exporterType { 160 | case "JAEGER": 161 | exporter, err = jaeger.New(jaeger.WithCollectorEndpoint(jaeger.WithEndpoint(opts.jaegerEndpoint))) 162 | if err != nil { 163 | return err 164 | } 165 | case "STDOUT": 166 | exporter, err = stdouttrace.New() 167 | if err != nil { 168 | return err 169 | } 170 | default: 171 | return fmt.Errorf("invalid exporter type %s", opts.exporterType) 172 | } 173 | var sampler trace.Sampler 174 | switch opts.samplingType { 175 | case "PROBABILISTIC": 176 | sampler = trace.TraceIDRatioBased(opts.samplingParam) 177 | case "ALWAYS": 178 | sampler = trace.AlwaysSample() 179 | case "NEVER": 180 | sampler = trace.NeverSample() 181 | default: 182 | return fmt.Errorf("invalid sampling type %s", opts.samplingType) 183 | } 184 | r, err := resource.New(ctx, 185 | resource.WithAttributes( 186 | semconv.ServiceName("parquet-gateway"), 187 | semconv.ServiceVersion("v0.0.0"), 188 | ), 189 | ) 190 | if err != nil { 191 | return err 192 | } 193 | 194 | tracerProvider := trace.NewTracerProvider( 195 | trace.WithSampler(trace.ParentBased(sampler)), 196 | trace.WithBatcher(exporter), 197 | trace.WithResource(r), 198 | ) 199 | otel.SetTracerProvider(tracerProvider) 200 | return nil 201 | } 202 | -------------------------------------------------------------------------------- /cmd/convert.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package main 6 | 7 | import ( 8 | "context" 9 | "encoding/json" 10 | "fmt" 11 | "io" 12 | "log/slog" 13 | "path/filepath" 14 | "runtime" 15 | "sort" 16 | "strings" 17 | "time" 18 | 19 | "github.com/parquet-go/parquet-go" 20 | "github.com/prometheus/prometheus/tsdb" 21 | "github.com/prometheus/prometheus/tsdb/chunkenc" 22 | "github.com/thanos-io/objstore" 23 | "gopkg.in/alecthomas/kingpin.v2" 24 | 25 | "github.com/cloudflare/parquet-tsdb-poc/convert" 26 | "github.com/cloudflare/parquet-tsdb-poc/internal/util" 27 | ) 28 | 29 | type convertOpts struct { 30 | parquetBucket bucketOpts 31 | tsdbBucket bucketOpts 32 | conversion conversionOpts 33 | } 34 | 35 | func (opts *convertOpts) registerFlags(cmd *kingpin.CmdClause) { 36 | opts.conversion.registerFlags(cmd) 37 | opts.parquetBucket.registerParquetFlags(cmd) 38 | opts.tsdbBucket.registerTSDBFlags(cmd) 39 | } 40 | 41 | func registerConvertApp(app *kingpin.Application) (*kingpin.CmdClause, func(ctx context.Context, log *slog.Logger) error) { 42 | cmd := app.Command("convert", "convert TSDB Block to parquet file") 43 | 44 | var opts convertOpts 45 | opts.registerFlags(cmd) 46 | 47 | return cmd, func(ctx context.Context, log *slog.Logger) error { 48 | blkDir := filepath.Join(opts.conversion.tempDir, "blocks") 49 | 50 | start, end, err := getStartEnd(opts.conversion) 51 | if err != nil { 52 | return fmt.Errorf("unable to get start, end: %s", err) 53 | } 54 | 55 | tsdbBkt, err := setupBucket(log, opts.tsdbBucket) 56 | if err != nil { 57 | return fmt.Errorf("unable to setup tsdb bucket: %s", err) 58 | } 59 | parquetBkt, err := setupBucket(log, opts.parquetBucket) 60 | if err != nil { 61 | return fmt.Errorf("unable to setup parquet bucket: %s", err) 62 | } 63 | 64 | log.Info("Sorting by", "labels", opts.conversion.sortLabels) 65 | 66 | // TODO: this is kinda horrible logic here that is not reentrant or robust against errors 67 | // But for sake of getting started we can use it to convert the first blocks and then iterate 68 | 69 | log.Info("Fetching metas", "start", start, "end", end) 70 | metas, err := fetchTSDBMetas(ctx, tsdbBkt, start, end) 71 | if err != nil { 72 | return fmt.Errorf("unable to fetch tsdb metas: %s", err) 73 | } 74 | 75 | log.Info("Downloading blocks", "metas", metas) 76 | blocks, err := downloadBlocks(ctx, tsdbBkt, metas, blkDir) 77 | if err != nil { 78 | return fmt.Errorf("unable to download blocks: %s", err) 79 | } 80 | 81 | for next := start; next != end; next = next.AddDate(0, 0, 1) { 82 | log.Info("Converting next parquet file", "day", next) 83 | 84 | candidates := overlappingBlocks(blocks, next) 85 | if len(candidates) == 0 { 86 | continue 87 | } 88 | convOpts := []convert.ConvertOption{ 89 | convert.SortBufSize(opts.conversion.sortBufSize), 90 | convert.SortBy(opts.conversion.sortLabels), 91 | convert.BufferPool(parquet.NewFileBufferPool(opts.conversion.tempDir, "convert-*")), 92 | } 93 | if err := convert.ConvertTSDBBlock(ctx, parquetBkt, next, candidates, convOpts...); err != nil { 94 | return fmt.Errorf("unable to convert blocks for date %q: %s", next, err) 95 | } 96 | } 97 | return nil 98 | } 99 | } 100 | 101 | type conversionOpts struct { 102 | sortBufSize int 103 | sortLabels []string 104 | 105 | tempDir string 106 | 107 | start, end string 108 | } 109 | 110 | func (opts *conversionOpts) registerFlags(cmd *kingpin.CmdClause) { 111 | cmd.Flag("convert.start", "timestamp of the first 
parquet block to convert (rounded to start of day)").StringVar(&opts.start) 112 | cmd.Flag("convert.end", "timestamp of the last parquet block to convert (rounded to start of day)").StringVar(&opts.end) 113 | cmd.Flag("convert.tempdir", "directory for temporary state").StringVar(&opts.tempDir) 114 | cmd.Flag("convert.sortbuf", "size of sorting buffer").Default("64_000").IntVar(&opts.sortBufSize) 115 | cmd.Flag("convert.sorting.label", "label to sort by").Default("__name__").StringsVar(&opts.sortLabels) 116 | } 117 | 118 | func getStartEnd(opts conversionOpts) (time.Time, time.Time, error) { 119 | from, err := time.Parse(time.RFC3339, opts.start) 120 | if err != nil { 121 | return time.Time{}, time.Time{}, fmt.Errorf("unable to parse start: %w", err) 122 | } 123 | to, err := time.Parse(time.RFC3339, opts.end) 124 | if err != nil { 125 | return time.Time{}, time.Time{}, fmt.Errorf("unable to parse end: %w", err) 126 | } 127 | return util.BeginOfDay(from), util.BeginOfDay(to), nil 128 | } 129 | 130 | func fetchTSDBMetas(ctx context.Context, bkt objstore.BucketReader, from, to time.Time) ([]tsdb.BlockMeta, error) { 131 | metas := make([]tsdb.BlockMeta, 0) 132 | err := bkt.Iter(ctx, "", func(name string) error { 133 | split := strings.Split(name, "/") 134 | f := split[len(split)-1] 135 | if f != "meta.json" { 136 | return nil 137 | } 138 | content, err := bkt.Get(ctx, name) 139 | if err != nil { 140 | return err 141 | } 142 | defer content.Close() 143 | 144 | var m tsdb.BlockMeta 145 | if err := json.NewDecoder(content).Decode(&m); err != nil { 146 | return err 147 | } 148 | 149 | metas = append(metas, m) 150 | 151 | return nil 152 | }, objstore.WithRecursiveIter()) 153 | if err != nil { 154 | return nil, fmt.Errorf("unable to fetch metas: %w", err) 155 | } 156 | 157 | startMillis := util.BeginOfDay(from).UnixMilli() 158 | endMillis := util.EndOfDay(to).UnixMilli() 159 | 160 | res := make([]tsdb.BlockMeta, 0) 161 | for _, m := range metas { 162 | if endMillis >= m.MinTime && startMillis <= m.MaxTime { 163 | res = append(res, m) 164 | } 165 | } 166 | sort.Slice(res, func(i, j int) bool { 167 | return res[i].MinTime < res[j].MinTime 168 | }) 169 | 170 | return res, nil 171 | } 172 | 173 | func overlappingBlocks(blocks []*tsdb.Block, date time.Time) []convert.Convertible { 174 | dateStartMillis := date.UnixMilli() 175 | dateEndMillis := date.AddDate(0, 0, 1).UnixMilli() 176 | 177 | res := make([]convert.Convertible, 0) 178 | for _, blk := range blocks { 179 | m := blk.Meta() 180 | if dateEndMillis >= m.MinTime && dateStartMillis <= m.MaxTime { 181 | res = append(res, blk) 182 | } 183 | } 184 | return res 185 | } 186 | 187 | func downloadBlocks(ctx context.Context, bkt objstore.BucketReader, metas []tsdb.BlockMeta, blkDir string) ([]*tsdb.Block, error) { 188 | logAdapter := slogAdapter{log: slog.New(slog.NewJSONHandler(io.Discard, nil))} 189 | 190 | opts := []objstore.DownloadOption{objstore.WithFetchConcurrency(runtime.GOMAXPROCS(0))} 191 | res := make([]*tsdb.Block, 0) 192 | for _, m := range metas { 193 | src := m.ULID.String() 194 | dst := filepath.Join(blkDir, src) 195 | 196 | if err := objstore.DownloadDir(ctx, logAdapter, bkt, src, src, dst, opts...); err != nil { 197 | return nil, fmt.Errorf("unable to download %q: %s", src, err) 198 | } 199 | blk, err := tsdb.OpenBlock(logAdapter, dst, chunkenc.NewPool()) 200 | if err != nil { 201 | return nil, fmt.Errorf("unable to open block %q: %s", src, err) 202 | } 203 | res = append(res, blk) 204 | } 205 | return res, nil 206 | } 207 | 
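The day partitioning above relies on a simple interval intersection: block metas carry MinTime/MaxTime in milliseconds, and fetchTSDBMetas and overlappingBlocks both test them as a closed interval against the day's start and end. A minimal sketch of that predicate follows; the overlaps helper is illustrative and not part of this repository (note that Prometheus treats a block's MaxTime as exclusive, so a block ending exactly at midnight still matches here; the comparisons err on the inclusive side):

    // overlaps reports whether the closed interval [aMin, aMax]
    // intersects the closed interval [bMin, bMax]: two intervals
    // intersect iff each one starts no later than the other ends.
    func overlaps(aMin, aMax, bMin, bMax int64) bool {
        return aMax >= bMin && aMin <= bMax
    }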
-------------------------------------------------------------------------------- /cmd/serve.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package main 6 | 7 | import ( 8 | "context" 9 | "fmt" 10 | "log/slog" 11 | "math" 12 | "net" 13 | "net/http" 14 | "net/http/pprof" 15 | "time" 16 | 17 | "github.com/alecthomas/units" 18 | "github.com/oklog/run" 19 | "github.com/prometheus/client_golang/prometheus" 20 | "github.com/prometheus/client_golang/prometheus/promhttp" 21 | "github.com/prometheus/prometheus/model/labels" 22 | "github.com/prometheus/prometheus/promql" 23 | "google.golang.org/grpc" 24 | "google.golang.org/grpc/reflection" 25 | "gopkg.in/alecthomas/kingpin.v2" 26 | 27 | "github.com/thanos-io/objstore" 28 | "github.com/thanos-io/promql-engine/engine" 29 | "github.com/thanos-io/thanos/pkg/api/query/querypb" 30 | "github.com/thanos-io/thanos/pkg/info/infopb" 31 | "github.com/thanos-io/thanos/pkg/runutil" 32 | 33 | _ "github.com/mostynb/go-grpc-compression/snappy" 34 | 35 | cfgrpc "github.com/cloudflare/parquet-tsdb-poc/api/grpc" 36 | cfhttp "github.com/cloudflare/parquet-tsdb-poc/api/http" 37 | cfdb "github.com/cloudflare/parquet-tsdb-poc/db" 38 | ) 39 | 40 | type serveOpts struct { 41 | block blockOpts 42 | bucket bucketOpts 43 | tracing tracingOpts 44 | 45 | query queryOpts 46 | promAPI promAPIOpts 47 | thanosAPI thanosAPIOpts 48 | internalAPI internalAPIOpts 49 | } 50 | 51 | func (opts *serveOpts) registerFlags(cmd *kingpin.CmdClause) { 52 | opts.block.registerFlags(cmd) 53 | opts.bucket.registerFlags(cmd) 54 | opts.tracing.registerFlags(cmd) 55 | opts.query.registerFlags(cmd) 56 | opts.promAPI.registerFlags(cmd) 57 | opts.thanosAPI.registerFlags(cmd) 58 | opts.internalAPI.registerFlags(cmd) 59 | } 60 | 61 | func registerServeApp(app *kingpin.Application) (*kingpin.CmdClause, func(context.Context, *slog.Logger, *prometheus.Registry) error) { 62 | cmd := app.Command("serve", "serve Prometheus HTTP and thanos gRPC from parquet files in object storage") 63 | 64 | var opts serveOpts 65 | opts.registerFlags(cmd) 66 | 67 | return cmd, func(ctx context.Context, log *slog.Logger, reg *prometheus.Registry) error { 68 | var g run.Group 69 | 70 | setupInterrupt(ctx, &g, log) 71 | 72 | if err := setupTracing(ctx, opts.tracing); err != nil { 73 | return fmt.Errorf("unable to setup tracing: %w", err) 74 | } 75 | 76 | bkt, err := setupBucket(log, opts.bucket) 77 | if err != nil { 78 | return fmt.Errorf("unable to setup bucket: %w", err) 79 | } 80 | 81 | discoverer, err := setupDiscovery(ctx, &g, log, bkt, opts.block) 82 | if err != nil { 83 | return fmt.Errorf("unable to setup discovery: %w", err) 84 | } 85 | 86 | metaFilter, err := setupMetaFilter(ctx, &g, log, opts.block) 87 | if err != nil { 88 | return fmt.Errorf("unable to set up meta filter: %w", err) 89 | } 90 | 91 | syncer, err := setupSyncer(ctx, &g, log, bkt, discoverer, metaFilter, opts.block) 92 | if err != nil { 93 | return fmt.Errorf("unable to setup syncer: %w", err) 94 | } 95 | 96 | db := cfdb.NewDB( 97 | syncer, 98 | cfdb.ExternalLabels(labels.FromMap(opts.query.externalLabels)), 99 | ) 100 | 101 | setupPromAPI(&g, log, db, opts.promAPI, opts.query) 102 | setupThanosAPI(&g, log, db, opts.thanosAPI, opts.query) 103 | setupInternalAPI(&g, log, reg, opts.internalAPI) 104 | 105 | return g.Run() 106 | } 107 | } 108 | 109 | 
func setupInterrupt(ctx context.Context, g *run.Group, log *slog.Logger) { 110 | ctx, cancel := context.WithCancel(ctx) 111 | g.Add(func() error { 112 | <-ctx.Done() 113 | log.Info("Canceling actors") 114 | return nil 115 | }, func(error) { 116 | cancel() 117 | }) 118 | } 119 | 120 | type queryOpts struct { 121 | defaultStep time.Duration 122 | defaultLookback time.Duration 123 | defaultTimeout time.Duration 124 | externalLabels map[string]string 125 | } 126 | 127 | func (opts *queryOpts) registerFlags(cmd *kingpin.CmdClause) { 128 | // We need to initialize the externalLabels map here 129 | opts.externalLabels = make(map[string]string) 130 | 131 | cmd.Flag("query.step", "default step for range queries").Default("30s").DurationVar(&opts.defaultStep) 132 | cmd.Flag("query.lookback", "default lookback for queries").Default("5m").DurationVar(&opts.defaultLookback) 133 | cmd.Flag("query.timeout", "default timeout for queries").Default("30s").DurationVar(&opts.defaultTimeout) 134 | cmd.Flag("query.external-label", "external label to add to results").StringMapVar(&opts.externalLabels) 135 | } 136 | 137 | func engineFromQueryOpts(opts queryOpts) promql.QueryEngine { 138 | return engine.New(engine.Opts{ 139 | DisableFallback: false, 140 | DisableDuplicateLabelChecks: true, 141 | EngineOpts: promql.EngineOpts{ 142 | Logger: nil, 143 | Reg: nil, 144 | MaxSamples: 10_000_000, 145 | Timeout: opts.defaultTimeout, 146 | NoStepSubqueryIntervalFn: func(int64) int64 { return time.Minute.Milliseconds() }, 147 | EnableAtModifier: true, 148 | EnableNegativeOffset: true, 149 | EnablePerStepStats: true, 150 | LookbackDelta: opts.defaultLookback, 151 | EnableDelayedNameRemoval: false, 152 | }, 153 | }) 154 | 155 | } 156 | 157 | type blockOpts struct { 158 | discoveryInterval time.Duration 159 | discoveryConcurrency int 160 | 161 | syncerInterval time.Duration 162 | syncerConcurrency int 163 | syncerReadBufferSize units.Base2Bytes 164 | 165 | filterType string 166 | filterThanosBackfillEndpoint string 167 | filterThanosBackfillUpdateInterval time.Duration 168 | } 169 | 170 | func (opts *blockOpts) registerFlags(cmd *kingpin.CmdClause) { 171 | cmd.Flag("block.discovery.interval", "interval to discover blocks").Default("1m").DurationVar(&opts.discoveryInterval) 172 | cmd.Flag("block.discovery.concurrency", "concurrency for loading metadata").Default("1").IntVar(&opts.discoveryConcurrency) 173 | cmd.Flag("block.syncer.interval", "interval to sync blocks").Default("1m").DurationVar(&opts.syncerInterval) 174 | cmd.Flag("block.syncer.concurrency", "concurrency for loading blocks").Default("1").IntVar(&opts.syncerConcurrency) 175 | cmd.Flag("block.syncer.read-buffer-size", "read buffer size for blocks").Default("2MiB").BytesVar(&opts.syncerReadBufferSize) 176 | cmd.Flag("block.filter.type", "").Default("all-metas").EnumVar(&opts.filterType, "thanos-backfill", "all-metas") 177 | cmd.Flag("block.filter.thanos-backfill.endpoint", "endpoint to ignore for backfill").StringVar(&opts.filterThanosBackfillEndpoint) 178 | cmd.Flag("block.filter.thanos-backfill.interval", "interval to update thanos-backfill timerange").Default("1m").DurationVar(&opts.filterThanosBackfillUpdateInterval) 179 | } 180 | 181 | func setupDiscovery(ctx context.Context, g *run.Group, log *slog.Logger, bkt objstore.Bucket, opts blockOpts) (*cfdb.Discoverer, error) { 182 | discoverer := cfdb.NewDiscoverer(bkt, cfdb.MetaConcurrency(opts.discoveryConcurrency)) 183 | 184 | log.Info("Running initial discovery") 185 | 186 | iterCtx, iterCancel := 
context.WithTimeout(ctx, opts.discoveryInterval) 187 | defer iterCancel() 188 | if err := discoverer.Discover(iterCtx); err != nil { 189 | return nil, fmt.Errorf("unable to run initial discovery: %w", err) 190 | } 191 | 192 | ctx, cancel := context.WithCancel(context.Background()) 193 | g.Add(func() error { 194 | return runutil.Repeat(opts.discoveryInterval, ctx.Done(), func() error { 195 | log.Debug("Running discovery") 196 | 197 | iterCtx, iterCancel := context.WithTimeout(ctx, opts.discoveryInterval) 198 | defer iterCancel() 199 | if err := discoverer.Discover(iterCtx); err != nil { 200 | log.Warn("Unable to discover new blocks", slog.Any("err", err)) 201 | } 202 | return nil 203 | }) 204 | }, func(error) { 205 | log.Info("Stopping discovery") 206 | cancel() 207 | }) 208 | return discoverer, nil 209 | } 210 | 211 | func setupMetaFilter(ctx context.Context, g *run.Group, log *slog.Logger, opts blockOpts) (cfdb.MetaFilter, error) { 212 | switch opts.filterType { 213 | case "all-metas": 214 | return cfdb.AllMetasMetaFilter, nil 215 | case "thanos-backfill": 216 | thanosBackfillMetaFilter := cfdb.NewThanosBackfillMetaFilter(opts.filterThanosBackfillEndpoint) 217 | 218 | log.Info("Initializing thanos-backfill meta filter") 219 | 220 | iterCtx, iterCancel := context.WithTimeout(ctx, opts.filterThanosBackfillUpdateInterval) 221 | defer iterCancel() 222 | if err := thanosBackfillMetaFilter.Update(iterCtx); err != nil { 223 | return nil, fmt.Errorf("unable to initialize thanos-backfill meta filter: %w", err) 224 | } 225 | 226 | ctx, cancel := context.WithCancel(context.Background()) 227 | g.Add(func() error { 228 | return runutil.Repeat(opts.filterThanosBackfillUpdateInterval, ctx.Done(), func() error { 229 | log.Debug("Updating thanos-backfill meta filter") 230 | 231 | iterCtx, iterCancel := context.WithTimeout(ctx, opts.filterThanosBackfillUpdateInterval) 232 | defer iterCancel() 233 | if err := thanosBackfillMetaFilter.Update(iterCtx); err != nil { 234 | log.Warn("Unable to update thanos-backfill meta filter", slog.Any("err", err)) 235 | } 236 | return nil 237 | }) 238 | }, func(error) { 239 | log.Info("Stopping thanos-backfill meta filter updates") 240 | cancel() 241 | }) 242 | return thanosBackfillMetaFilter, nil 243 | default: 244 | return nil, fmt.Errorf("unknown meta filter type: %s", opts.filterType) 245 | } 246 | } 247 | 248 | func setupSyncer(ctx context.Context, g *run.Group, log *slog.Logger, bkt objstore.Bucket, discoverer *cfdb.Discoverer, metaFilter cfdb.MetaFilter, opts blockOpts) (*cfdb.Syncer, error) { 249 | syncer := cfdb.NewSyncer( 250 | bkt, 251 | cfdb.FilterMetas(metaFilter), 252 | cfdb.BlockConcurrency(opts.syncerConcurrency), 253 | cfdb.BlockOptions( 254 | cfdb.ReadBufferSize(opts.syncerReadBufferSize), 255 | ), 256 | ) 257 | 258 | log.Info("Running initial sync") 259 | 260 | iterCtx, iterCancel := context.WithTimeout(ctx, opts.syncerInterval) 261 | defer iterCancel() 262 | if err := syncer.Sync(iterCtx, discoverer.Metas()); err != nil { 263 | return nil, fmt.Errorf("unable to run initial sync: %w", err) 264 | } 265 | 266 | ctx, cancel := context.WithCancel(context.Background()) 267 | g.Add(func() error { 268 | return runutil.Repeat(opts.syncerInterval, ctx.Done(), func() error { 269 | log.Debug("Running sync") 270 | 271 | iterCtx, iterCancel := context.WithTimeout(ctx, opts.syncerInterval) 272 | defer iterCancel() 273 | if err := syncer.Sync(iterCtx, discoverer.Metas()); err != nil { 274 | log.Warn("Unable to sync new blocks", slog.Any("err", err)) 275 | } 276 | 
return nil 277 | }) 278 | }, func(error) { 279 | log.Info("Stopping syncer") 280 | cancel() 281 | }) 282 | return syncer, nil 283 | } 284 | 285 | type thanosAPIOpts struct { 286 | port int 287 | 288 | shutdownTimeout time.Duration 289 | } 290 | 291 | func (opts *thanosAPIOpts) registerFlags(cmd *kingpin.CmdClause) { 292 | cmd.Flag("http.thanos.port", "port to host query api").Default("9001").IntVar(&opts.port) 293 | cmd.Flag("http.thanos.shutdown-timeout", "timeout on shutdown").Default("10s").DurationVar(&opts.shutdownTimeout) 294 | } 295 | 296 | func setupThanosAPI(g *run.Group, log *slog.Logger, db *cfdb.DB, opts thanosAPIOpts, qOpts queryOpts) { 297 | server := grpc.NewServer( 298 | grpc.MaxSendMsgSize(math.MaxInt32), 299 | grpc.MaxRecvMsgSize(math.MaxInt32), 300 | ) 301 | 302 | infopb.RegisterInfoServer(server, cfgrpc.NewInfoServer(db)) 303 | querypb.RegisterQueryServer(server, cfgrpc.NewQueryServer(db, engineFromQueryOpts(qOpts))) 304 | 305 | reflection.Register(server) 306 | 307 | g.Add(func() error { 308 | log.Info("Serving thanos api", slog.Int("port", opts.port)) 309 | 310 | l, err := net.Listen("tcp", fmt.Sprintf(":%d", opts.port)) 311 | if err != nil { 312 | return fmt.Errorf("unable to listen: %w", err) 313 | } 314 | return server.Serve(l) 315 | }, func(error) { 316 | log.Info("Shutting down thanos api", slog.Int("port", opts.port)) 317 | ctx, cancel := context.WithTimeout(context.Background(), opts.shutdownTimeout) 318 | defer cancel() 319 | 320 | stopped := make(chan struct{}) 321 | go func() { 322 | server.GracefulStop() 323 | close(stopped) 324 | }() 325 | 326 | select { 327 | case <-ctx.Done(): 328 | server.Stop() 329 | return 330 | case <-stopped: 331 | cancel() 332 | } 333 | }) 334 | } 335 | 336 | type promAPIOpts struct { 337 | port int 338 | 339 | shutdownTimeout time.Duration 340 | } 341 | 342 | func (opts *promAPIOpts) registerFlags(cmd *kingpin.CmdClause) { 343 | cmd.Flag("http.prometheus.port", "port to host query api").Default("9000").IntVar(&opts.port) 344 | cmd.Flag("http.prometheus.shutdown-timeout", "timeout on shutdown").Default("10s").DurationVar(&opts.shutdownTimeout) 345 | } 346 | 347 | func setupPromAPI(g *run.Group, log *slog.Logger, db *cfdb.DB, opts promAPIOpts, qOpts queryOpts) { 348 | handler := cfhttp.NewAPI(db.Queryable(), engineFromQueryOpts(qOpts), 349 | cfhttp.QueryOptions( 350 | cfhttp.DefaultStep(qOpts.defaultStep), 351 | cfhttp.DefaultLookback(qOpts.defaultLookback), 352 | cfhttp.DefaultTimeout(qOpts.defaultTimeout), 353 | )) 354 | 355 | server := &http.Server{Addr: fmt.Sprintf(":%d", opts.port), Handler: handler} 356 | g.Add(func() error { 357 | log.Info("Serving prometheus api", slog.Int("port", opts.port)) 358 | if err := server.ListenAndServe(); err != http.ErrServerClosed { 359 | return err 360 | } 361 | return nil 362 | }, func(error) { 363 | log.Info("Shutting down prometheus api", slog.Int("port", opts.port)) 364 | ctx, cancel := context.WithTimeout(context.Background(), opts.shutdownTimeout) 365 | defer cancel() 366 | 367 | if err := server.Shutdown(ctx); err != nil { 368 | log.Error("Error shutting down prometheus server", slog.Any("err", err)) 369 | } 370 | }) 371 | } 372 | 373 | type internalAPIOpts struct { 374 | port int 375 | 376 | shutdownTimeout time.Duration 377 | } 378 | 379 | func (opts *internalAPIOpts) registerFlags(cmd *kingpin.CmdClause) { 380 | cmd.Flag("http.internal.port", "port to host query api").Default("6060").IntVar(&opts.port) 381 | cmd.Flag("http.internal.shutdown-timeout", "timeout on 
shutdown").Default("10s").DurationVar(&opts.shutdownTimeout) 382 | } 383 | 384 | func setupInternalAPI(g *run.Group, log *slog.Logger, reg *prometheus.Registry, opts internalAPIOpts) { 385 | mux := http.NewServeMux() 386 | mux.HandleFunc("/debug/pprof/", pprof.Index) 387 | mux.HandleFunc("/debug/pprof/profile", pprof.Profile) 388 | mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol) 389 | mux.HandleFunc("/debug/pprof/trace", pprof.Trace) 390 | mux.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{})) 391 | 392 | mux.HandleFunc("/-/healthy", func(w http.ResponseWriter, _ *http.Request) { 393 | w.WriteHeader(http.StatusOK) 394 | fmt.Fprintf(w, "OK") 395 | }) 396 | mux.HandleFunc("/-/ready", func(w http.ResponseWriter, _ *http.Request) { 397 | w.WriteHeader(http.StatusOK) 398 | fmt.Fprintf(w, "OK") 399 | }) 400 | 401 | server := &http.Server{Addr: fmt.Sprintf(":%d", opts.port), Handler: mux} 402 | g.Add(func() error { 403 | log.Info("Serving internal api", slog.Int("port", opts.port)) 404 | if err := server.ListenAndServe(); err != http.ErrServerClosed { 405 | return err 406 | } 407 | return nil 408 | }, func(error) { 409 | log.Info("Shutting down internal api", slog.Int("port", opts.port)) 410 | ctx, cancel := context.WithTimeout(context.Background(), opts.shutdownTimeout) 411 | defer cancel() 412 | 413 | if err := server.Shutdown(ctx); err != nil { 414 | log.Error("Error shutting down internal server", slog.Any("err", err)) 415 | } 416 | }) 417 | } 418 | -------------------------------------------------------------------------------- /convert/chunks.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package convert 6 | 7 | import ( 8 | "encoding/binary" 9 | "fmt" 10 | "sort" 11 | "time" 12 | 13 | "github.com/prometheus/prometheus/tsdb/chunks" 14 | 15 | "github.com/cloudflare/parquet-tsdb-poc/internal/encoding" 16 | "github.com/cloudflare/parquet-tsdb-poc/schema" 17 | ) 18 | 19 | func allChunksEmpty(chkBytes [schema.ChunkColumnsPerDay][]byte) bool { 20 | for _, chk := range chkBytes { 21 | if len(chk) != 0 { 22 | return false 23 | } 24 | } 25 | return true 26 | } 27 | 28 | func collectChunks(it chunks.Iterator) ([schema.ChunkColumnsPerDay][]byte, error) { 29 | var ( 30 | res [schema.ChunkColumnsPerDay][]byte 31 | ) 32 | // NOTE: 'it' should hold chunks for one day. Chunks are usually length 2h so we should get 12 of them. 33 | chunks := make([]chunks.Meta, 0, 12) 34 | for it.Next() { 35 | chunks = append(chunks, it.At()) 36 | } 37 | if err := it.Err(); err != nil { 38 | return res, fmt.Errorf("unable to iterate chunks: %w", err) 39 | } 40 | // NOTE: we need to sort chunks here as they come from different blocks that we merged. 41 | // Prometheus does not guarantee that they are sorted. We have to sort them either here or 42 | // before submitting them to the query engine. 
43 | sort.Slice(chunks, func(i, j int) bool { 44 | return chunks[i].MinTime < chunks[j].MinTime 45 | }) 46 | for _, chk := range chunks { 47 | enc, bs := chk.Chunk.Encoding(), chk.Chunk.Bytes() 48 | 49 | hour := time.UnixMilli(chk.MinTime).UTC().Hour() 50 | chkIdx := (hour / int(schema.ChunkColumnLength.Hours())) % schema.ChunkColumnsPerDay 51 | chkBytes := res[chkIdx] 52 | chkBytes = binary.BigEndian.AppendUint32(chkBytes, uint32(enc)) 53 | chkBytes = binary.BigEndian.AppendUint64(chkBytes, encoding.ZigZagEncode(chk.MinTime)) 54 | chkBytes = binary.BigEndian.AppendUint64(chkBytes, encoding.ZigZagEncode(chk.MaxTime)) 55 | chkBytes = binary.BigEndian.AppendUint32(chkBytes, uint32(len(bs))) 56 | chkBytes = append(chkBytes, bs...) 57 | res[chkIdx] = chkBytes 58 | } 59 | return res, nil 60 | } 61 | -------------------------------------------------------------------------------- /convert/convert.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package convert 6 | 7 | import ( 8 | "bytes" 9 | "context" 10 | "fmt" 11 | "io" 12 | "time" 13 | 14 | "github.com/parquet-go/parquet-go" 15 | "github.com/prometheus/prometheus/model/labels" 16 | "github.com/prometheus/prometheus/tsdb" 17 | "github.com/prometheus/prometheus/tsdb/tombstones" 18 | "github.com/thanos-io/objstore" 19 | "google.golang.org/protobuf/proto" 20 | 21 | "github.com/cloudflare/parquet-tsdb-poc/internal/util" 22 | "github.com/cloudflare/parquet-tsdb-poc/proto/metapb" 23 | "github.com/cloudflare/parquet-tsdb-poc/schema" 24 | ) 25 | 26 | type Convertible interface { 27 | Index() (tsdb.IndexReader, error) 28 | Chunks() (tsdb.ChunkReader, error) 29 | Tombstones() (tombstones.Reader, error) 30 | Meta() tsdb.BlockMeta 31 | } 32 | 33 | type convertOpts struct { 34 | numRowGroups int 35 | rowGroupSize int 36 | sortBufSize int 37 | 38 | bufferPool parquet.BufferPool 39 | 40 | sortingColumns [][]string 41 | bloomfilterColumns [][]string 42 | } 43 | 44 | func (cfg convertOpts) buildBloomfilterColumns() []parquet.BloomFilterColumn { 45 | cols := make([]parquet.BloomFilterColumn, 0, len(cfg.bloomfilterColumns)) 46 | 47 | for i := range cfg.bloomfilterColumns { 48 | cols = append(cols, 49 | parquet.SplitBlockFilter(10, cfg.bloomfilterColumns[i]...)) 50 | } 51 | return cols 52 | } 53 | 54 | func (cfg convertOpts) buildSortingColumns() []parquet.SortingColumn { 55 | cols := make([]parquet.SortingColumn, 0, len(cfg.sortingColumns)) 56 | 57 | for i := range cfg.sortingColumns { 58 | cols = append(cols, 59 | parquet.Ascending(cfg.sortingColumns[i]...)) 60 | } 61 | return cols 62 | } 63 | 64 | type ConvertOption func(*convertOpts) 65 | 66 | func RowGroupSize(rbs int) ConvertOption { 67 | return func(opts *convertOpts) { 68 | opts.rowGroupSize = rbs 69 | } 70 | } 71 | 72 | func SortBufSize(sbs int) ConvertOption { 73 | return func(opts *convertOpts) { 74 | opts.sortBufSize = sbs 75 | } 76 | } 77 | 78 | func SortBy(labels []string) ConvertOption { 79 | return func(opts *convertOpts) { 80 | sortingColumns := make([][]string, len(labels)) 81 | for i := range labels { 82 | sortingColumns[i] = []string{schema.LabelNameToColumn(labels[i])} 83 | } 84 | opts.sortingColumns = sortingColumns 85 | } 86 | } 87 | 88 | func BufferPool(p parquet.BufferPool) ConvertOption { 89 | return func(opts *convertOpts) { 90 | opts.bufferPool = p 91 | } 92 | } 93 | 94 | 
func ConvertTSDBBlock( 95 | ctx context.Context, 96 | bkt objstore.Bucket, 97 | day time.Time, 98 | blks []Convertible, 99 | opts ...ConvertOption, 100 | ) error { 101 | cfg := &convertOpts{ 102 | rowGroupSize: 1_000_000, 103 | numRowGroups: 6, 104 | sortBufSize: 128_000, 105 | bufferPool: parquet.NewBufferPool(), 106 | sortingColumns: [][]string{{schema.LabelNameToColumn(labels.MetricName)}}, 107 | bloomfilterColumns: [][]string{{schema.LabelNameToColumn(labels.MetricName)}}, 108 | } 109 | for i := range opts { 110 | opts[i](cfg) 111 | } 112 | start, end := util.BeginOfDay(day), util.EndOfDay(day) 113 | name, err := schema.BlockNameForDay(start) 114 | if err != nil { 115 | return fmt.Errorf("unable to get block name: %s", err) 116 | } 117 | rr, err := newIndexRowReader(ctx, start.UnixMilli(), end.UnixMilli(), blks) 118 | if err != nil { 119 | return fmt.Errorf("unable to create index rowreader: %s", err) 120 | } 121 | defer rr.Close() 122 | 123 | converter := newConverter( 124 | name, 125 | start.UnixMilli(), 126 | end.UnixMilli(), 127 | rr, 128 | bkt, 129 | cfg.bufferPool, 130 | cfg.sortBufSize, 131 | cfg.rowGroupSize, 132 | cfg.numRowGroups, 133 | cfg.buildSortingColumns(), 134 | cfg.buildBloomfilterColumns(), 135 | ) 136 | 137 | if err := converter.convert(ctx); err != nil { 138 | return fmt.Errorf("unable to convert block: %s", err) 139 | } 140 | return nil 141 | } 142 | 143 | type converter struct { 144 | name string 145 | mint, maxt int64 146 | 147 | currentShard int 148 | seriesPerShard int 149 | sortBufSize int 150 | rowGroupSize int 151 | numRowGroups int 152 | 153 | bkt objstore.Bucket 154 | 155 | rr *indexRowReader 156 | p parquet.BufferPool 157 | 158 | sortingColumns []parquet.SortingColumn 159 | bloomfilterColumns []parquet.BloomFilterColumn 160 | } 161 | 162 | func newConverter( 163 | name string, 164 | mint int64, 165 | maxt int64, 166 | rr *indexRowReader, 167 | bkt objstore.Bucket, 168 | p parquet.BufferPool, 169 | sortBufSize int, 170 | rowGroupSize int, 171 | numRowGroups int, 172 | sortingColumns []parquet.SortingColumn, 173 | bloomfilterColumns []parquet.BloomFilterColumn, 174 | 175 | ) *converter { 176 | return &converter{ 177 | name: name, 178 | mint: mint, 179 | maxt: maxt, 180 | 181 | bkt: bkt, 182 | rr: rr, 183 | p: p, 184 | 185 | rowGroupSize: rowGroupSize, 186 | numRowGroups: numRowGroups, 187 | sortBufSize: sortBufSize, 188 | sortingColumns: sortingColumns, 189 | bloomfilterColumns: bloomfilterColumns, 190 | } 191 | } 192 | func (c *converter) convert(ctx context.Context) error { 193 | if err := c.convertShards(ctx); err != nil { 194 | return fmt.Errorf("unable to convert shards: %w", err) 195 | } 196 | if err := c.writeMetaFile(ctx); err != nil { 197 | return fmt.Errorf("unable to write meta file: %w", err) 198 | } 199 | return nil 200 | } 201 | 202 | func (c *converter) writeMetaFile(ctx context.Context) error { 203 | meta := &metapb.Metadata{ 204 | ColumnsForName: make(map[string]*metapb.Columns), 205 | Mint: c.mint, 206 | Maxt: c.maxt, 207 | Shards: int64(c.currentShard) + 1, 208 | } 209 | for k, v := range c.rr.NameLabelMapping() { 210 | cols := &metapb.Columns{Columns: make([]string, 0, len(v))} 211 | for lbl := range v { 212 | cols.Columns = append(cols.Columns, lbl) 213 | } 214 | meta.ColumnsForName[k] = cols 215 | } 216 | 217 | metaBytes, err := proto.Marshal(meta) 218 | if err != nil { 219 | return fmt.Errorf("unable to marshal meta bytes: %s", err) 220 | } 221 | if err := c.bkt.Upload(ctx, schema.MetaFileNameForBlock(c.name), 
bytes.NewReader(metaBytes)); err != nil { 222 | return fmt.Errorf("unable to upload meta file: %s", err) 223 | } 224 | 225 | return nil 226 | } 227 | 228 | func (c *converter) convertShards(ctx context.Context) error { 229 | for { 230 | if ok, err := c.convertShard(ctx); err != nil { 231 | return fmt.Errorf("unable to convert shard: %s", err) 232 | } else if !ok { 233 | break 234 | } 235 | } 236 | return nil 237 | } 238 | 239 | func (c *converter) convertShard(ctx context.Context) (bool, error) { 240 | s := c.rr.Schema() 241 | rowsToWrite := c.numRowGroups * c.rowGroupSize 242 | 243 | in := c.p.GetBuffer() 244 | defer c.p.PutBuffer(in) 245 | 246 | sw := newSortingWriter(in, c.p, s, c.sortBufSize, c.sortingColumns...) 247 | n, err := parquet.CopyRows(sw, newLimitReader(c.rr, rowsToWrite)) 248 | if err != nil { 249 | return false, fmt.Errorf("unable to copy rows to sorting writer: %s", err) 250 | } 251 | if err := sw.Flush(); err != nil { 252 | return false, fmt.Errorf("unable to flush sorting writer: %s", err) 253 | } 254 | 255 | if err := c.writeShardLabelsPfile(ctx, sw, c.currentShard); err != nil { 256 | return false, fmt.Errorf("unable to write label parquetfile %d: %s", c.currentShard, err) 257 | } 258 | if err := c.writeShardChunksPfile(ctx, sw, c.currentShard); err != nil { 259 | return false, fmt.Errorf("unable to write chunks parquetfile %d: %s", c.currentShard, err) 260 | } 261 | if n < int64(rowsToWrite) { 262 | return false, nil 263 | } 264 | c.currentShard++ 265 | return true, nil 266 | } 267 | 268 | func (c *converter) writeShardLabelsPfile( 269 | ctx context.Context, 270 | sw *sortingWriter, 271 | shard int, 272 | ) error { 273 | out := c.p.GetBuffer() 274 | defer c.p.PutBuffer(out) 275 | 276 | inSchema := c.rr.Schema() 277 | outSchema := schema.LabelsProjection(schema.WithCompression(inSchema)) 278 | 279 | conv, err := parquet.Convert(outSchema, inSchema) 280 | if err != nil { 281 | return fmt.Errorf("unable to convert schemas: %w", err) 282 | } 283 | 284 | sr, err := sw.RowReader() 285 | if err != nil { 286 | return fmt.Errorf("unable to get sorted row reader: %s", err) 287 | } 288 | 289 | cr := parquet.ConvertRowReader(sr, conv) 290 | 291 | writer := parquet.NewGenericWriter[any](out, outSchema, parquet.BloomFilters(c.bloomfilterColumns...)) 292 | if _, err := parquet.CopyRows(newFlushingWriter(writer, c.rowGroupSize), cr); err != nil { 293 | return fmt.Errorf("unable to copy rows: %s", err) 294 | } 295 | if err := writer.Close(); err != nil { 296 | return fmt.Errorf("unable to close writer: %s", err) 297 | } 298 | if _, err := out.Seek(0, io.SeekStart); err != nil { 299 | return fmt.Errorf("unable to rewind temporary buffer: %s", err) 300 | } 301 | if err := c.bkt.Upload(ctx, schema.LabelsPfileNameForShard(c.name, shard), out); err != nil { 302 | return fmt.Errorf("unable to upload parquet file: %s", err) 303 | } 304 | 305 | return nil 306 | } 307 | 308 | func (c *converter) writeShardChunksPfile( 309 | ctx context.Context, 310 | sw *sortingWriter, 311 | shard int, 312 | ) error { 313 | out := c.p.GetBuffer() 314 | defer c.p.PutBuffer(out) 315 | 316 | inSchema := c.rr.Schema() 317 | outSchema := schema.ChunkProjection(schema.WithCompression(inSchema)) 318 | 319 | conv, err := parquet.Convert(outSchema, inSchema) 320 | if err != nil { 321 | return fmt.Errorf("unable to convert schemas: %w", err) 322 | } 323 | 324 | sr, err := sw.RowReader() 325 | if err != nil { 326 | return fmt.Errorf("unable to get sorted row reader: %s", err) 327 | } 328 | 329 | cr := parquet.ConvertRowReader(sr, conv) 
330 | 331 | writer := parquet.NewGenericWriter[any](out, outSchema, parquet.BloomFilters(c.bloomfilterColumns...)) 332 | if _, err := parquet.CopyRows(newFlushingWriter(writer, c.rowGroupSize), cr); err != nil { 333 | return fmt.Errorf("unable to copy rows: %s", err) 334 | } 335 | if err := writer.Close(); err != nil { 336 | return fmt.Errorf("unable to close writer: %s", err) 337 | } 338 | if _, err := out.Seek(0, io.SeekStart); err != nil { 339 | return fmt.Errorf("unable to rewind temporary buffer: %s", err) 340 | } 341 | if err := c.bkt.Upload(ctx, schema.ChunksPfileNameForShard(c.name, shard), out); err != nil { 342 | return fmt.Errorf("unable to upload parquet file: %s", err) 343 | } 344 | 345 | return nil 346 | } 347 | 348 | type rowWriterFlusher interface { 349 | parquet.RowWriter 350 | Flush() error 351 | } 352 | 353 | type flushingWriter struct { 354 | rowWriterFlusher 355 | flush int 356 | cur int 357 | } 358 | 359 | func newFlushingWriter(w rowWriterFlusher, flush int) parquet.RowWriter { 360 | return &flushingWriter{rowWriterFlusher: w, flush: flush} 361 | } 362 | 363 | func (fw *flushingWriter) WriteRows(buf []parquet.Row) (int, error) { 364 | n, err := fw.rowWriterFlusher.WriteRows(buf) 365 | if err != nil { 366 | return n, err 367 | } 368 | fw.cur += n 369 | 370 | if fw.cur > fw.flush { 371 | if err := fw.rowWriterFlusher.Flush(); err != nil { 372 | return n, err 373 | } 374 | fw.cur = 0 375 | } 376 | return n, err 377 | } 378 | 379 | type limitReader struct { 380 | parquet.RowReader 381 | limit int 382 | cur int 383 | } 384 | 385 | func newLimitReader(r parquet.RowReader, limit int) parquet.RowReader { 386 | return &limitReader{RowReader: r, limit: limit} 387 | } 388 | 389 | func (lr *limitReader) ReadRows(buf []parquet.Row) (int, error) { 390 | n, err := lr.RowReader.ReadRows(buf) 391 | if err != nil { 392 | return n, err 393 | } 394 | lr.cur += n 395 | 396 | if lr.cur > lr.limit { 397 | return n, io.EOF 398 | } 399 | return n, nil 400 | } 401 | -------------------------------------------------------------------------------- /convert/sort.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package convert 6 | 7 | import ( 8 | "container/heap" 9 | "fmt" 10 | "io" 11 | "slices" 12 | "sort" 13 | 14 | "github.com/parquet-go/parquet-go" 15 | ) 16 | 17 | type sortingWriter struct { 18 | in io.ReadWriteSeeker 19 | 20 | schema *parquet.Schema 21 | buffer *parquet.RowBuffer[any] 22 | writer *parquet.GenericWriter[any] 23 | cols []parquet.SortingColumn 24 | 25 | n int 26 | flushThreshold int 27 | } 28 | 29 | func newSortingWriter(in io.ReadWriteSeeker, p parquet.BufferPool, schema *parquet.Schema, flushThreshold int, cols ...parquet.SortingColumn) *sortingWriter { 30 | return &sortingWriter{ 31 | in: in, 32 | schema: schema, 33 | cols: cols, 34 | flushThreshold: flushThreshold, 35 | buffer: parquet.NewRowBuffer[any](schema, parquet.SortingRowGroupConfig( 36 | parquet.SortingColumns(cols...), 37 | parquet.SortingBuffers(p), 38 | ), 39 | ), 40 | writer: parquet.NewGenericWriter[any](in, schema, parquet.SortingWriterConfig( 41 | parquet.SortingColumns(cols...), 42 | parquet.SortingBuffers(p), 43 | ), 44 | ), 45 | } 46 | } 47 | 48 | var _ parquet.RowWriter = &sortingWriter{} 49 | 50 | func (w *sortingWriter) WriteRows(buf []parquet.Row) (int, error) { 51 | n, err := w.buffer.WriteRows(buf) 52 | if err != nil { 53 | return 0, err 54 | } 55 | w.n += n 56 | if w.n > w.flushThreshold { 57 | sort.Sort(w.buffer) 58 | rows := w.buffer.Rows() 59 | defer rows.Close() 60 | if _, err := parquet.CopyRows(w.writer, rows); err != nil { 61 | return 0, err 62 | } 63 | if err := w.writer.Flush(); err != nil { 64 | return 0, err 65 | } 66 | w.buffer.Reset() 67 | w.n = 0 68 | } 69 | return n, nil 70 | } 71 | 72 | func (w *sortingWriter) Flush() error { 73 | sort.Sort(w.buffer) 74 | rows := w.buffer.Rows() 75 | defer rows.Close() 76 | if _, err := parquet.CopyRows(w.writer, rows); err != nil { 77 | return err 78 | } 79 | return w.writer.Close() 80 | } 81 | 82 | func (w *sortingWriter) RowReader() (parquet.RowReader, error) { 83 | if _, err := w.in.Seek(0, io.SeekStart); err != nil { 84 | return nil, err 85 | } 86 | sz, err := sizeFromSeeker(w.in) 87 | if err != nil { 88 | return nil, err 89 | } 90 | pf, err := parquet.OpenFile(newReaderAt(w.in), sz) 91 | if err != nil { 92 | return nil, err 93 | } 94 | rrs := make([]parquet.RowReader, 0) 95 | for _, rg := range pf.RowGroups() { 96 | rrs = append(rrs, rg.Rows()) 97 | } 98 | return mergeRowReaders(rrs, w.schema.Comparator(w.cols...)), nil 99 | } 100 | 101 | // Taken from https://github.com/parquet-go/parquet-go/blob/main/merge.go 102 | // This was necessary to fix corruption that happened because the head was not cloned, though maybe we have been using the library wrong here and this is not actually necessary. 
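// The cloning mentioned above matters because a parquet.Row is a
// []parquet.Value slice header: bufferedRowReader reuses its internal buf
// array, so a row handed out aliases memory that a later read may overwrite.
// A small illustration of the aliasing (a sketch, not part of this file):
//
//	row := parquet.Row{parquet.ValueOf("a")}
//	alias := row         // shares the backing array
//	clone := row.Clone() // deep copy, safe to keep across reads
//	row[0] = parquet.ValueOf("b")
//	// alias[0] now reads "b" while clone[0] still holds "a"
//
// Hence ReadRows below clones r.head() before the heap advances.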
103 | type mergedRowReader struct { 104 | compare func(parquet.Row, parquet.Row) int 105 | readers []*bufferedRowReader 106 | initialized bool 107 | } 108 | 109 | func mergeRowReaders(readers []parquet.RowReader, compare func(parquet.Row, parquet.Row) int) *mergedRowReader { 110 | return &mergedRowReader{ 111 | compare: compare, 112 | readers: makeBufferedRowReaders(len(readers), func(i int) parquet.RowReader { return readers[i] }), 113 | } 114 | } 115 | 116 | func makeBufferedRowReaders(numReaders int, readerAt func(int) parquet.RowReader) []*bufferedRowReader { 117 | buffers := make([]bufferedRowReader, numReaders) 118 | readers := make([]*bufferedRowReader, numReaders) 119 | 120 | for i := range readers { 121 | buffers[i].rows = readerAt(i) 122 | readers[i] = &buffers[i] 123 | } 124 | return readers 125 | } 126 | 127 | func (m *mergedRowReader) initialize() error { 128 | for i, r := range m.readers { 129 | switch err := r.read(); err { 130 | case nil: 131 | case io.EOF: 132 | m.readers[i] = nil 133 | default: 134 | m.readers = nil 135 | return err 136 | } 137 | } 138 | 139 | n := 0 140 | for _, r := range m.readers { 141 | if r != nil { 142 | m.readers[n] = r 143 | n++ 144 | } 145 | } 146 | 147 | toclear := m.readers[n:] 148 | for i := range toclear { 149 | toclear[i] = nil 150 | } 151 | 152 | m.readers = m.readers[:n] 153 | heap.Init(m) 154 | return nil 155 | } 156 | 157 | func (m *mergedRowReader) Close() { 158 | for _, r := range m.readers { 159 | r.close() 160 | } 161 | m.readers = nil 162 | } 163 | 164 | func (m *mergedRowReader) ReadRows(rows []parquet.Row) (n int, err error) { 165 | if !m.initialized { 166 | m.initialized = true 167 | 168 | if err := m.initialize(); err != nil { 169 | return 0, err 170 | } 171 | } 172 | for n < len(rows) && len(m.readers) != 0 { 173 | r := m.readers[0] 174 | h := r.head().Clone() 175 | 176 | rows[n] = slices.Grow(rows[n], len(h))[:len(h)] 177 | copy(rows[n], h) 178 | n++ 179 | 180 | if err := r.next(); err != nil { 181 | if err != io.EOF { 182 | return n, err 183 | } 184 | heap.Pop(m) 185 | } else { 186 | heap.Fix(m, 0) 187 | } 188 | } 189 | 190 | if len(m.readers) == 0 { 191 | err = io.EOF 192 | } 193 | return n, err 194 | } 195 | 196 | func (m *mergedRowReader) Less(i, j int) bool { 197 | return m.compare(m.readers[i].head(), m.readers[j].head()) < 0 198 | } 199 | 200 | func (m *mergedRowReader) Len() int { 201 | return len(m.readers) 202 | } 203 | 204 | func (m *mergedRowReader) Swap(i, j int) { 205 | m.readers[i], m.readers[j] = m.readers[j], m.readers[i] 206 | } 207 | 208 | func (m *mergedRowReader) Push(_ interface{}) { 209 | panic("NOT IMPLEMENTED") 210 | } 211 | 212 | func (m *mergedRowReader) Pop() interface{} { 213 | i := len(m.readers) - 1 214 | r := m.readers[i] 215 | m.readers = m.readers[:i] 216 | return r 217 | } 218 | 219 | type bufferedRowReader struct { 220 | rows parquet.RowReader 221 | off int32 222 | end int32 223 | buf [64]parquet.Row 224 | } 225 | 226 | func (r *bufferedRowReader) head() parquet.Row { 227 | return r.buf[r.off] 228 | } 229 | 230 | func (r *bufferedRowReader) next() error { 231 | if r.off++; r.off == r.end { 232 | r.off = 0 233 | r.end = 0 234 | return r.read() 235 | } 236 | return nil 237 | } 238 | 239 | func (r *bufferedRowReader) read() error { 240 | if r.rows == nil { 241 | return io.EOF 242 | } 243 | n, err := r.rows.ReadRows(r.buf[r.end:]) 244 | if err != nil && n == 0 { 245 | return err 246 | } 247 | r.end += int32(n) 248 | return nil 249 | } 250 | 251 | func (r *bufferedRowReader) close() { 252 | 
r.rows = nil 253 | r.off = 0 254 | r.end = 0 255 | } 256 | 257 | func sizeFromSeeker(seek io.Seeker) (int64, error) { 258 | pos, err := seek.Seek(0, io.SeekCurrent) 259 | if err != nil { 260 | return 0, err 261 | } 262 | end, err := seek.Seek(0, io.SeekEnd) 263 | if err != nil { 264 | return 0, err 265 | } 266 | size := end - pos 267 | pos1, err := seek.Seek(pos, io.SeekStart) 268 | if err != nil { 269 | return 0, err 270 | } 271 | if pos1 != pos { 272 | return 0, fmt.Errorf("unable to restore seek position: %d != %d", pos1, pos) 273 | } 274 | return size, nil 275 | } 276 | 277 | type readerAt struct { 278 | reader io.ReadSeeker 279 | offset int64 280 | } 281 | 282 | func (r *readerAt) ReadAt(b []byte, off int64) (int, error) { 283 | if r.offset < 0 || off != r.offset { 284 | off, err := r.reader.Seek(off, io.SeekStart) 285 | if err != nil { 286 | return 0, err 287 | } 288 | r.offset = off 289 | } 290 | n, err := r.reader.Read(b) 291 | r.offset += int64(n) 292 | return n, err 293 | } 294 | 295 | func newReaderAt(r io.ReadSeeker) io.ReaderAt { 296 | if rr, ok := r.(io.ReaderAt); ok { 297 | return rr 298 | } 299 | return &readerAt{reader: r, offset: -1} 300 | } 301 | -------------------------------------------------------------------------------- /convert/tsdb.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package convert 6 | 7 | import ( 8 | "context" 9 | "fmt" 10 | "io" 11 | "slices" 12 | 13 | "github.com/hashicorp/go-multierror" 14 | "github.com/parquet-go/parquet-go" 15 | "github.com/prometheus/prometheus/model/labels" 16 | "github.com/prometheus/prometheus/storage" 17 | "github.com/prometheus/prometheus/tsdb" 18 | "github.com/prometheus/prometheus/tsdb/chunks" 19 | 20 | "github.com/cloudflare/parquet-tsdb-poc/schema" 21 | ) 22 | 23 | type indexRowReader struct { 24 | ctx context.Context 25 | 26 | closers []io.Closer 27 | 28 | seriesSet storage.ChunkSeriesSet 29 | 30 | rowBuilder *parquet.RowBuilder 31 | schema *parquet.Schema 32 | 33 | chunksColumn0 int 34 | chunksColumn1 int 35 | chunksColumn2 int 36 | 37 | m map[string]map[string]struct{} 38 | } 39 | 40 | var _ parquet.RowReader = &indexRowReader{} 41 | 42 | func newIndexRowReader(ctx context.Context, mint, maxt int64, blks []Convertible) (*indexRowReader, error) { 43 | var ( 44 | lbls = make([]string, 0) 45 | seriesSets = make([]storage.ChunkSeriesSet, 0, len(blks)) 46 | closers = make([]io.Closer, 0, len(blks)) 47 | ) 48 | for _, blk := range blks { 49 | indexr, err := blk.Index() 50 | if err != nil { 51 | return nil, fmt.Errorf("unable to get index reader from block: %s", err) 52 | } 53 | closers = append(closers, indexr) 54 | 55 | chunkr, err := blk.Chunks() 56 | if err != nil { 57 | return nil, fmt.Errorf("unable to get chunk reader from block: %s", err) 58 | } 59 | closers = append(closers, chunkr) 60 | 61 | tombsr, err := blk.Tombstones() 62 | if err != nil { 63 | return nil, fmt.Errorf("unable to get tombstone reader from block: %s", err) 64 | } 65 | closers = append(closers, tombsr) 66 | 67 | lblns, err := indexr.LabelNames(ctx) 68 | if err != nil { 69 | return nil, fmt.Errorf("unable to get label names from block: %s", err) 70 | } 71 | lbls = append(lbls, lblns...) 
72 | 73 | postings := tsdb.AllSortedPostings(ctx, indexr) 74 | seriesSet := tsdb.NewBlockChunkSeriesSet(blk.Meta().ULID, indexr, chunkr, tombsr, postings, mint, maxt, false) 75 | seriesSets = append(seriesSets, seriesSet) 76 | } 77 | slices.Sort(lbls) 78 | 79 | cseriesSet := storage.NewMergeChunkSeriesSet(seriesSets, storage.NewConcatenatingChunkSeriesMerger()) 80 | s := schema.BuildSchemaFromLabels(slices.Compact(lbls)) 81 | 82 | return &indexRowReader{ 83 | ctx: ctx, 84 | seriesSet: cseriesSet, 85 | closers: closers, 86 | 87 | rowBuilder: parquet.NewRowBuilder(s), 88 | schema: s, 89 | 90 | chunksColumn0: columnIDForKnownColumn(s, schema.ChunksColumn0), 91 | chunksColumn1: columnIDForKnownColumn(s, schema.ChunksColumn1), 92 | chunksColumn2: columnIDForKnownColumn(s, schema.ChunksColumn2), 93 | 94 | m: make(map[string]map[string]struct{}), 95 | }, nil 96 | } 97 | 98 | func columnIDForKnownColumn(schema *parquet.Schema, columnName string) int { 99 | lc, _ := schema.Lookup(columnName) 100 | return lc.ColumnIndex 101 | } 102 | 103 | func (rr *indexRowReader) Close() error { 104 | err := &multierror.Error{} 105 | for i := range rr.closers { 106 | err = multierror.Append(err, rr.closers[i].Close()) 107 | } 108 | return err.ErrorOrNil() 109 | } 110 | 111 | func (rr *indexRowReader) Schema() *parquet.Schema { 112 | return rr.schema 113 | } 114 | 115 | func (rr *indexRowReader) NameLabelMapping() map[string]map[string]struct{} { 116 | return rr.m 117 | } 118 | 119 | func (rr *indexRowReader) ReadRows(buf []parquet.Row) (int, error) { 120 | select { 121 | case <-rr.ctx.Done(): 122 | return 0, rr.ctx.Err() 123 | default: 124 | } 125 | 126 | var it chunks.Iterator 127 | 128 | i := 0 129 | for i < len(buf) && rr.seriesSet.Next() { 130 | rr.rowBuilder.Reset() 131 | s := rr.seriesSet.At() 132 | it = s.Iterator(it) 133 | 134 | chkBytes, err := collectChunks(it) 135 | if err != nil { 136 | return i, fmt.Errorf("unable to collect chunks: %s", err) 137 | } 138 | 139 | // skip series that have no chunks in the requested time 140 | if allChunksEmpty(chkBytes) { 141 | continue 142 | } 143 | 144 | metricName := s.Labels().Get(labels.MetricName) 145 | nameMap, ok := rr.m[metricName] 146 | if !ok { 147 | nameMap = make(map[string]struct{}) 148 | } 149 | rr.m[metricName] = nameMap 150 | s.Labels().Range(func(l labels.Label) { 151 | colName := schema.LabelNameToColumn(l.Name) 152 | lc, _ := rr.schema.Lookup(colName) 153 | rr.rowBuilder.Add(lc.ColumnIndex, parquet.ValueOf(l.Value)) 154 | if l.Name != labels.MetricName { 155 | nameMap[colName] = struct{}{} 156 | } 157 | }) 158 | 159 | for idx, chk := range chkBytes { 160 | if len(chk) == 0 { 161 | continue 162 | } 163 | switch idx { 164 | case 0: 165 | rr.rowBuilder.Add(rr.chunksColumn0, parquet.ValueOf(chk)) 166 | case 1: 167 | rr.rowBuilder.Add(rr.chunksColumn1, parquet.ValueOf(chk)) 168 | case 2: 169 | rr.rowBuilder.Add(rr.chunksColumn2, parquet.ValueOf(chk)) 170 | } 171 | } 172 | buf[i] = rr.rowBuilder.AppendRow(buf[i][:0]) 173 | i++ 174 | } 175 | if i < len(buf) { 176 | return i, io.EOF 177 | } 178 | return i, rr.seriesSet.Err() 179 | } 180 | -------------------------------------------------------------------------------- /db/block.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package db 6 | 7 | import ( 8 | "context" 9 | "fmt" 10 | "slices" 11 | 12 | "github.com/hashicorp/go-multierror" 13 | "github.com/prometheus/prometheus/model/labels" 14 | "github.com/prometheus/prometheus/storage" 15 | "github.com/prometheus/prometheus/util/annotations" 16 | ) 17 | 18 | type Block struct { 19 | meta Meta 20 | shards []*Shard 21 | } 22 | 23 | type Meta struct { 24 | Name string 25 | Mint, Maxt int64 26 | Shards int64 27 | ColumnsForName map[string][]string 28 | } 29 | 30 | func (blk *Block) Timerange() (int64, int64) { 31 | return blk.meta.Mint, blk.meta.Maxt 32 | } 33 | 34 | func (blk *Block) Queryable(extlabels labels.Labels, replicaLabelNames []string) storage.Queryable { 35 | qs := make([]storage.Queryable, 0, len(blk.shards)) 36 | for _, shard := range blk.shards { 37 | qs = append(qs, shard.Queryable(extlabels, replicaLabelNames)) 38 | } 39 | return &BlockQueryable{extlabels: extlabels, shards: qs} 40 | } 41 | 42 | type BlockQueryable struct { 43 | extlabels labels.Labels 44 | 45 | shards []storage.Queryable 46 | } 47 | 48 | func (q *BlockQueryable) Querier(mint, maxt int64) (storage.Querier, error) { 49 | qs := make([]storage.Querier, 0, len(q.shards)) 50 | for _, shard := range q.shards { 51 | q, err := shard.Querier(mint, maxt) 52 | if err != nil { 53 | return nil, fmt.Errorf("unable to get shard querier: %w", err) 54 | } 55 | qs = append(qs, q) 56 | } 57 | return &BlockQuerier{mint: mint, maxt: maxt, shards: qs}, nil 58 | } 59 | 60 | type BlockQuerier struct { 61 | mint, maxt int64 62 | 63 | shards []storage.Querier 64 | } 65 | 66 | func (q BlockQuerier) Close() error { 67 | var err *multierror.Error 68 | for _, q := range q.shards { 69 | err = multierror.Append(err, q.Close()) 70 | } 71 | return err.ErrorOrNil() 72 | } 73 | 74 | func (q BlockQuerier) LabelValues(ctx context.Context, name string, hints *storage.LabelHints, ms ...*labels.Matcher) ([]string, annotations.Annotations, error) { 75 | var annos annotations.Annotations 76 | 77 | res := make([]string, 0) 78 | for _, shrd := range q.shards { 79 | lvals, lannos, err := shrd.LabelValues(ctx, name, hints, ms...) 80 | if err != nil { 81 | return nil, nil, fmt.Errorf("unable to query label values for shard: %w", err) 82 | } 83 | annos = annos.Merge(lannos) 84 | res = append(res, lvals...) 85 | } 86 | 87 | slices.Sort(res) 88 | return slices.Compact(res), annos, nil 89 | } 90 | 91 | func (BlockQuerier) LabelNames(context.Context, *storage.LabelHints, ...*labels.Matcher) ([]string, annotations.Annotations, error) { 92 | // TODO 93 | return nil, nil, nil 94 | } 95 | 96 | func (q BlockQuerier) Select(ctx context.Context, sorted bool, hints *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet { 97 | sss := make([]storage.SeriesSet, 0, len(q.shards)) 98 | for _, q := range q.shards { 99 | sss = append(sss, q.Select(ctx, sorted, hints, matchers...)) 100 | } 101 | return newVerticalSeriesSet(sss...) 102 | } 103 | -------------------------------------------------------------------------------- /db/db.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package db 6 | 7 | import ( 8 | "context" 9 | "fmt" 10 | "math" 11 | "slices" 12 | 13 | "github.com/hashicorp/go-multierror" 14 | "github.com/prometheus/prometheus/model/labels" 15 | "github.com/prometheus/prometheus/storage" 16 | "github.com/prometheus/prometheus/util/annotations" 17 | 18 | "github.com/cloudflare/parquet-tsdb-poc/internal/util" 19 | ) 20 | 21 | // DB is a horizontal partitioning of multiple non-overlapping blocks that are 22 | // aligned to 24h and span exactly 24h. 23 | type DB struct { 24 | syncer syncer 25 | extLabels labels.Labels 26 | } 27 | 28 | type syncer interface { 29 | Blocks() []*Block 30 | } 31 | 32 | type dbConfig struct { 33 | extLabels labels.Labels 34 | } 35 | 36 | type DBOption func(*dbConfig) 37 | 38 | func ExternalLabels(extlabels labels.Labels) DBOption { 39 | return func(cfg *dbConfig) { 40 | cfg.extLabels = extlabels 41 | } 42 | } 43 | 44 | func NewDB(syncer syncer, opts ...DBOption) *DB { 45 | cfg := dbConfig{extLabels: labels.EmptyLabels()} 46 | for _, o := range opts { 47 | o(&cfg) 48 | } 49 | return &DB{syncer: syncer, extLabels: cfg.extLabels} 50 | } 51 | 52 | func (db *DB) Timerange() (int64, int64) { 53 | blocks := db.syncer.Blocks() 54 | 55 | mint := int64(math.MaxInt64) 56 | maxt := int64(math.MinInt64) 57 | 58 | for _, blk := range blocks { 59 | bmint, bmaxt := blk.Timerange() 60 | mint = min(mint, bmint) 61 | maxt = max(maxt, bmaxt) 62 | } 63 | return mint, maxt 64 | } 65 | 66 | func (db *DB) Extlabels() labels.Labels { 67 | return db.extLabels 68 | } 69 | 70 | // Queryable returns a storage.Queryable to evaluate queries with. 71 | func (db *DB) Queryable() storage.Queryable { 72 | return &DBQueryable{ 73 | blocks: db.syncer.Blocks(), 74 | extLabels: db.extLabels, 75 | } 76 | } 77 | 78 | // ReplicaQueryable returns a storage.Queryable that drops replica labels at runtime. Replica labels are 79 | // labels that identify a replica, i.e. one member of an HA pair of Prometheus servers. Thanos 80 | // might request at query time to drop those labels so that we can deduplicate results into one view. 81 | // Common replica labels are 'prometheus', 'host', etc. 82 | func (db *DB) ReplicaQueryable(replicaLabelNames []string) storage.Queryable { 83 | return &DBQueryable{ 84 | blocks: db.syncer.Blocks(), 85 | extLabels: db.extLabels, 86 | replicaLabelNames: replicaLabelNames, 87 | } 88 | } 89 | 90 | type DBQueryable struct { 91 | blocks []*Block 92 | 93 | // extLabels are added to all series in the result set overriding any internal labels. 94 | extLabels labels.Labels 95 | 96 | // replicaLabelNames are names of labels that identify replicas, they are dropped 97 | // after extLabels were applied. 
98 | replicaLabelNames []string 99 | } 100 | 101 | func (db *DBQueryable) Querier(mint, maxt int64) (storage.Querier, error) { 102 | qs := make([]storage.Querier, 0, len(db.blocks)) 103 | for _, blk := range db.blocks { 104 | bmint, bmaxt := blk.Timerange() 105 | if !util.Intersects(mint, maxt, bmint, bmaxt) { 106 | continue 107 | } 108 | start, end := util.Intersection(mint, maxt, bmint, bmaxt) 109 | q, err := blk.Queryable(db.extLabels, db.replicaLabelNames).Querier(start, end) 110 | if err != nil { 111 | return nil, fmt.Errorf("unable to get block querier: %s", err) 112 | } 113 | qs = append(qs, q) 114 | } 115 | return &DBQuerier{mint: mint, maxt: maxt, blocks: qs}, nil 116 | } 117 | 118 | type DBQuerier struct { 119 | mint, maxt int64 120 | 121 | blocks []storage.Querier 122 | } 123 | 124 | var _ storage.Querier = &DBQuerier{} 125 | 126 | func (q DBQuerier) Close() error { 127 | var err *multierror.Error 128 | for _, q := range q.blocks { 129 | err = multierror.Append(err, q.Close()) 130 | } 131 | return err.ErrorOrNil() 132 | } 133 | 134 | func (q DBQuerier) LabelValues(ctx context.Context, name string, hints *storage.LabelHints, ms ...*labels.Matcher) ([]string, annotations.Annotations, error) { 135 | var annos annotations.Annotations 136 | 137 | res := make([]string, 0) 138 | for _, blk := range q.blocks { 139 | lvals, lannos, err := blk.LabelValues(ctx, name, hints, ms...) 140 | if err != nil { 141 | return nil, nil, fmt.Errorf("unable to query label values for block: %w", err) 142 | } 143 | annos = annos.Merge(lannos) 144 | res = append(res, lvals...) 145 | } 146 | 147 | slices.Sort(res) 148 | return slices.Compact(res), annos, nil 149 | } 150 | 151 | func (DBQuerier) LabelNames(context.Context, *storage.LabelHints, ...*labels.Matcher) ([]string, annotations.Annotations, error) { 152 | // TODO 153 | return nil, nil, nil 154 | } 155 | 156 | func (q DBQuerier) Select(ctx context.Context, sorted bool, hints *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet { 157 | return newLazySeriesSet(ctx, q.selectFn, sorted, hints, matchers...) 158 | } 159 | 160 | func (q DBQuerier) selectFn(ctx context.Context, sorted bool, hints *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet { 161 | // If we need to merge multiple series sets vertically we need them sorted 162 | sorted = sorted || len(q.blocks) > 1 163 | 164 | sss := make([]storage.SeriesSet, 0, len(q.blocks)) 165 | for _, q := range q.blocks { 166 | sss = append(sss, q.Select(ctx, sorted, hints, matchers...)) 167 | } 168 | 169 | if len(sss) == 0 { 170 | return storage.EmptySeriesSet() 171 | } 172 | return storage.NewMergeSeriesSet(sss, storage.ChainedSeriesMerge) 173 | } 174 | -------------------------------------------------------------------------------- /db/discover.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package db 6 | 7 | import ( 8 | "context" 9 | "fmt" 10 | "io" 11 | "maps" 12 | "slices" 13 | "sync" 14 | 15 | "github.com/thanos-io/objstore" 16 | "google.golang.org/protobuf/proto" 17 | 18 | "github.com/cloudflare/parquet-tsdb-poc/proto/metapb" 19 | "github.com/cloudflare/parquet-tsdb-poc/schema" 20 | ) 21 | 22 | type Discoverer struct { 23 | bkt objstore.Bucket 24 | 25 | mu sync.Mutex 26 | metas map[string]Meta 27 | 28 | concurrency int 29 | } 30 | 31 | type discoveryConfig struct { 32 | concurrency int 33 | } 34 | 35 | type DiscoveryOption func(*discoveryConfig) 36 | 37 | func MetaConcurrency(c int) DiscoveryOption { 38 | return func(cfg *discoveryConfig) { 39 | cfg.concurrency = c 40 | } 41 | } 42 | 43 | func NewDiscoverer(bkt objstore.Bucket, opts ...DiscoveryOption) *Discoverer { 44 | cfg := discoveryConfig{ 45 | concurrency: 1, 46 | } 47 | for _, o := range opts { 48 | o(&cfg) 49 | } 50 | return &Discoverer{ 51 | bkt: bkt, 52 | metas: make(map[string]Meta), 53 | concurrency: cfg.concurrency, 54 | } 55 | } 56 | 57 | func (s *Discoverer) Metas() map[string]Meta { 58 | s.mu.Lock() 59 | defer s.mu.Unlock() 60 | 61 | res := make(map[string]Meta, len(s.metas)) 62 | maps.Copy(res, s.metas) 63 | 64 | return res 65 | } 66 | 67 | func (s *Discoverer) Discover(ctx context.Context) error { 68 | m := make(map[string][]string) 69 | err := s.bkt.Iter(ctx, "", func(n string) error { 70 | id, file, ok := schema.SplitBlockPath(n) 71 | if !ok { 72 | return nil 73 | } 74 | m[id] = append(m[id], file) 75 | return nil 76 | }, objstore.WithRecursiveIter()) 77 | if err != nil { 78 | return err 79 | } 80 | 81 | type metaOrError struct { 82 | m Meta 83 | err error 84 | } 85 | 86 | metaC := make(chan metaOrError) 87 | go func() { 88 | defer close(metaC) 89 | 90 | workerC := make(chan string, s.concurrency) 91 | go func() { 92 | defer close(workerC) 93 | 94 | for k, v := range m { 95 | if _, ok := s.metas[k]; ok { 96 | continue 97 | } 98 | if !slices.Contains(v, schema.MetaFile) { 99 | // skip incomplete block 100 | continue 101 | } 102 | workerC <- k 103 | } 104 | }() 105 | 106 | var wg sync.WaitGroup 107 | defer wg.Wait() 108 | 109 | for i := 0; i < s.concurrency; i++ { 110 | wg.Add(1) 111 | go func() { 112 | defer wg.Done() 113 | for k := range workerC { 114 | meta, err := readMetafile(ctx, s.bkt, k) 115 | if err != nil { 116 | metaC <- metaOrError{err: fmt.Errorf("unable to read meta file for %q: %w", k, err)} 117 | } else { 118 | metaC <- metaOrError{m: meta} 119 | } 120 | } 121 | }() 122 | } 123 | }() 124 | 125 | nm := make(map[string]Meta) 126 | for m := range metaC { 127 | if m.err != nil { 128 | return fmt.Errorf("unable to read meta: %w", m.err) 129 | } 130 | nm[m.m.Name] = m.m 131 | } 132 | 133 | s.mu.Lock() 134 | defer s.mu.Unlock() 135 | 136 | maps.Copy(s.metas, nm) 137 | 138 | return nil 139 | } 140 | 141 | func readMetafile(ctx context.Context, bkt objstore.Bucket, name string) (Meta, error) { 142 | mfile := schema.MetaFileNameForBlock(name) 143 | if _, err := bkt.Attributes(ctx, mfile); err != nil { 144 | return Meta{}, fmt.Errorf("unable to attr %s: %w", mfile, err) 145 | } 146 | rdr, err := bkt.Get(ctx, mfile) 147 | if err != nil { 148 | return Meta{}, fmt.Errorf("unable to get %s: %w", mfile, err) 149 | } 150 | defer rdr.Close() 151 | 152 | metaBytes, err := io.ReadAll(rdr) 153 | if err != nil { 154 | return Meta{}, fmt.Errorf("unable to read %s: %w", mfile,
err) 155 | } 156 | 157 | md := &metapb.Metadata{} 158 | if err := proto.Unmarshal(metaBytes, md); err != nil { 159 | return Meta{}, fmt.Errorf("unable to read %s: %w", mfile, err) 160 | } 161 | 162 | m := make(map[string][]string, len(md.GetColumnsForName())) 163 | for k, v := range md.GetColumnsForName() { 164 | m[k] = v.GetColumns() 165 | } 166 | return Meta{ 167 | Name: name, 168 | Mint: md.GetMint(), 169 | Maxt: md.GetMaxt(), 170 | Shards: md.GetShards(), 171 | ColumnsForName: m, 172 | }, nil 173 | } 174 | -------------------------------------------------------------------------------- /db/iterator.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package db 6 | 7 | import ( 8 | "errors" 9 | 10 | "github.com/prometheus/prometheus/model/histogram" 11 | "github.com/prometheus/prometheus/model/labels" 12 | "github.com/prometheus/prometheus/tsdb/chunkenc" 13 | ) 14 | 15 | // Taken from https://github.com/thanos-io/thanos/blob/main/pkg/query/iter.go 16 | type chunkSeries struct { 17 | lset labels.Labels 18 | chunks []chunkenc.Chunk 19 | mint, maxt int64 20 | } 21 | 22 | func (s *chunkSeries) Labels() labels.Labels { 23 | return s.lset 24 | } 25 | 26 | func (s *chunkSeries) Iterator(_ chunkenc.Iterator) chunkenc.Iterator { 27 | its := make([]chunkenc.Iterator, 0, len(s.chunks)) 28 | for _, chk := range s.chunks { 29 | its = append(its, chk.Iterator(nil)) 30 | } 31 | // We might have collected series where we trimmed all chunks because they had 32 | // no samples in the interval. 33 | if len(its) == 0 { 34 | return chunkenc.NewNopIterator() 35 | } 36 | return newBoundedSeriesIterator(newChunkSeriesIterator(its), s.mint, s.maxt) 37 | } 38 | 39 | type errSeriesIterator struct { 40 | err error 41 | } 42 | 43 | func (errSeriesIterator) Seek(int64) chunkenc.ValueType { return chunkenc.ValNone } 44 | func (errSeriesIterator) Next() chunkenc.ValueType { return chunkenc.ValNone } 45 | func (errSeriesIterator) At() (int64, float64) { return 0, 0 } 46 | func (errSeriesIterator) AtHistogram(*histogram.Histogram) (int64, *histogram.Histogram) { 47 | return 0, nil 48 | } 49 | func (errSeriesIterator) AtFloatHistogram(*histogram.FloatHistogram) (int64, *histogram.FloatHistogram) { 50 | return 0, nil 51 | } 52 | func (errSeriesIterator) AtT() int64 { return 0 } 53 | func (it errSeriesIterator) Err() error { return it.err } 54 | 55 | type chunkSeriesIterator struct { 56 | chunks []chunkenc.Iterator 57 | i int 58 | lastVal chunkenc.ValueType 59 | 60 | cur chunkenc.Iterator 61 | } 62 | 63 | func newChunkSeriesIterator(cs []chunkenc.Iterator) chunkenc.Iterator { 64 | if len(cs) == 0 { 65 | return errSeriesIterator{err: errors.New("got empty chunks")} 66 | } 67 | return &chunkSeriesIterator{chunks: cs, cur: cs[0]} 68 | } 69 | 70 | func (it *chunkSeriesIterator) Seek(t int64) chunkenc.ValueType { 71 | // We generally expect the chunks already to be cut down 72 | // to the range we are interested in. There's not much to be gained from 73 | // hopping across chunks so we just call Next until we reach t.
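// Example (illustrative): with samples at t = 10, 20, 30, calling Seek(25)
// steps 10 -> 20 -> 30 via Next in the loop below and returns the value type
// at t = 30.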
74 | for { 75 | ct := it.AtT() 76 | if ct >= t { 77 | return it.lastVal 78 | } 79 | it.lastVal = it.Next() 80 | if it.lastVal == chunkenc.ValNone { 81 | return chunkenc.ValNone 82 | } 83 | } 84 | } 85 | 86 | func (it *chunkSeriesIterator) At() (t int64, v float64) { 87 | return it.cur.At() 88 | } 89 | 90 | func (it *chunkSeriesIterator) AtHistogram(h *histogram.Histogram) (int64, *histogram.Histogram) { 91 | return it.cur.AtHistogram(h) 92 | } 93 | 94 | func (it *chunkSeriesIterator) AtFloatHistogram(fh *histogram.FloatHistogram) (int64, *histogram.FloatHistogram) { 95 | return it.cur.AtFloatHistogram(fh) 96 | } 97 | 98 | func (it *chunkSeriesIterator) AtT() int64 { 99 | return it.cur.AtT() 100 | } 101 | 102 | func (it *chunkSeriesIterator) Next() chunkenc.ValueType { 103 | lastT := it.AtT() 104 | 105 | if valueType := it.chunks[it.i].Next(); valueType != chunkenc.ValNone { 106 | it.lastVal = valueType 107 | return valueType 108 | } 109 | if it.Err() != nil { 110 | return chunkenc.ValNone 111 | } 112 | if it.i >= len(it.chunks)-1 { 113 | return chunkenc.ValNone 114 | } 115 | // Chunks are guaranteed to be ordered, but not generally guaranteed to be non-overlapping. 116 | // We must make sure to skip any overlapping range between adjacent chunks. 117 | it.i++ 118 | it.cur = it.chunks[it.i] 119 | return it.Seek(lastT + 1) 120 | } 121 | 122 | func (it *chunkSeriesIterator) Err() error { 123 | return it.chunks[it.i].Err() 124 | } 125 | 126 | // Taken from https://github.com/thanos-io/thanos/blob/main/pkg/dedup/iter.go 127 | type boundedSeriesIterator struct { 128 | it chunkenc.Iterator 129 | mint, maxt int64 130 | } 131 | 132 | func newBoundedSeriesIterator(it chunkenc.Iterator, mint, maxt int64) *boundedSeriesIterator { 133 | return &boundedSeriesIterator{it: it, mint: mint, maxt: maxt} 134 | } 135 | 136 | func (it *boundedSeriesIterator) Seek(t int64) chunkenc.ValueType { 137 | if t > it.maxt { 138 | return chunkenc.ValNone 139 | } 140 | if t < it.mint { 141 | t = it.mint 142 | } 143 | return it.it.Seek(t) 144 | } 145 | 146 | func (it *boundedSeriesIterator) At() (t int64, v float64) { 147 | return it.it.At() 148 | } 149 | 150 | func (it *boundedSeriesIterator) AtHistogram(h *histogram.Histogram) (int64, *histogram.Histogram) { 151 | return it.it.AtHistogram(h) 152 | } 153 | 154 | func (it *boundedSeriesIterator) AtFloatHistogram(fh *histogram.FloatHistogram) (int64, *histogram.FloatHistogram) { 155 | return it.it.AtFloatHistogram(fh) 156 | } 157 | 158 | func (it *boundedSeriesIterator) AtT() int64 { 159 | return it.it.AtT() 160 | } 161 | 162 | func (it *boundedSeriesIterator) Next() chunkenc.ValueType { 163 | valueType := it.it.Next() 164 | if valueType == chunkenc.ValNone { 165 | return chunkenc.ValNone 166 | } 167 | t := it.it.AtT() 168 | 169 | // Advance the iterator if we are before the valid interval. 170 | if t < it.mint { 171 | if it.Seek(it.mint) == chunkenc.ValNone { 172 | return chunkenc.ValNone 173 | } 174 | t = it.it.AtT() 175 | } 176 | // Once we have passed the valid interval, there is no going back. 177 | if t <= it.maxt { 178 | return valueType 179 | } 180 | 181 | return chunkenc.ValNone 182 | } 183 | 184 | func (it *boundedSeriesIterator) Err() error { 185 | return it.it.Err() 186 | } 187 | -------------------------------------------------------------------------------- /db/metrics.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc.
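The bounded iterator in db/iterator.go above clips an underlying iterator to [mint, maxt]. A sketch of its behavior, written as an in-package test (hypothetical; no such test exists in the repo):

```go
package db

import (
	"testing"

	"github.com/prometheus/prometheus/tsdb/chunkenc"
)

func TestBoundedSeriesIterator(t *testing.T) {
	chk := chunkenc.NewXORChunk()
	app, err := chk.Appender()
	if err != nil {
		t.Fatal(err)
	}
	// Samples at t = 0, 10, ..., 100.
	for ts := int64(0); ts <= 100; ts += 10 {
		app.Append(ts, float64(ts))
	}

	it := newBoundedSeriesIterator(chk.Iterator(nil), 25, 75)
	got := []int64{}
	for vt := it.Next(); vt != chunkenc.ValNone; vt = it.Next() {
		ts, _ := it.At()
		got = append(got, ts)
	}
	// Only samples inside [25, 75] survive: 30, 40, 50, 60, 70.
	if len(got) != 5 || got[0] != 30 || got[4] != 70 {
		t.Fatalf("unexpected timestamps: %v", got)
	}
}
```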
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package db 6 | 7 | import ( 8 | "github.com/hashicorp/go-multierror" 9 | "github.com/prometheus/client_golang/prometheus" 10 | ) 11 | 12 | var ( 13 | bucketRequests = prometheus.NewCounter(prometheus.CounterOpts{ 14 | Name: "bucket_requests_total", 15 | Help: "Total amount of requests to object storage", 16 | }) 17 | ) 18 | 19 | func RegisterMetrics(reg prometheus.Registerer) error { 20 | return multierror.Append(nil, 21 | reg.Register(bucketRequests), 22 | ).ErrorOrNil() 23 | } 24 | -------------------------------------------------------------------------------- /db/seriesset.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package db 6 | 7 | import ( 8 | "context" 9 | "sync" 10 | 11 | "github.com/hashicorp/go-multierror" 12 | "github.com/prometheus/prometheus/model/labels" 13 | "github.com/prometheus/prometheus/storage" 14 | "github.com/prometheus/prometheus/util/annotations" 15 | ) 16 | 17 | type hints struct { 18 | By bool 19 | Func string 20 | Grouping []string 21 | } 22 | 23 | func fromStorageHints(h *storage.SelectHints) hints { 24 | clone := make([]string, len(h.Grouping)) 25 | copy(clone, h.Grouping) 26 | return hints{ 27 | Func: h.Func, 28 | By: h.By, 29 | Grouping: clone, 30 | } 31 | } 32 | 33 | func toStorageHints(h hints) *storage.SelectHints { 34 | return &storage.SelectHints{Func: h.Func, By: h.By, Grouping: h.Grouping} 35 | } 36 | 37 | type selectFn func(context.Context, bool, *storage.SelectHints, ...*labels.Matcher) storage.SeriesSet 38 | 39 | func newLazySeriesSet(ctx context.Context, selectFn selectFn, sorted bool, hints *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet { 40 | res := &lazySeriesSet{ 41 | selectfn: selectFn, 42 | ctx: ctx, 43 | sorted: sorted, 44 | // SelectHints is reused in the subsequent parallel Select call 45 | hints: fromStorageHints(hints), 46 | matchers: matchers, 47 | done: make(chan struct{}), 48 | } 49 | go res.init() 50 | 51 | return res 52 | } 53 | 54 | type lazySeriesSet struct { 55 | selectfn selectFn 56 | ctx context.Context 57 | sorted bool 58 | hints hints 59 | matchers []*labels.Matcher 60 | 61 | set storage.SeriesSet 62 | 63 | once sync.Once 64 | done chan struct{} 65 | } 66 | 67 | func (c *lazySeriesSet) init() { 68 | c.once.Do(func() { 69 | c.set = c.selectfn(c.ctx, c.sorted, toStorageHints(c.hints), c.matchers...) 
70 | close(c.done) 71 | }) 72 | } 73 | 74 | func (c *lazySeriesSet) Next() bool { 75 | <-c.done 76 | return c.set.Next() 77 | } 78 | 79 | func (c *lazySeriesSet) Err() error { 80 | <-c.done 81 | return c.set.Err() 82 | } 83 | 84 | func (c *lazySeriesSet) At() storage.Series { 85 | <-c.done 86 | return c.set.At() 87 | } 88 | 89 | func (c *lazySeriesSet) Warnings() annotations.Annotations { 90 | <-c.done 91 | return c.set.Warnings() 92 | } 93 | 94 | type verticalSeriesSet struct { 95 | i int 96 | sets []storage.SeriesSet 97 | } 98 | 99 | func newVerticalSeriesSet(sets ...storage.SeriesSet) storage.SeriesSet { 100 | if len(sets) == 0 { 101 | return storage.EmptySeriesSet() 102 | } 103 | return &verticalSeriesSet{sets: sets, i: 0} 104 | } 105 | 106 | func (ss *verticalSeriesSet) Next() bool { 107 | if ss.sets[ss.i].Next() { 108 | return true 109 | } 110 | for ss.i < len(ss.sets)-1 { 111 | ss.i++ 112 | if ss.sets[ss.i].Next() { 113 | return true 114 | } 115 | } 116 | return false 117 | } 118 | 119 | func (ss *verticalSeriesSet) At() storage.Series { return ss.sets[ss.i].At() } 120 | func (ss *verticalSeriesSet) Err() error { 121 | var err *multierror.Error 122 | for i := range ss.sets { 123 | err = multierror.Append(err, ss.sets[i].Err()) 124 | } 125 | return err.ErrorOrNil() 126 | } 127 | 128 | func (ss *verticalSeriesSet) Warnings() annotations.Annotations { 129 | res := annotations.New() 130 | for i := range ss.sets { 131 | res.Merge(ss.sets[i].Warnings()) 132 | } 133 | return *res 134 | } 135 | 136 | type concatSeriesSet struct { 137 | i int 138 | series []storage.Series 139 | } 140 | 141 | func newConcatSeriesSet(series ...storage.Series) storage.SeriesSet { 142 | if len(series) == 0 { 143 | return storage.EmptySeriesSet() 144 | } 145 | return &concatSeriesSet{series: series, i: -1} 146 | } 147 | 148 | func (ss *concatSeriesSet) Next() bool { 149 | if ss.i < len(ss.series)-1 { 150 | ss.i++ 151 | return true 152 | } 153 | return false 154 | } 155 | 156 | func (ss *concatSeriesSet) At() storage.Series { return ss.series[ss.i] } 157 | func (ss *concatSeriesSet) Err() error { return nil } 158 | func (ss *concatSeriesSet) Warnings() annotations.Annotations { return nil } 159 | -------------------------------------------------------------------------------- /db/shard.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 
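The lazySeriesSet in db/seriesset.go above is essentially a future: init starts the select in a goroutine, and every accessor blocks on the done channel until the result is ready. A generic distillation of that pattern (illustrative only, not part of the package):

```go
package db

// future runs fn immediately in the background and blocks only when the
// result is first needed; lazySeriesSet is this pattern specialized to
// storage.SeriesSet.
type future[T any] struct {
	val  T
	done chan struct{}
}

func newFuture[T any](fn func() T) *future[T] {
	f := &future[T]{done: make(chan struct{})}
	go func() {
		defer close(f.done)
		f.val = fn()
	}()
	return f
}

func (f *future[T]) get() T {
	<-f.done
	return f.val
}
```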
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package db 6 | 7 | import ( 8 | "context" 9 | "encoding/binary" 10 | "errors" 11 | "fmt" 12 | "slices" 13 | 14 | "github.com/parquet-go/parquet-go" 15 | "github.com/prometheus/common/model" 16 | "github.com/prometheus/prometheus/model/labels" 17 | "github.com/prometheus/prometheus/storage" 18 | "github.com/prometheus/prometheus/tsdb/chunkenc" 19 | "github.com/prometheus/prometheus/util/annotations" 20 | 21 | "github.com/cloudflare/parquet-tsdb-poc/internal/encoding" 22 | "github.com/cloudflare/parquet-tsdb-poc/internal/util" 23 | "github.com/cloudflare/parquet-tsdb-poc/schema" 24 | "github.com/cloudflare/parquet-tsdb-poc/search" 25 | ) 26 | 27 | type Shard struct { 28 | meta Meta 29 | chunkspfile *parquet.File 30 | labelspfile *parquet.File 31 | } 32 | 33 | func (shd *Shard) Queryable(extlabels labels.Labels, replicaLabelNames []string) storage.Queryable { 34 | return &ShardQueryable{extlabels: extlabels, replicaLabelNames: replicaLabelNames, shard: shd} 35 | } 36 | 37 | type ShardQueryable struct { 38 | extlabels labels.Labels 39 | replicaLabelNames []string 40 | 41 | shard *Shard 42 | } 43 | 44 | func (q *ShardQueryable) Querier(mint, maxt int64) (storage.Querier, error) { 45 | return &ShardQuerier{ 46 | mint: mint, 47 | maxt: maxt, 48 | shard: q.shard, 49 | extlabels: q.extlabels, 50 | replicaLabelNames: q.replicaLabelNames, 51 | }, nil 52 | } 53 | 54 | type ShardQuerier struct { 55 | mint, maxt int64 56 | extlabels labels.Labels 57 | replicaLabelNames []string 58 | 59 | shard *Shard 60 | } 61 | 62 | var _ storage.Querier = &ShardQuerier{} 63 | 64 | func (ShardQuerier) Close() error { return nil } 65 | 66 | func (q ShardQuerier) LabelValues(_ context.Context, name string, _ *storage.LabelHints, ms ...*labels.Matcher) ([]string, annotations.Annotations, error) { 67 | if len(ms) != 0 { 68 | return nil, nil, errors.New("label values with label matchers are not supported") 69 | } 70 | 71 | if name != model.MetricNameLabel { 72 | return nil, nil, errors.New("label values for label names other than __name__ are not supported") 73 | } 74 | 75 | res := make([]string, 0, len(q.shard.meta.ColumnsForName)) 76 | for name := range q.shard.meta.ColumnsForName { 77 | res = append(res, name) 78 | } 79 | slices.Sort(res) 80 | return res, nil, nil 81 | } 82 | 83 | func (ShardQuerier) LabelNames(context.Context, *storage.LabelHints, ...*labels.Matcher) ([]string, annotations.Annotations, error) { 84 | // TODO 85 | return nil, nil, nil 86 | } 87 | 88 | func (q ShardQuerier) Select(ctx context.Context, sorted bool, hints *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet { 89 | return newLazySeriesSet(ctx, q.selectFn, sorted, hints, matchers...)
90 | } 91 | 92 | func (q ShardQuerier) selectFn(ctx context.Context, sorted bool, hints *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet { 93 | metricName, err := metricNameFromMatchers(matchers) 94 | if err != nil { 95 | return storage.ErrSeriesSet(fmt.Errorf("unable to get metric name from matchers: %w", err)) 96 | } 97 | columnsForName := slices.Clone(q.shard.meta.ColumnsForName[metricName]) 98 | 99 | constraint, err := constraintForMatchers(matchers, columnsForName) 100 | if err != nil { 101 | return storage.ErrSeriesSet(fmt.Errorf("unable to compute constraint for matchers: %w", err)) 102 | } 103 | 104 | // We cannot do projections in the presence of replica/external labels, as it is not clear how to resolve conflicts. 105 | labelProjections := labelProjections(columnsForName) 106 | chunkProjections := chunkProjections(q.shard.meta.Mint, q.mint, q.maxt) 107 | 108 | // no need to fetch chunks for /api/v1/series 109 | if hints.Func == "series" { 110 | chunkProjections = nil 111 | } 112 | 113 | labelpfile := q.shard.labelspfile 114 | labelReadSchema := schema.Projection(labelpfile.Schema(), labelProjections) 115 | chunkspfile := q.shard.chunkspfile 116 | chunksReadSchema := schema.Projection(chunkspfile.Schema(), chunkProjections) 117 | 118 | rr, err := search.Match(ctx, constraint, labelpfile, labelReadSchema, chunkspfile, chunksReadSchema) 119 | if err != nil { 120 | return storage.ErrSeriesSet(err) 121 | } 122 | defer rr.Close() 123 | 124 | ssb := &seriesSetBuilder{ 125 | schema: schema.Joined(labelReadSchema, chunksReadSchema), 126 | mint: q.mint, 127 | maxt: q.maxt, 128 | m: make(map[uint64]struct{}), 129 | s: make([]*chunkSeries, 0), 130 | b: labels.NewBuilder(labels.EmptyLabels()), 131 | extlabels: q.extlabels, 132 | replicaLabelNames: q.replicaLabelNames, 133 | } 134 | 135 | if _, err := parquet.CopyRows(ssb, rr); err != nil { 136 | return storage.ErrSeriesSet(err) 137 | } 138 | series := ssb.Series() 139 | if sorted { 140 | slices.SortFunc(series, func(l, r storage.Series) int { return labels.Compare(l.Labels(), r.Labels()) }) 141 | } 142 | return newConcatSeriesSet(series...)
143 | } 144 | 145 | func metricNameFromMatchers(matchers []*labels.Matcher) (string, error) { 146 | for i := range matchers { 147 | if matchers[i].Name == labels.MetricName { 148 | return matchers[i].Value, nil 149 | } 150 | } 151 | return "", errors.New("metric name is required") 152 | } 153 | 154 | func constraintForMatchers(matchers []*labels.Matcher, columnsForName []string) (search.Constraint, error) { 155 | constraints := make([]search.Constraint, 0) 156 | for i := range matchers { 157 | m := matchers[i] 158 | col := schema.LabelNameToColumn(m.Name) 159 | val := parquet.ValueOf(m.Value) 160 | if m.Name == labels.MetricName { 161 | if m.Type != labels.MatchEqual { 162 | return nil, errors.New("only equal matches on metric name are allowed") 163 | } 164 | constraints = append(constraints, search.EqualWithPageCheck(col, val)) 165 | continue 166 | } 167 | validColumn := slices.Contains(columnsForName, col) 168 | var c search.Constraint 169 | switch m.Type { 170 | case labels.MatchEqual: 171 | if !validColumn { 172 | // equal match on a column that the series does not have; return nothing 173 | return search.Null(), nil 174 | } 175 | c = search.Equal(col, val) 176 | case labels.MatchNotEqual: 177 | if !validColumn { 178 | continue 179 | } 180 | c = search.Not(search.Equal(col, val)) 181 | case labels.MatchRegexp: 182 | if !validColumn { 183 | // regex match on a column that the series does not have; return nothing 184 | return search.Null(), nil 185 | } 186 | var err error 187 | c, err = search.Regex(col, m.GetRegexString()) 188 | if err != nil { 189 | return nil, fmt.Errorf("unable to build regex constraint: %s", err) 190 | } 191 | case labels.MatchNotRegexp: 192 | if !validColumn { 193 | continue 194 | } 195 | var err error 196 | c, err = search.Regex(col, m.GetRegexString()) 197 | if err != nil { 198 | return nil, fmt.Errorf("unable to build regex constraint: %s", err) 199 | } 200 | c = search.Not(c) 201 | } 202 | constraints = append(constraints, c) 203 | } 204 | return search.And(constraints...), nil 205 | } 206 | 207 | func labelProjections(columnsForName []string) []string { 208 | return append(columnsForName, schema.LabelNameToColumn(model.MetricNameLabel)) 209 | } 210 | 211 | func chunkProjections(blkStart, mint, maxt int64) []string { 212 | res := make([]string, 0) 213 | 214 | // TODO: This might be buggy since a block might start at the end of the 0-8h window and 215 | // leak into the 8-16h window. Right now we would not catch this. This assumes that chunks 216 | // are aligned to 8h.
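// Illustration (hypothetical numbers): with ChunkColumnLength = 8h and a block
// starting at 00:00 UTC, the column windows computed below are [00:00, 08:00),
// [08:00, 16:00) and [16:00, 24:00). A query for [02:00, 09:00) intersects the
// first two windows, so only ChunksColumn0 and ChunksColumn1 are fetched.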
217 | cmin, cmax := blkStart, blkStart+schema.ChunkColumnLength.Milliseconds() 218 | for i := 0; i != int(schema.ChunkColumnsPerDay); i++ { 219 | if !util.Intersects(cmin, cmax, mint, maxt) { 220 | goto C 221 | } 222 | switch i { 223 | case 0: 224 | res = append(res, schema.ChunksColumn0) 225 | case 1: 226 | res = append(res, schema.ChunksColumn1) 227 | case 2: 228 | res = append(res, schema.ChunksColumn2) 229 | } 230 | C: 231 | cmin, cmax = cmax, cmax+schema.ChunkColumnLength.Milliseconds() 232 | } 233 | return res 234 | } 235 | 236 | type seriesSetBuilder struct { 237 | schema *parquet.Schema 238 | mint, maxt int64 239 | extlabels labels.Labels 240 | replicaLabelNames []string 241 | 242 | s []*chunkSeries 243 | b *labels.Builder 244 | m map[uint64]struct{} 245 | } 246 | 247 | func (ssb *seriesSetBuilder) WriteRows(rs []parquet.Row) (int, error) { 248 | var ( 249 | chksBytes [schema.ChunkColumnsPerDay][]byte 250 | ) 251 | cols := ssb.schema.Columns() 252 | for i := range rs { 253 | ssb.b.Reset(labels.EmptyLabels()) 254 | rc := rs[i].Clone() 255 | for j := range rc { 256 | key := cols[j][0] 257 | val := rc[j] 258 | switch key { 259 | case schema.ChunksColumn0: 260 | chksBytes[0] = val.ByteArray() 261 | case schema.ChunksColumn1: 262 | chksBytes[1] = val.ByteArray() 263 | case schema.ChunksColumn2: 264 | chksBytes[2] = val.ByteArray() 265 | default: 266 | lblName := schema.ColumnToLabelName(key) 267 | if !val.IsNull() { 268 | ssb.b.Set(lblName, val.String()) 269 | } 270 | } 271 | } 272 | chks := make([]chunkenc.Chunk, 0, 12) 273 | for _, bs := range chksBytes { 274 | for len(bs) != 0 { 275 | enc := chunkenc.Encoding(binary.BigEndian.Uint32(bs[:4])) 276 | bs = bs[4:] 277 | mint := encoding.ZigZagDecode(binary.BigEndian.Uint64(bs[:8])) 278 | bs = bs[8:] 279 | maxt := encoding.ZigZagDecode(binary.BigEndian.Uint64(bs[:8])) 280 | bs = bs[8:] 281 | l := binary.BigEndian.Uint32(bs[:4]) 282 | bs = bs[4:] 283 | if util.Intersects(mint, maxt, ssb.mint, ssb.maxt) { 284 | chk, err := chunkenc.FromData(enc, bs[:l]) 285 | if err != nil { 286 | return i, fmt.Errorf("unable to create chunk from data: %s", err) 287 | } 288 | chks = append(chks, chk) 289 | } 290 | bs = bs[l:] 291 | } 292 | } 293 | 294 | ssb.extlabels.Range(func(lbl labels.Label) { ssb.b.Set(lbl.Name, lbl.Value) }) 295 | for _, lbl := range ssb.replicaLabelNames { 296 | ssb.b.Del(lbl) 297 | } 298 | 299 | lbls := ssb.b.Labels() 300 | 301 | h := lbls.Hash() 302 | if _, ok := ssb.m[h]; ok { 303 | // We have seen this series before, skip it for now; we could be smarter and select 304 | // chunks appropriately so that we fill in what might be missing but for now skipping is fine 305 | continue 306 | } 307 | ssb.m[h] = struct{}{} 308 | 309 | ssb.s = append(ssb.s, &chunkSeries{ 310 | lset: lbls, 311 | mint: ssb.mint, 312 | maxt: ssb.maxt, 313 | chunks: chks, 314 | }) 315 | } 316 | return len(rs), nil 317 | } 318 | 319 | func (ssb *seriesSetBuilder) Series() []storage.Series { 320 | res := make([]storage.Series, 0, len(ssb.s)) 321 | for _, v := range ssb.s { 322 | res = append(res, v) 323 | } 324 | return res 325 | } 326 | -------------------------------------------------------------------------------- /db/syncer.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 
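The decode loop in seriesSetBuilder.WriteRows above implies a framing for the chunk column bytes: a 4-byte big-endian encoding, zig-zag encoded mint and maxt (8 bytes each), a 4-byte length, then the chunk data. A sketch of the matching encoder, written only to document that layout (the actual writer lives in the conversion code and is not shown here, so treat this as an inference, not the canonical implementation):

```go
package db

import (
	"encoding/binary"

	"github.com/prometheus/prometheus/tsdb/chunkenc"

	"github.com/cloudflare/parquet-tsdb-poc/internal/encoding"
)

// encodeChunkFrame frames one chunk the way seriesSetBuilder.WriteRows
// expects to decode it.
func encodeChunkFrame(enc chunkenc.Encoding, mint, maxt int64, data []byte) []byte {
	buf := make([]byte, 0, 24+len(data))
	buf = binary.BigEndian.AppendUint32(buf, uint32(enc))
	buf = binary.BigEndian.AppendUint64(buf, encoding.ZigZagEncode(mint))
	buf = binary.BigEndian.AppendUint64(buf, encoding.ZigZagEncode(maxt))
	buf = binary.BigEndian.AppendUint32(buf, uint32(len(data)))
	return append(buf, data...)
}
```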
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package db 6 | 7 | import ( 8 | "bytes" 9 | "context" 10 | "fmt" 11 | "io" 12 | "maps" 13 | "slices" 14 | "sort" 15 | "sync" 16 | "time" 17 | 18 | "github.com/alecthomas/units" 19 | "github.com/parquet-go/parquet-go" 20 | "github.com/thanos-io/objstore" 21 | "github.com/thanos-io/thanos/pkg/info/infopb" 22 | "google.golang.org/grpc" 23 | "google.golang.org/grpc/credentials/insecure" 24 | 25 | "github.com/cloudflare/parquet-tsdb-poc/internal/util" 26 | "github.com/cloudflare/parquet-tsdb-poc/schema" 27 | ) 28 | 29 | type Syncer struct { 30 | bkt objstore.Bucket 31 | 32 | blockOpts []BlockOption 33 | metaFilter MetaFilter 34 | concurrency int 35 | 36 | mu sync.Mutex 37 | blocks map[string]*Block 38 | 39 | cached []*Block 40 | } 41 | 42 | type SyncerOption func(*syncerConfig) 43 | 44 | type syncerConfig struct { 45 | blockOpts []BlockOption 46 | metaFilter MetaFilter 47 | concurrency int 48 | } 49 | 50 | func BlockOptions(opts ...BlockOption) SyncerOption { 51 | return func(cfg *syncerConfig) { 52 | cfg.blockOpts = opts 53 | } 54 | } 55 | 56 | func FilterMetas(f MetaFilter) SyncerOption { 57 | return func(cfg *syncerConfig) { 58 | cfg.metaFilter = f 59 | } 60 | } 61 | 62 | func BlockConcurrency(c int) SyncerOption { 63 | return func(cfg *syncerConfig) { 64 | cfg.concurrency = c 65 | } 66 | } 67 | 68 | type BlockOption func(*blockConfig) 69 | 70 | type blockConfig struct { 71 | readBufferSize units.Base2Bytes 72 | } 73 | 74 | func ReadBufferSize(sz units.Base2Bytes) BlockOption { 75 | return func(cfg *blockConfig) { 76 | cfg.readBufferSize = sz 77 | } 78 | } 79 | 80 | func NewSyncer(bkt objstore.Bucket, opts ...SyncerOption) *Syncer { 81 | cfg := syncerConfig{ 82 | metaFilter: AllMetasMetaFilter, 83 | concurrency: 1, 84 | } 85 | 86 | for _, o := range opts { 87 | o(&cfg) 88 | } 89 | 90 | return &Syncer{ 91 | bkt: bkt, 92 | blocks: make(map[string]*Block), 93 | blockOpts: cfg.blockOpts, 94 | metaFilter: cfg.metaFilter, 95 | concurrency: cfg.concurrency, 96 | } 97 | } 98 | 99 | func (s *Syncer) Blocks() []*Block { 100 | s.mu.Lock() 101 | defer s.mu.Unlock() 102 | 103 | return s.filterBlocks(s.cached) 104 | } 105 | 106 | func (s *Syncer) Sync(ctx context.Context, metas map[string]Meta) error { 107 | type blockOrError struct { 108 | blk *Block 109 | err error 110 | } 111 | 112 | blkC := make(chan blockOrError) 113 | go func() { 114 | defer close(blkC) 115 | 116 | workerC := make(chan Meta, s.concurrency) 117 | go func() { 118 | defer close(workerC) 119 | 120 | for k, v := range s.filterMetas(metas) { 121 | if _, ok := s.blocks[k]; ok { 122 | continue 123 | } 124 | workerC <- v 125 | } 126 | }() 127 | 128 | var wg sync.WaitGroup 129 | defer wg.Wait() 130 | 131 | for i := 0; i < s.concurrency; i++ { 132 | wg.Add(1) 133 | go func() { 134 | defer wg.Done() 135 | for m := range workerC { 136 | blk, err := newBlockForMeta(ctx, s.bkt, m, s.blockOpts...) 
137 | if err != nil { 138 | blkC <- blockOrError{err: fmt.Errorf("unable to read block %q: %w", m.Name, err)} 139 | } else { 140 | blkC <- blockOrError{blk: blk} 141 | } 142 | } 143 | }() 144 | } 145 | }() 146 | 147 | blocks := make(map[string]*Block) 148 | for b := range blkC { 149 | if b.err != nil { 150 | return fmt.Errorf("unable to read block: %w", b.err) 151 | } 152 | blocks[b.blk.meta.Name] = b.blk 153 | } 154 | 155 | s.mu.Lock() 156 | defer s.mu.Unlock() 157 | 158 | // delete blocks that are no longer in the meta map 159 | maps.DeleteFunc(s.blocks, func(k string, _ *Block) bool { _, ok := metas[k]; return !ok }) 160 | 161 | // add new blocks that we just loaded 162 | maps.Copy(s.blocks, blocks) 163 | 164 | s.cached = slices.Collect(maps.Values(s.blocks)) 165 | sort.Slice(s.cached, func(i, j int) bool { 166 | ls, _ := s.cached[i].Timerange() 167 | rs, _ := s.cached[j].Timerange() 168 | return ls < rs 169 | }) 170 | return nil 171 | } 172 | 173 | func (s *Syncer) filterMetas(metas map[string]Meta) map[string]Meta { 174 | return s.metaFilter.filterMetas(metas) 175 | } 176 | 177 | func (s *Syncer) filterBlocks(blks []*Block) []*Block { 178 | return s.metaFilter.filterBlocks(blks) 179 | } 180 | 181 | type MetaFilter interface { 182 | filterMetas(map[string]Meta) map[string]Meta 183 | filterBlocks([]*Block) []*Block 184 | } 185 | 186 | var AllMetasMetaFilter = allMetas{} 187 | 188 | type allMetas struct { 189 | } 190 | 191 | func (mf allMetas) filterMetas(metas map[string]Meta) map[string]Meta { return metas } 192 | func (mf allMetas) filterBlocks(blocks []*Block) []*Block { return blocks } 193 | 194 | type ThanosBackfillMetaFilter struct { 195 | endpoint string 196 | 197 | mu sync.Mutex 198 | mint, maxt int64 199 | } 200 | 201 | func NewThanosBackfillMetaFilter(endpoint string) *ThanosBackfillMetaFilter { 202 | return &ThanosBackfillMetaFilter{endpoint: endpoint} 203 | } 204 | 205 | func (tp *ThanosBackfillMetaFilter) filterMetas(metas map[string]Meta) map[string]Meta { 206 | tp.mu.Lock() 207 | defer tp.mu.Unlock() 208 | 209 | res := make(map[string]Meta, len(metas)) 210 | for k, v := range metas { 211 | if util.Contains(tp.mint, tp.maxt, v.Mint, v.Maxt) { 212 | continue 213 | } 214 | res[k] = v 215 | } 216 | return res 217 | } 218 | 219 | func (tp *ThanosBackfillMetaFilter) filterBlocks(blks []*Block) []*Block { 220 | tp.mu.Lock() 221 | defer tp.mu.Unlock() 222 | 223 | res := make([]*Block, 0, len(blks)) 224 | for _, blk := range blks { 225 | blkMint, blkMaxt := blk.Timerange() 226 | if util.Contains(tp.mint, tp.maxt, blkMint, blkMaxt) { 227 | continue 228 | } 229 | res = append(res, blk) 230 | } 231 | return res 232 | } 233 | 234 | func (tp *ThanosBackfillMetaFilter) Update(ctx context.Context) error { 235 | // Note: we assume that Thanos runs close to this server, so we don't need TLS here. 236 | cc, err := grpc.NewClient(tp.endpoint, grpc.WithTransportCredentials(insecure.NewCredentials())) 237 | if err != nil { 238 | return fmt.Errorf("unable to connect: %w", err) 239 | } 240 | client := infopb.NewInfoClient(cc) 241 | 242 | info, err := client.Info(ctx, &infopb.InfoRequest{}) 243 | if err != nil { 244 | return fmt.Errorf("unable to get store time range from thanos: %w", err) 245 | } 246 | 247 | tp.mu.Lock() 248 | defer tp.mu.Unlock() 249 | 250 | tp.mint = info.Store.MinTime 251 | tp.maxt = info.Store.MaxTime 252 | 253 | return nil 254 | } 255 | 256 | func newBlockForMeta(ctx context.Context, bkt objstore.Bucket, m Meta, opts ...BlockOption) (*Block, error) { 257 | cfg := blockConfig{ 258 |
readBufferSize: 8 * units.MiB, 259 | } 260 | for _, o := range opts { 261 | o(&cfg) 262 | } 263 | 264 | shards, err := readShards(ctx, bkt, m, cfg) 265 | if err != nil { 266 | return nil, fmt.Errorf("unable to read shards: %w", err) 267 | } 268 | 269 | return &Block{meta: m, shards: shards}, nil 270 | } 271 | 272 | func readShards(ctx context.Context, bkt objstore.Bucket, m Meta, cfg blockConfig) ([]*Shard, error) { 273 | shards := make([]*Shard, 0, m.Shards) 274 | for i := 0; i != int(m.Shards); i++ { 275 | shard, err := readShard(ctx, bkt, m, i, cfg) 276 | if err != nil { 277 | return nil, fmt.Errorf("unable to read shard %d: %w", i, err) 278 | } 279 | shards = append(shards, shard) 280 | } 281 | return shards, nil 282 | } 283 | 284 | func readShard(ctx context.Context, bkt objstore.Bucket, m Meta, i int, cfg blockConfig) (*Shard, error) { 285 | chunkspfile := schema.ChunksPfileNameForShard(m.Name, i) 286 | attrs, err := bkt.Attributes(ctx, chunkspfile) 287 | if err != nil { 288 | return nil, fmt.Errorf("unable to attr %s: %w", chunkspfile, err) 289 | } 290 | 291 | bktRdrAt := newBucketReaderAt(bkt, chunkspfile, 1*time.Minute) 292 | 293 | chunkspf, err := parquet.OpenFile(bktRdrAt, attrs.Size, 294 | parquet.FileReadMode(parquet.ReadModeAsync), 295 | parquet.ReadBufferSize(int(cfg.readBufferSize)), 296 | ) 297 | if err != nil { 298 | return nil, fmt.Errorf("unable to open parquet file %s: %w", chunkspfile, err) 299 | } 300 | 301 | labelspfile := schema.LabelsPfileNameForShard(m.Name, i) 302 | rdr, err := bkt.Get(ctx, labelspfile) 303 | if err != nil { 304 | return nil, fmt.Errorf("unable to get %s: %w", labelspfile, err) 305 | } 306 | defer rdr.Close() 307 | 308 | labelspfileBs, err := io.ReadAll(rdr) 309 | if err != nil { 310 | return nil, fmt.Errorf("unable to read %s: %w", labelspfile, err) 311 | } 312 | 313 | labelspf, err := parquet.OpenFile(bytes.NewReader(labelspfileBs), int64(len(labelspfileBs))) 314 | if err != nil { 315 | return nil, fmt.Errorf("unable to open parquet file %s: %w", labelspfile, err) 316 | } 317 | 318 | return &Shard{meta: m, chunkspfile: chunkspf, labelspfile: labelspf}, nil 319 | } 320 | -------------------------------------------------------------------------------- /db/util.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 
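Putting the pieces from db/discover.go and db/syncer.go together: a minimal wiring sketch (assuming a configured objstore.Bucket; in the real server these steps presumably run periodically rather than once):

```go
package main

import (
	"context"

	"github.com/thanos-io/objstore"

	"github.com/cloudflare/parquet-tsdb-poc/db"
)

// openDB discovers block metas in the bucket, syncs the blocks, and exposes
// them as a queryable DB.
func openDB(ctx context.Context, bkt objstore.Bucket) (*db.DB, error) {
	discoverer := db.NewDiscoverer(bkt, db.MetaConcurrency(4))
	if err := discoverer.Discover(ctx); err != nil {
		return nil, err
	}

	syncer := db.NewSyncer(bkt, db.BlockConcurrency(4))
	if err := syncer.Sync(ctx, discoverer.Metas()); err != nil {
		return nil, err
	}
	return db.NewDB(syncer), nil
}
```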
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package db 6 | 7 | import ( 8 | "context" 9 | "fmt" 10 | "io" 11 | "time" 12 | 13 | "github.com/thanos-io/objstore" 14 | ) 15 | 16 | // TODO: timeouts retries whatnot 17 | type bucketReaderAt struct { 18 | bkt objstore.Bucket 19 | name string 20 | 21 | timeout time.Duration 22 | } 23 | 24 | func newBucketReaderAt(bkt objstore.Bucket, name string, timeout time.Duration) *bucketReaderAt { 25 | return &bucketReaderAt{ 26 | bkt: bkt, 27 | name: name, 28 | timeout: timeout, 29 | } 30 | } 31 | 32 | func (br *bucketReaderAt) ReadAt(p []byte, off int64) (n int, err error) { 33 | ctx, cancel := context.WithTimeout(context.Background(), br.timeout) 34 | defer cancel() 35 | 36 | bucketRequests.Inc() 37 | 38 | rdc, err := br.bkt.GetRange(ctx, br.name, off, int64(len(p))) 39 | if err != nil { 40 | return 0, fmt.Errorf("unable to read range for %s: %w", br.name, err) 41 | } 42 | defer rdc.Close() 43 | 44 | return io.ReadFull(rdc, p) 45 | } 46 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/cloudflare/parquet-tsdb-poc 2 | 3 | go 1.23.1 4 | 5 | require ( 6 | github.com/KimMachineGun/automemlimit v0.6.1 7 | github.com/google/go-cmp v0.6.0 8 | github.com/hashicorp/go-multierror v1.1.1 9 | github.com/mostynb/go-grpc-compression v1.2.3 10 | github.com/oklog/run v1.1.0 11 | github.com/parquet-go/parquet-go v0.25.0 12 | github.com/prometheus/client_golang v1.20.5 13 | github.com/prometheus/common v0.61.0 14 | github.com/prometheus/prometheus v0.300.0-beta.0.0.20241007135006-65f610353919 15 | github.com/thanos-io/objstore v0.0.0-20241111205755-d1dd89d41f97 16 | github.com/thanos-io/promql-engine v0.0.0-20241203103240-2f49f80c7c68 17 | github.com/thanos-io/thanos v0.37.2 18 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.57.0 19 | go.opentelemetry.io/otel v1.32.0 20 | go.opentelemetry.io/otel/exporters/jaeger v1.17.0 21 | go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.32.0 22 | go.opentelemetry.io/otel/sdk v1.32.0 23 | google.golang.org/protobuf v1.35.2 24 | gopkg.in/alecthomas/kingpin.v2 v2.2.6 25 | gopkg.in/yaml.v3 v3.0.1 26 | ) 27 | 28 | require ( 29 | github.com/go-kit/log v0.2.1 // indirect 30 | go4.org/intern v0.0.0-20230525184215-6c62f75575cb // indirect 31 | go4.org/unsafe/assume-no-moving-gc v0.0.0-20230525183740-e7c30c78aeb2 // indirect 32 | ) 33 | 34 | require ( 35 | cel.dev/expr v0.18.0 // indirect 36 | cloud.google.com/go v0.116.0 // indirect 37 | cloud.google.com/go/auth v0.11.0 // indirect 38 | cloud.google.com/go/auth/oauth2adapt v0.2.6 // indirect 39 | cloud.google.com/go/compute/metadata v0.5.2 // indirect 40 | cloud.google.com/go/iam v1.2.2 // indirect 41 | cloud.google.com/go/monitoring v1.21.2 // indirect 42 | cloud.google.com/go/storage v1.47.0 // indirect 43 | github.com/Azure/azure-sdk-for-go/sdk/azcore v1.16.0 // indirect 44 | github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.8.0 // indirect 45 | github.com/Azure/azure-sdk-for-go/sdk/internal v1.10.0 // indirect 46 | github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.5.0 // indirect 47 | github.com/AzureAD/microsoft-authentication-library-for-go v1.3.2 // indirect 48 | github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.25.0 // indirect 49 | 
github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.49.0 // indirect 50 | github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.49.0 // indirect 51 | github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect 52 | github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b 53 | github.com/aliyun/aliyun-oss-go-sdk v3.0.2+incompatible // indirect 54 | github.com/andybalholm/brotli v1.1.1 // indirect 55 | github.com/aws/aws-sdk-go v1.55.5 // indirect 56 | github.com/aws/aws-sdk-go-v2 v1.32.5 // indirect 57 | github.com/aws/aws-sdk-go-v2/config v1.28.5 // indirect 58 | github.com/aws/aws-sdk-go-v2/credentials v1.17.46 // indirect 59 | github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.20 // indirect 60 | github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.24 // indirect 61 | github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.24 // indirect 62 | github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1 // indirect 63 | github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.1 // indirect 64 | github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.5 // indirect 65 | github.com/aws/aws-sdk-go-v2/service/sso v1.24.6 // indirect 66 | github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.5 // indirect 67 | github.com/aws/aws-sdk-go-v2/service/sts v1.33.1 // indirect 68 | github.com/aws/smithy-go v1.22.1 // indirect 69 | github.com/baidubce/bce-sdk-go v0.9.201 // indirect 70 | github.com/bboreham/go-loser v0.0.0-20230920113527-fcc2c21820a3 // indirect 71 | github.com/beorn7/perks v1.0.1 // indirect 72 | github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect 73 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 74 | github.com/cilium/ebpf v0.16.0 // indirect 75 | github.com/clbanning/mxj v1.8.4 // indirect 76 | github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 // indirect 77 | github.com/containerd/cgroups/v3 v3.0.4 // indirect 78 | github.com/containerd/log v0.1.0 // indirect 79 | github.com/coreos/go-systemd/v22 v22.5.0 // indirect 80 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 81 | github.com/dennwc/varint v1.0.0 // indirect 82 | github.com/docker/go-units v0.5.0 // indirect 83 | github.com/dustin/go-humanize v1.0.1 // indirect 84 | github.com/edsrzf/mmap-go v1.2.0 // indirect 85 | github.com/efficientgo/core v1.0.0-rc.3 // indirect 86 | github.com/envoyproxy/go-control-plane v0.13.1 // indirect 87 | github.com/envoyproxy/protoc-gen-validate v1.1.0 // indirect 88 | github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb // indirect 89 | github.com/felixge/httpsnoop v1.0.4 // indirect 90 | github.com/go-ini/ini v1.67.0 // indirect 91 | github.com/go-logfmt/logfmt v0.6.0 // indirect 92 | github.com/go-logr/logr v1.4.2 // indirect 93 | github.com/go-logr/stdr v1.2.2 // indirect 94 | github.com/goccy/go-json v0.10.3 // indirect 95 | github.com/godbus/dbus/v5 v5.1.0 // indirect 96 | github.com/gofrs/flock v0.12.1 // indirect 97 | github.com/gogo/protobuf v1.3.2 // indirect 98 | github.com/golang-jwt/jwt/v5 v5.2.1 // indirect 99 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 100 | github.com/golang/snappy v0.0.4 // indirect 101 | github.com/google/go-querystring v1.1.0 // indirect 102 | github.com/google/s2a-go v0.1.8 // indirect 103 | github.com/google/uuid v1.6.0 // indirect 104 | github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect 105 | github.com/googleapis/gax-go/v2 v2.14.0 // indirect 106 
| github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc // indirect 107 | github.com/hashicorp/errwrap v1.1.0 // indirect 108 | github.com/huaweicloud/huaweicloud-sdk-go-obs v3.24.9+incompatible // indirect 109 | github.com/jmespath/go-jmespath v0.4.0 // indirect 110 | github.com/jpillora/backoff v1.0.0 // indirect 111 | github.com/julienschmidt/httprouter v1.3.0 // indirect 112 | github.com/klauspost/compress v1.18.0 // indirect 113 | github.com/klauspost/cpuid/v2 v2.2.9 // indirect 114 | github.com/kylelemons/godebug v1.1.0 // indirect 115 | github.com/mattn/go-runewidth v0.0.16 // indirect 116 | github.com/minio/md5-simd v1.1.2 // indirect 117 | github.com/minio/minio-go/v7 v7.0.81 // indirect 118 | github.com/mitchellh/mapstructure v1.5.0 // indirect 119 | github.com/moby/sys/userns v0.1.0 // indirect 120 | github.com/mozillazg/go-httpheader v0.4.0 // indirect 121 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 122 | github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect 123 | github.com/ncw/swift v1.0.53 // indirect 124 | github.com/oklog/ulid v1.3.1 // indirect 125 | github.com/olekukonko/tablewriter v0.0.5 // indirect 126 | github.com/opencontainers/runtime-spec v1.2.0 // indirect 127 | github.com/oracle/oci-go-sdk/v65 v65.79.0 // indirect 128 | github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect 129 | github.com/pierrec/lz4/v4 v4.1.22 // indirect 130 | github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect 131 | github.com/pkg/errors v0.9.1 // indirect 132 | github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect 133 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect 134 | github.com/prometheus/client_model v0.6.1 // indirect 135 | github.com/prometheus/common/sigv4 v0.1.0 // indirect 136 | github.com/prometheus/procfs v0.15.1 // indirect 137 | github.com/rivo/uniseg v0.4.7 // indirect 138 | github.com/rogpeppe/go-internal v1.13.1 // indirect 139 | github.com/rs/xid v1.6.0 // indirect 140 | github.com/sirupsen/logrus v1.9.3 // indirect 141 | github.com/sony/gobreaker v1.0.0 // indirect 142 | github.com/stretchr/testify v1.10.0 // indirect 143 | github.com/tencentyun/cos-go-sdk-v5 v0.7.59 // indirect 144 | github.com/zhangyunhao116/umap v0.0.0-20241028085443-797b8ba93f64 // indirect 145 | go.opencensus.io v0.24.0 // indirect 146 | go.opentelemetry.io/contrib/detectors/gcp v1.32.0 // indirect 147 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.57.0 // indirect 148 | go.opentelemetry.io/otel/metric v1.32.0 // indirect 149 | go.opentelemetry.io/otel/sdk/metric v1.32.0 // indirect 150 | go.opentelemetry.io/otel/trace v1.32.0 151 | go.uber.org/atomic v1.11.0 // indirect 152 | go.uber.org/goleak v1.3.0 // indirect 153 | golang.org/x/crypto v0.35.0 // indirect 154 | golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f // indirect 155 | golang.org/x/net v0.36.0 // indirect 156 | golang.org/x/oauth2 v0.24.0 // indirect 157 | golang.org/x/sync v0.11.0 // indirect 158 | golang.org/x/sys v0.30.0 // indirect 159 | golang.org/x/text v0.22.0 // indirect 160 | golang.org/x/time v0.8.0 // indirect 161 | gonum.org/v1/gonum v0.15.1 // indirect 162 | google.golang.org/api v0.209.0 // indirect 163 | google.golang.org/genproto v0.0.0-20241118233622-e639e219e697 // indirect 164 | google.golang.org/genproto/googleapis/api v0.0.0-20241118233622-e639e219e697 // indirect 165 | google.golang.org/genproto/googleapis/rpc 
v0.0.0-20241118233622-e639e219e697 // indirect 166 | google.golang.org/grpc v1.68.0 167 | google.golang.org/grpc/stats/opentelemetry v0.0.0-20241028142157-ada6787961b3 // indirect 168 | gopkg.in/yaml.v2 v2.4.0 // indirect 169 | k8s.io/apimachinery v0.31.3 // indirect 170 | k8s.io/client-go v0.31.3 // indirect 171 | k8s.io/klog/v2 v2.130.1 // indirect 172 | k8s.io/utils v0.0.0-20241104163129-6fe5fd82f078 // indirect 173 | ) 174 | -------------------------------------------------------------------------------- /internal/encoding/zigzag.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package encoding 6 | 7 | func ZigZagEncode(x int64) uint64 { 8 | return uint64(uint64(x<<1) ^ uint64((int64(x) >> 63))) 9 | } 10 | 11 | func ZigZagDecode(v uint64) int64 { 12 | return int64((v >> 1) ^ uint64((int64(v&1)<<63)>>63)) 13 | } 14 | -------------------------------------------------------------------------------- /internal/tracing/tracer.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package tracing 6 | 7 | import ( 8 | "context" 9 | "sync" 10 | 11 | "go.opentelemetry.io/otel" 12 | "go.opentelemetry.io/otel/trace" 13 | ) 14 | 15 | func Tracer() trace.Tracer { 16 | return sync.OnceValue(func() trace.Tracer { return otel.GetTracerProvider().Tracer("parquet-gateway") })() 17 | } 18 | 19 | func SpanFromContext(ctx context.Context) trace.Span { 20 | return trace.SpanFromContext(ctx) 21 | } 22 | -------------------------------------------------------------------------------- /internal/util/date.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package util 6 | 7 | import ( 8 | "fmt" 9 | "time" 10 | ) 11 | 12 | func SplitDays(start time.Time, end time.Time) ([]time.Time, error) { 13 | if !alignsToStartOfDay(start) { 14 | return nil, fmt.Errorf("start %q needs to align to start of day", start) 15 | } 16 | if !alignsToStartOfDay(end) { 17 | return nil, fmt.Errorf("end %q needs to align to start of day", end) 18 | } 19 | res := make([]time.Time, 0) 20 | 21 | cur := start 22 | res = append(res, cur.UTC()) 23 | for cur != end { 24 | cur = cur.AddDate(0, 0, 1) 25 | res = append(res, cur.UTC()) 26 | } 27 | 28 | return res, nil 29 | } 30 | 31 | func BeginOfDay(t time.Time) time.Time { 32 | year, month, day := t.Date() 33 | return time.Date(year, month, day, 0, 0, 0, 0, t.Location()) 34 | } 35 | 36 | func EndOfDay(t time.Time) time.Time { 37 | return BeginOfDay(t).AddDate(0, 0, 1) 38 | } 39 | 40 | func alignsToStartOfDay(t time.Time) bool { 41 | return t.Equal(BeginOfDay(t)) 42 | } 43 | -------------------------------------------------------------------------------- /internal/util/interval.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 
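The zig-zag coding in internal/encoding/zigzag.go above maps signed integers to unsigned ones as 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, and so on, so that mint/maxt of either sign survive the uint64 round trip. A suggested round-trip test (it does not exist in the repo):

```go
package encoding

import (
	"math"
	"testing"
)

func TestZigZagRoundTrip(t *testing.T) {
	for _, x := range []int64{0, -1, 1, -2, 2, math.MinInt64, math.MaxInt64} {
		if got := ZigZagDecode(ZigZagEncode(x)); got != x {
			t.Fatalf("round trip of %d gave %d", x, got)
		}
	}
}
```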
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package util 6 | 7 | // Intersects returns if [a, b] and [c, d] intersect. 8 | func Intersects(a, b, c, d int64) bool { 9 | return !(c > b || a > d) 10 | } 11 | 12 | // Contains returns if [a, b] contains [c, d]. 13 | func Contains(a, b, c, d int64) bool { 14 | return a <= c && b >= d 15 | } 16 | 17 | // Intersection gives the intersection of [a, b] and [c, d]. 18 | // It should first be checked if both intersect using Intersects 19 | func Intersection(a, b, c, d int64) (int64, int64) { 20 | return max(a, c), min(b, d) 21 | } 22 | -------------------------------------------------------------------------------- /proto/metapb/meta.pb.go: -------------------------------------------------------------------------------- 1 | // Code generated by protoc-gen-go. DO NOT EDIT. 2 | // versions: 3 | // protoc-gen-go v1.35.2 4 | // protoc v5.28.3 5 | // source: meta.proto 6 | 7 | package metapb 8 | 9 | import ( 10 | protoreflect "google.golang.org/protobuf/reflect/protoreflect" 11 | protoimpl "google.golang.org/protobuf/runtime/protoimpl" 12 | reflect "reflect" 13 | sync "sync" 14 | ) 15 | 16 | const ( 17 | // Verify that this generated code is sufficiently up-to-date. 18 | _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) 19 | // Verify that runtime/protoimpl is sufficiently up-to-date. 20 | _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) 21 | ) 22 | 23 | type Metadata struct { 24 | state protoimpl.MessageState 25 | sizeCache protoimpl.SizeCache 26 | unknownFields protoimpl.UnknownFields 27 | 28 | Mint int64 `protobuf:"varint,1,opt,name=mint,proto3" json:"mint,omitempty"` 29 | Maxt int64 `protobuf:"varint,2,opt,name=maxt,proto3" json:"maxt,omitempty"` 30 | Shards int64 `protobuf:"varint,3,opt,name=shards,proto3" json:"shards,omitempty"` 31 | ColumnsForName map[string]*Columns `protobuf:"bytes,4,rep,name=columnsForName,proto3" json:"columnsForName,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` 32 | } 33 | 34 | func (x *Metadata) Reset() { 35 | *x = Metadata{} 36 | mi := &file_meta_proto_msgTypes[0] 37 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 38 | ms.StoreMessageInfo(mi) 39 | } 40 | 41 | func (x *Metadata) String() string { 42 | return protoimpl.X.MessageStringOf(x) 43 | } 44 | 45 | func (*Metadata) ProtoMessage() {} 46 | 47 | func (x *Metadata) ProtoReflect() protoreflect.Message { 48 | mi := &file_meta_proto_msgTypes[0] 49 | if x != nil { 50 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 51 | if ms.LoadMessageInfo() == nil { 52 | ms.StoreMessageInfo(mi) 53 | } 54 | return ms 55 | } 56 | return mi.MessageOf(x) 57 | } 58 | 59 | // Deprecated: Use Metadata.ProtoReflect.Descriptor instead. 
60 | func (*Metadata) Descriptor() ([]byte, []int) { 61 | return file_meta_proto_rawDescGZIP(), []int{0} 62 | } 63 | 64 | func (x *Metadata) GetMint() int64 { 65 | if x != nil { 66 | return x.Mint 67 | } 68 | return 0 69 | } 70 | 71 | func (x *Metadata) GetMaxt() int64 { 72 | if x != nil { 73 | return x.Maxt 74 | } 75 | return 0 76 | } 77 | 78 | func (x *Metadata) GetShards() int64 { 79 | if x != nil { 80 | return x.Shards 81 | } 82 | return 0 83 | } 84 | 85 | func (x *Metadata) GetColumnsForName() map[string]*Columns { 86 | if x != nil { 87 | return x.ColumnsForName 88 | } 89 | return nil 90 | } 91 | 92 | type Columns struct { 93 | state protoimpl.MessageState 94 | sizeCache protoimpl.SizeCache 95 | unknownFields protoimpl.UnknownFields 96 | 97 | Columns []string `protobuf:"bytes,1,rep,name=columns,proto3" json:"columns,omitempty"` 98 | } 99 | 100 | func (x *Columns) Reset() { 101 | *x = Columns{} 102 | mi := &file_meta_proto_msgTypes[1] 103 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 104 | ms.StoreMessageInfo(mi) 105 | } 106 | 107 | func (x *Columns) String() string { 108 | return protoimpl.X.MessageStringOf(x) 109 | } 110 | 111 | func (*Columns) ProtoMessage() {} 112 | 113 | func (x *Columns) ProtoReflect() protoreflect.Message { 114 | mi := &file_meta_proto_msgTypes[1] 115 | if x != nil { 116 | ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 117 | if ms.LoadMessageInfo() == nil { 118 | ms.StoreMessageInfo(mi) 119 | } 120 | return ms 121 | } 122 | return mi.MessageOf(x) 123 | } 124 | 125 | // Deprecated: Use Columns.ProtoReflect.Descriptor instead. 126 | func (*Columns) Descriptor() ([]byte, []int) { 127 | return file_meta_proto_rawDescGZIP(), []int{1} 128 | } 129 | 130 | func (x *Columns) GetColumns() []string { 131 | if x != nil { 132 | return x.Columns 133 | } 134 | return nil 135 | } 136 | 137 | var File_meta_proto protoreflect.FileDescriptor 138 | 139 | var file_meta_proto_rawDesc = []byte{ 140 | 0x0a, 0x0a, 0x6d, 0x65, 0x74, 0x61, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x04, 0x6d, 0x65, 141 | 0x74, 0x61, 0x22, 0xe8, 0x01, 0x0a, 0x08, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 142 | 0x12, 0x0a, 0x04, 0x6d, 0x69, 0x6e, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x03, 0x52, 0x04, 0x6d, 143 | 0x69, 0x6e, 0x74, 0x12, 0x12, 0x0a, 0x04, 0x6d, 0x61, 0x78, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 144 | 0x03, 0x52, 0x04, 0x6d, 0x61, 0x78, 0x74, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x68, 0x61, 0x72, 0x64, 145 | 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x03, 0x52, 0x06, 0x73, 0x68, 0x61, 0x72, 0x64, 0x73, 0x12, 146 | 0x4a, 0x0a, 0x0e, 0x63, 0x6f, 0x6c, 0x75, 0x6d, 0x6e, 0x73, 0x46, 0x6f, 0x72, 0x4e, 0x61, 0x6d, 147 | 0x65, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x22, 0x2e, 0x6d, 0x65, 0x74, 0x61, 0x2e, 0x4d, 148 | 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x43, 0x6f, 0x6c, 0x75, 0x6d, 0x6e, 0x73, 0x46, 149 | 0x6f, 0x72, 0x4e, 0x61, 0x6d, 0x65, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x0e, 0x63, 0x6f, 0x6c, 150 | 0x75, 0x6d, 0x6e, 0x73, 0x46, 0x6f, 0x72, 0x4e, 0x61, 0x6d, 0x65, 0x1a, 0x50, 0x0a, 0x13, 0x43, 151 | 0x6f, 0x6c, 0x75, 0x6d, 0x6e, 0x73, 0x46, 0x6f, 0x72, 0x4e, 0x61, 0x6d, 0x65, 0x45, 0x6e, 0x74, 152 | 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 153 | 0x03, 0x6b, 0x65, 0x79, 0x12, 0x23, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 154 | 0x01, 0x28, 0x0b, 0x32, 0x0d, 0x2e, 0x6d, 0x65, 0x74, 0x61, 0x2e, 0x43, 0x6f, 0x6c, 0x75, 0x6d, 155 | 0x6e, 0x73, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 
0x38, 0x01, 0x22, 0x23, 0x0a, 156 | 0x07, 0x43, 0x6f, 0x6c, 0x75, 0x6d, 0x6e, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x63, 0x6f, 0x6c, 0x75, 157 | 0x6d, 0x6e, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x09, 0x52, 0x07, 0x63, 0x6f, 0x6c, 0x75, 0x6d, 158 | 0x6e, 0x73, 0x42, 0x35, 0x5a, 0x33, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 159 | 0x2f, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x66, 0x6c, 0x61, 0x72, 0x65, 0x2f, 0x70, 0x61, 0x72, 0x71, 160 | 0x75, 0x65, 0x74, 0x2d, 0x74, 0x73, 0x64, 0x62, 0x2d, 0x70, 0x6f, 0x63, 0x2f, 0x70, 0x72, 0x6f, 161 | 0x74, 0x6f, 0x2f, 0x6d, 0x65, 0x74, 0x61, 0x70, 0x62, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 162 | 0x33, 163 | } 164 | 165 | var ( 166 | file_meta_proto_rawDescOnce sync.Once 167 | file_meta_proto_rawDescData = file_meta_proto_rawDesc 168 | ) 169 | 170 | func file_meta_proto_rawDescGZIP() []byte { 171 | file_meta_proto_rawDescOnce.Do(func() { 172 | file_meta_proto_rawDescData = protoimpl.X.CompressGZIP(file_meta_proto_rawDescData) 173 | }) 174 | return file_meta_proto_rawDescData 175 | } 176 | 177 | var file_meta_proto_msgTypes = make([]protoimpl.MessageInfo, 3) 178 | var file_meta_proto_goTypes = []any{ 179 | (*Metadata)(nil), // 0: meta.Metadata 180 | (*Columns)(nil), // 1: meta.Columns 181 | nil, // 2: meta.Metadata.ColumnsForNameEntry 182 | } 183 | var file_meta_proto_depIdxs = []int32{ 184 | 2, // 0: meta.Metadata.columnsForName:type_name -> meta.Metadata.ColumnsForNameEntry 185 | 1, // 1: meta.Metadata.ColumnsForNameEntry.value:type_name -> meta.Columns 186 | 2, // [2:2] is the sub-list for method output_type 187 | 2, // [2:2] is the sub-list for method input_type 188 | 2, // [2:2] is the sub-list for extension type_name 189 | 2, // [2:2] is the sub-list for extension extendee 190 | 0, // [0:2] is the sub-list for field type_name 191 | } 192 | 193 | func init() { file_meta_proto_init() } 194 | func file_meta_proto_init() { 195 | if File_meta_proto != nil { 196 | return 197 | } 198 | type x struct{} 199 | out := protoimpl.TypeBuilder{ 200 | File: protoimpl.DescBuilder{ 201 | GoPackagePath: reflect.TypeOf(x{}).PkgPath(), 202 | RawDescriptor: file_meta_proto_rawDesc, 203 | NumEnums: 0, 204 | NumMessages: 3, 205 | NumExtensions: 0, 206 | NumServices: 0, 207 | }, 208 | GoTypes: file_meta_proto_goTypes, 209 | DependencyIndexes: file_meta_proto_depIdxs, 210 | MessageInfos: file_meta_proto_msgTypes, 211 | }.Build() 212 | File_meta_proto = out.File 213 | file_meta_proto_rawDesc = nil 214 | file_meta_proto_goTypes = nil 215 | file_meta_proto_depIdxs = nil 216 | } 217 | -------------------------------------------------------------------------------- /proto/metapb/meta.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | package meta; 3 | 4 | option go_package = "github.com/cloudflare/parquet-tsdb-poc/proto/metapb"; 5 | 6 | 7 | message Metadata { 8 | int64 mint = 1; 9 | int64 maxt = 2; 10 | int64 shards = 3; 11 | map columnsForName = 4; 12 | } 13 | 14 | message Columns { 15 | repeated string columns = 1; 16 | } 17 | -------------------------------------------------------------------------------- /revive.toml: -------------------------------------------------------------------------------- 1 | ignoreGeneratedHeader = false 2 | severity = "warning" 3 | confidence = 0.8 4 | errorCode = 1 5 | warningCode = 1 6 | 7 | [rule.blank-imports] 8 | [rule.context-as-argument] 9 | [rule.context-keys-type] 10 | [rule.dot-imports] 11 | [rule.error-return] 12 | [rule.error-strings] 13 | [rule.error-naming] 14 | 
[rule.if-return] 15 | [rule.increment-decrement] 16 | [rule.var-naming] 17 | [rule.var-declaration] 18 | [rule.range] 19 | [rule.receiver-naming] 20 | [rule.time-naming] 21 | [rule.indent-error-flow] 22 | [rule.errorf] 23 | [rule.empty-block] 24 | [rule.superfluous-else] 25 | [rule.unused-parameter] 26 | [rule.unreachable-code] 27 | [rule.redefines-builtin-id] 28 | [rule.file-header] 29 | arguments = ["Copyright \\(c\\) 2025 Cloudflare, Inc."] 30 | 31 | -------------------------------------------------------------------------------- /schema/block.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package schema 6 | 7 | import ( 8 | "fmt" 9 | "path/filepath" 10 | "time" 11 | 12 | "github.com/cloudflare/parquet-tsdb-poc/internal/util" 13 | ) 14 | 15 | const ( 16 | MetaFile = "meta.pb" 17 | dateFormat = "%04d/%02d/%02d" 18 | ) 19 | 20 | func SplitBlockPath(name string) (string, string, bool) { 21 | var ( 22 | year, month, day int 23 | file string 24 | ) 25 | n, err := fmt.Sscanf(name, dateFormat+"/%s", &year, &month, &day, &file) 26 | if err != nil { 27 | return "", "", false 28 | } 29 | if n != 4 { 30 | return "", "", false 31 | } 32 | return filepath.Dir(name), file, true 33 | } 34 | 35 | func BlockNameForDay(t time.Time) (string, error) { 36 | if t.Location() != time.UTC { 37 | return "", fmt.Errorf("block start time %s must be in UTC", t) 38 | } 39 | if !t.Equal(util.BeginOfDay(t)) { 40 | return "", fmt.Errorf("block start time %s must be aligned to a day", t) 41 | } 42 | year, month, day := t.Date() 43 | return fmt.Sprintf(dateFormat, year, int(month), day), nil 44 | } 45 | 46 | func LabelsPfileNameForShard(name string, shard int) string { 47 | return fmt.Sprintf("%s/%d.%s", name, shard, "labels.parquet") 48 | } 49 | func ChunksPfileNameForShard(name string, shard int) string { 50 | return fmt.Sprintf("%s/%d.%s", name, shard, "chunks.parquet") 51 | } 52 | 53 | func MetaFileNameForBlock(name string) string { 54 | return fmt.Sprintf("%s/%s", name, MetaFile) 55 | } 56 | -------------------------------------------------------------------------------- /schema/block_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package schema 6 | 7 | import ( 8 | "testing" 9 | "time" 10 | ) 11 | 12 | func TestBlockNameForDay(t *testing.T) { 13 | t.Run("", func(t *testing.T) { 14 | d := time.Unix(10, 0).UTC() 15 | if _, err := BlockNameForDay(d); err == nil { 16 | t.Fatal("expected error, got none") 17 | } 18 | }) 19 | t.Run("", func(t *testing.T) { 20 | d := time.Unix(0, 0).UTC() 21 | b, err := BlockNameForDay(d) 22 | if err != nil { 23 | t.Fatal("unexpected error: ", err) 24 | } 25 | want := "1970/01/01" 26 | got := b 27 | if want != got { 28 | t.Fatalf("wanted %q, got %q", want, got) 29 | } 30 | }) 31 | t.Run("", func(t *testing.T) { 32 | d := time.Unix(1732320000, 0).UTC() 33 | b, err := BlockNameForDay(d) 34 | if err != nil { 35 | t.Fatal("unexpected error: ", err) 36 | } 37 | want := "2024/11/23" 38 | got := b 39 | if want != got { 40 | t.Fatalf("wanted %q, got %q", want, got) 41 | } 42 | }) 43 | } 44 | -------------------------------------------------------------------------------- /schema/schema.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package schema 6 | 7 | import ( 8 | "fmt" 9 | "slices" 10 | "strings" 11 | "time" 12 | 13 | "github.com/parquet-go/parquet-go" 14 | "github.com/parquet-go/parquet-go/compress/zstd" 15 | ) 16 | 17 | const ( 18 | LabelColumnPrefix = "___cf_meta_label_" 19 | ChunksColumn0 = "___cf_meta_chunk_0" 20 | ChunksColumn1 = "___cf_meta_chunk_1" 21 | ChunksColumn2 = "___cf_meta_chunk_2" 22 | ) 23 | 24 | const ( 25 | ChunkColumnLength = 8 * time.Hour 26 | ChunkColumnsPerDay = 3 27 | ) 28 | 29 | func LabelNameToColumn(lbl string) string { 30 | return fmt.Sprintf("%s%s", LabelColumnPrefix, lbl) 31 | } 32 | 33 | func ColumnToLabelName(col string) string { 34 | return strings.TrimPrefix(col, LabelColumnPrefix) 35 | } 36 | 37 | func BuildSchemaFromLabels(lbls []string) *parquet.Schema { 38 | g := make(parquet.Group) 39 | 40 | for _, lbl := range lbls { 41 | g[LabelNameToColumn(lbl)] = parquet.Optional(parquet.Encoded(parquet.String(), &parquet.RLEDictionary)) 42 | } 43 | 44 | chunkNode := parquet.Encoded(parquet.Leaf(parquet.ByteArrayType), &parquet.DeltaLengthByteArray) 45 | g[ChunksColumn0] = chunkNode 46 | g[ChunksColumn1] = chunkNode 47 | g[ChunksColumn2] = chunkNode 48 | return parquet.NewSchema("tsdb", g) 49 | } 50 | 51 | func WithCompression(s *parquet.Schema) *parquet.Schema { 52 | g := make(parquet.Group) 53 | 54 | for _, c := range s.Columns() { 55 | lc, _ := s.Lookup(c...) 
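// the lookup above cannot miss: c comes straight from s.Columns() on the
// same schema, so dropping the ok value is safe here.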
56 | g[lc.Path[0]] = parquet.Compressed(lc.Node, &zstd.Codec{Level: zstd.SpeedBetterCompression}) 57 | } 58 | 59 | return parquet.NewSchema("uncompressed", g) 60 | } 61 | 62 | func Projection(schema *parquet.Schema, projections []string) *parquet.Schema { 63 | g := make(parquet.Group) 64 | 65 | for i := range projections { 66 | lc, ok := schema.Lookup(projections[i]) 67 | if !ok { 68 | continue 69 | } 70 | g[projections[i]] = lc.Node 71 | } 72 | return parquet.NewSchema("projection", g) 73 | } 74 | 75 | var ( 76 | ChunkColumns = []string{ChunksColumn0, ChunksColumn1, ChunksColumn2} 77 | ) 78 | 79 | func ChunkProjection(s *parquet.Schema) *parquet.Schema { 80 | g := make(parquet.Group) 81 | 82 | for _, c := range ChunkColumns { 83 | lc, ok := s.Lookup(c) 84 | if !ok { 85 | continue 86 | } 87 | g[c] = lc.Node 88 | } 89 | return parquet.NewSchema("chunk-projection", g) 90 | } 91 | 92 | func LabelsProjection(s *parquet.Schema) *parquet.Schema { 93 | g := make(parquet.Group) 94 | 95 | for _, c := range s.Columns() { 96 | if slices.Contains(ChunkColumns, c[0]) { 97 | continue 98 | } 99 | lc, ok := s.Lookup(c...) 100 | if !ok { 101 | continue 102 | } 103 | g[c[0]] = lc.Node 104 | } 105 | return parquet.NewSchema("labels-projection", g) 106 | } 107 | 108 | func Joined(l, r *parquet.Schema) *parquet.Schema { 109 | g := make(parquet.Group) 110 | 111 | for _, c := range l.Columns() { 112 | lc, ok := l.Lookup(c...) 113 | if !ok { 114 | continue 115 | } 116 | g[c[0]] = lc.Node 117 | } 118 | for _, c := range r.Columns() { 119 | lc, ok := r.Lookup(c...) 120 | if !ok { 121 | continue 122 | } 123 | g[c[0]] = lc.Node 124 | } 125 | return parquet.NewSchema("joined", g) 126 | } 127 | -------------------------------------------------------------------------------- /search/constraint.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package search 6 | 7 | import ( 8 | "fmt" 9 | "io" 10 | "slices" 11 | "sort" 12 | 13 | "github.com/parquet-go/parquet-go" 14 | "github.com/prometheus/prometheus/model/labels" 15 | ) 16 | 17 | type equalConstraint struct { 18 | path string 19 | 20 | val parquet.Value 21 | 22 | // set during init 23 | col int 24 | comp func(l, r parquet.Value) int 25 | 26 | inspectPages bool 27 | } 28 | 29 | var _ Constraint = &equalConstraint{} 30 | 31 | func Equal(path string, value parquet.Value) Constraint { 32 | return &equalConstraint{path: path, val: value} 33 | } 34 | 35 | func EqualWithPageCheck(path string, value parquet.Value) Constraint { 36 | return &equalConstraint{path: path, val: value, inspectPages: true} 37 | } 38 | 39 | func (ec *equalConstraint) rowRanges(rg parquet.RowGroup) ([]rowRange, error) { 40 | col, ok := rg.Schema().Lookup(ec.path) 41 | if !ok { 42 | return nil, nil 43 | } 44 | cc := rg.ColumnChunks()[col.ColumnIndex] 45 | 46 | if skip, err := ec.skipByBloomfilter(cc); err != nil { 47 | return nil, fmt.Errorf("unable to skip by bloomfilter: %w", err) 48 | } else if skip { 49 | return nil, nil 50 | } 51 | 52 | var pgs parquet.Pages 53 | if ec.inspectPages { 54 | pgs = cc.Pages() 55 | defer pgs.Close() 56 | } 57 | 58 | oidx, err := cc.OffsetIndex() 59 | if err != nil { 60 | return nil, fmt.Errorf("unable to read offset index: %w", err) 61 | } 62 | cidx, err := cc.ColumnIndex() 63 | if err != nil { 64 | return nil, fmt.Errorf("unable to read column index: %w", err) 65 | } 66 | var buf []parquet.Value 67 | 68 | res := make([]rowRange, 0) 69 | for i := 0; i < cidx.NumPages(); i++ { 70 | if cidx.NullPage(i) { 71 | pagesDiscarded.WithLabelValues(ec.path).Inc() 72 | continue 73 | } 74 | minv, maxv := cidx.MinValue(i), cidx.MaxValue(i) 75 | if !ec.val.IsNull() && !maxv.IsNull() && ec.comp(ec.val, maxv) > 0 { 76 | pagesDiscarded.WithLabelValues(ec.path).Inc() 77 | if cidx.IsDescending() { 78 | break 79 | } 80 | continue 81 | } 82 | if !ec.val.IsNull() && !minv.IsNull() && ec.comp(ec.val, minv) < 0 { 83 | pagesDiscarded.WithLabelValues(ec.path).Inc() 84 | if cidx.IsAscending() { 85 | break 86 | } 87 | continue 88 | } 89 | from := oidx.FirstRowIndex(i) 90 | 91 | // TODO: this would also work for descending columns, but for now 92 | // this is only used on __name__ which is ascending anyway so we dont 93 | // bother implementing it for descending or unordered columns. 
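// For ascending pages we read the page once and binary search it: the two
// sort.Search calls below find the first match and one past the last match,
// which bound the run of rows equal to the needle exactly.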
94 | // TODO: for unordered columns we could inspect the dictionary here 95 | // and for descending columns we just have to flip the inequalities 96 | if ec.inspectPages && cidx.IsAscending() { 97 | if err := pgs.SeekToRow(from); err != nil { 98 | return nil, fmt.Errorf("unable to seek to row: %w", err) 99 | } 100 | pg, err := pgs.ReadPage() 101 | if err != nil { 102 | return nil, fmt.Errorf("unable to read page: %w", err) 103 | } 104 | pagesRead.WithLabelValues(ec.path).Inc() 105 | vals, rows := pg.Values(), pg.NumRows() 106 | buf = slices.Grow(buf, int(rows))[:rows] 107 | n, err := vals.ReadValues(buf) 108 | if err != nil && err != io.EOF { 109 | return nil, fmt.Errorf("unable to read page values") 110 | } 111 | l := sort.Search(n, func(i int) bool { return ec.comp(ec.val, buf[i]) <= 0 }) 112 | r := sort.Search(n, func(i int) bool { return ec.comp(ec.val, buf[i]) < 0 }) 113 | res = append(res, rowRange{from: from + int64(l), count: int64(r - l)}) 114 | } else { 115 | count := rg.NumRows() - from 116 | if i < oidx.NumPages()-1 { 117 | count = oidx.FirstRowIndex(i+1) - from 118 | } 119 | res = append(res, rowRange{from: from, count: count}) 120 | } 121 | } 122 | if len(res) == 0 { 123 | return nil, nil 124 | } 125 | return simplify(res), nil 126 | } 127 | 128 | func (ec *equalConstraint) accept(r parquet.Row) bool { 129 | return ec.comp(r[ec.col], ec.val) == 0 130 | } 131 | 132 | func (ec *equalConstraint) init(s *parquet.Schema) error { 133 | c, ok := s.Lookup(ec.path) 134 | if !ok { 135 | return fmt.Errorf("schema: must contain path: %s", ec.path) 136 | } 137 | if c.Node.Type().Kind() != ec.val.Kind() { 138 | return fmt.Errorf("schema: cannot search value of kind %s in column of kind %s", ec.val.Kind(), c.Node.Type().Kind()) 139 | } 140 | ec.comp = c.Node.Type().Compare 141 | ec.col = c.ColumnIndex 142 | return nil 143 | } 144 | 145 | func (ec *equalConstraint) skipByBloomfilter(cc parquet.ColumnChunk) (bool, error) { 146 | bf := cc.BloomFilter() 147 | if bf == nil { 148 | return false, nil 149 | } 150 | ok, err := bf.Check(ec.val) 151 | if err != nil { 152 | return false, fmt.Errorf("unable to check bloomfilter: %w", err) 153 | } 154 | return !ok, nil 155 | } 156 | 157 | type andConstraint struct { 158 | cs []Constraint 159 | } 160 | 161 | var _ Constraint = &andConstraint{} 162 | 163 | func And(cs ...Constraint) Constraint { 164 | return &andConstraint{cs: cs} 165 | } 166 | 167 | func (ac *andConstraint) rowRanges(rg parquet.RowGroup) ([]rowRange, error) { 168 | var res []rowRange 169 | for i := range ac.cs { 170 | rrs, err := ac.cs[i].rowRanges(rg) 171 | if err != nil { 172 | return nil, fmt.Errorf("unable to get lhs row ranges: %w", err) 173 | } 174 | if i == 0 { 175 | res = rrs 176 | } else { 177 | res = intersectRowRanges(res, rrs) 178 | } 179 | } 180 | return simplify(res), nil 181 | } 182 | 183 | func (ac *andConstraint) accept(r parquet.Row) bool { 184 | for i := range ac.cs { 185 | if !ac.cs[i].accept(r) { 186 | return false 187 | } 188 | } 189 | return true 190 | } 191 | 192 | func (ac *andConstraint) init(s *parquet.Schema) error { 193 | for i := range ac.cs { 194 | if err := ac.cs[i].init(s); err != nil { 195 | return fmt.Errorf("unable to init constraint %d: %w", i, err) 196 | } 197 | } 198 | return nil 199 | } 200 | 201 | type notConstraint struct { 202 | cs Constraint 203 | } 204 | 205 | var _ Constraint = ¬Constraint{} 206 | 207 | func Not(cs Constraint) Constraint { 208 | return ¬Constraint{cs: cs} 209 | } 210 | 211 | func (nc *notConstraint) rowRanges(rg 
parquet.RowGroup) ([]rowRange, error) { 212 | return []rowRange{{from: 0, count: int64(rg.NumRows())}}, nil 213 | } 214 | 215 | func (nc *notConstraint) accept(r parquet.Row) bool { 216 | return !nc.cs.accept(r) 217 | } 218 | 219 | func (nc *notConstraint) init(s *parquet.Schema) error { 220 | return nc.cs.init(s) 221 | } 222 | 223 | type nullConstraint struct { 224 | } 225 | 226 | var _ Constraint = &nullConstraint{} 227 | 228 | func Null() Constraint { 229 | return &nullConstraint{} 230 | } 231 | 232 | func (null *nullConstraint) rowRanges(parquet.RowGroup) ([]rowRange, error) { 233 | return nil, nil 234 | } 235 | 236 | func (null *nullConstraint) accept(parquet.Row) bool { 237 | return false 238 | } 239 | 240 | func (null *nullConstraint) init(_ *parquet.Schema) error { 241 | return nil 242 | } 243 | 244 | type orConstraint struct { 245 | cs []Constraint 246 | } 247 | 248 | var _ Constraint = &orConstraint{} 249 | 250 | func Or(cs ...Constraint) Constraint { 251 | return &orConstraint{cs: cs} 252 | } 253 | 254 | func Set(path string, values []parquet.Value) Constraint { 255 | equals := []Constraint{} 256 | for _, val := range values { 257 | equals = append(equals, Equal(path, val)) 258 | } 259 | return &orConstraint{cs: equals} 260 | } 261 | 262 | func (oc *orConstraint) rowRanges(rg parquet.RowGroup) ([]rowRange, error) { 263 | var res []rowRange 264 | for i := range oc.cs { 265 | rrs, err := oc.cs[i].rowRanges(rg) 266 | if err != nil { 267 | return nil, fmt.Errorf("unable to get lhs row ranges: %w", err) 268 | } 269 | res = append(res, rrs...) 270 | } 271 | return simplify(res), nil 272 | } 273 | 274 | func (oc *orConstraint) accept(r parquet.Row) bool { 275 | for i := range oc.cs { 276 | if oc.cs[i].accept(r) { 277 | return true 278 | } 279 | } 280 | return false 281 | } 282 | 283 | func (oc *orConstraint) init(s *parquet.Schema) error { 284 | for i := range oc.cs { 285 | if err := oc.cs[i].init(s); err != nil { 286 | return fmt.Errorf("unable to init constraint %d: %w", i, err) 287 | } 288 | } 289 | return nil 290 | } 291 | 292 | type regexConstraint struct { 293 | path string 294 | matcher *labels.FastRegexMatcher 295 | 296 | // set during init 297 | col int 298 | cache map[string]bool 299 | } 300 | 301 | var _ Constraint = ®exConstraint{} 302 | 303 | func Regex(path string, regex string) (Constraint, error) { 304 | matcher, err := labels.NewFastRegexMatcher(regex) 305 | if err != nil { 306 | return nil, err 307 | } 308 | 309 | if len(matcher.SetMatches()) > 0 { 310 | vals := []parquet.Value{} 311 | for _, match := range matcher.SetMatches() { 312 | vals = append(vals, parquet.ValueOf(match)) 313 | } 314 | return Set(path, vals), nil 315 | } 316 | 317 | return ®exConstraint{ 318 | matcher: matcher, 319 | path: path, 320 | }, nil 321 | } 322 | 323 | func (ec *regexConstraint) rowRanges(rg parquet.RowGroup) ([]rowRange, error) { 324 | col, ok := rg.Schema().Lookup(ec.path) 325 | if !ok { 326 | return nil, nil 327 | } 328 | cc := rg.ColumnChunks()[col.ColumnIndex] 329 | 330 | oidx, err := cc.OffsetIndex() 331 | if err != nil { 332 | return nil, fmt.Errorf("unable to read offset index: %w", err) 333 | } 334 | cidx, err := cc.ColumnIndex() 335 | if err != nil { 336 | return nil, fmt.Errorf("unable to read column index: %w", err) 337 | } 338 | 339 | res := []rowRange{} 340 | for i := 0; i < cidx.NumPages(); i++ { 341 | if cidx.NullPage(i) { 342 | pagesDiscarded.WithLabelValues(ec.path).Inc() 343 | continue 344 | } 345 | from := oidx.FirstRowIndex(i) 346 | count := rg.NumRows() - from 347 
| if i < oidx.NumPages()-1 { 348 | count = oidx.FirstRowIndex(i+1) - from 349 | } 350 | res = append(res, rowRange{from: from, count: count}) 351 | } 352 | return simplify(res), nil 353 | } 354 | 355 | func (ec *regexConstraint) accept(r parquet.Row) bool { 356 | val := r[ec.col].String() 357 | accept, seen := ec.cache[val] 358 | if !seen { 359 | accept = ec.matcher.MatchString(val) 360 | ec.cache[val] = accept 361 | } 362 | return accept 363 | } 364 | 365 | func (ec *regexConstraint) init(s *parquet.Schema) error { 366 | c, ok := s.Lookup(ec.path) 367 | if !ok { 368 | return fmt.Errorf("schema: must contain path: %s", ec.path) 369 | } 370 | 371 | if c.Node.Type().Kind() != parquet.ByteArray { 372 | return fmt.Errorf("schema: expected string, cannot assert regex of type %s", c.Node.Type().String()) 373 | } 374 | 375 | ec.cache = map[string]bool{} 376 | ec.col = c.ColumnIndex 377 | 378 | return nil 379 | } 380 | -------------------------------------------------------------------------------- /search/metrics.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package search 6 | 7 | import ( 8 | "github.com/hashicorp/go-multierror" 9 | "github.com/prometheus/client_golang/prometheus" 10 | ) 11 | 12 | var ( 13 | pagesDiscarded = prometheus.NewCounterVec(prometheus.CounterOpts{ 14 | Name: "pages_discarded_total", 15 | Help: "Pages discarded during parquet index scans", 16 | }, []string{"column"}, 17 | ) 18 | pagesRead = prometheus.NewCounterVec(prometheus.CounterOpts{ 19 | Name: "pages_read_total", 20 | Help: "Pages read during parquet page scans", 21 | }, []string{"column"}, 22 | ) 23 | ) 24 | 25 | func RegisterMetrics(reg prometheus.Registerer) error { 26 | return multierror.Append(nil, 27 | reg.Register(pagesDiscarded), 28 | reg.Register(pagesRead), 29 | ).ErrorOrNil() 30 | } 31 | -------------------------------------------------------------------------------- /search/rowrange.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package search 6 | 7 | import ( 8 | "sort" 9 | ) 10 | 11 | type rowRange struct { 12 | from int64 13 | count int64 14 | } 15 | 16 | // intersectRowRanges intersects the row ranges from lhs with the row ranges from rhs. 17 | // it assumes that lhs and rhs are simplified and returns a simplified result. 18 | // it operates in O(l+r) time by cursoring through ranges with a two-pointer approach. 
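// For example (taken from TestIntersect), with half-open [from, from+count) ranges:
//
//	intersectRowRanges([]rowRange{{from: 0, count: 4}}, []rowRange{{from: 2, count: 6}})
//	// -> []rowRange{{from: 2, count: 2}}, i.e. the overlap of [0, 4) and [2, 8)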
19 | func intersectRowRanges(lhs []rowRange, rhs []rowRange) []rowRange { 20 | res := make([]rowRange, 0) 21 | for l, r := 0, 0; l < len(lhs) && r < len(rhs); { 22 | al, bl := lhs[l].from, lhs[l].from+lhs[l].count 23 | ar, br := rhs[r].from, rhs[r].from+rhs[r].count 24 | 25 | // check if rows intersect 26 | if al <= br && ar <= bl { 27 | os, oe := max(al, ar), min(bl, br) 28 | res = append(res, rowRange{from: os, count: oe - os}) 29 | } 30 | 31 | // advance the cursor of the range that ends first 32 | if bl <= br { 33 | l++ 34 | } else { 35 | r++ 36 | } 37 | } 38 | 39 | return simplify(res) 40 | } 41 | 42 | func simplify(rr []rowRange) []rowRange { 43 | if len(rr) == 0 { 44 | return nil 45 | } 46 | 47 | sort.Slice(rr, func(i, j int) bool { 48 | return rr[i].from < rr[j].from 49 | }) 50 | 51 | tmp := make([]rowRange, 0) 52 | l := rr[0] 53 | for i := 1; i < len(rr); i++ { 54 | r := rr[i] 55 | al, bl := l.from, l.from+l.count 56 | ar, br := r.from, r.from+r.count 57 | if bl < ar { 58 | tmp = append(tmp, l) 59 | l = r 60 | continue 61 | } 62 | 63 | from := min(al, ar) 64 | count := max(bl, br) - from 65 | if count == 0 { 66 | continue 67 | } 68 | 69 | l = rowRange{ 70 | from: from, 71 | count: count, 72 | } 73 | } 74 | 75 | tmp = append(tmp, l) 76 | res := make([]rowRange, 0, len(tmp)) 77 | for i := range tmp { 78 | if tmp[i].count != 0 { 79 | res = append(res, tmp[i]) 80 | } 81 | } 82 | 83 | return res 84 | } 85 | -------------------------------------------------------------------------------- /search/rowrange_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package search 6 | 7 | import ( 8 | "slices" 9 | "testing" 10 | ) 11 | 12 | func TestIntersect(t *testing.T) { 13 | for _, tt := range []struct{ lhs, rhs, expect []rowRange }{ 14 | { 15 | lhs: []rowRange{{from: 0, count: 4}}, 16 | rhs: []rowRange{{from: 2, count: 6}}, 17 | expect: []rowRange{{from: 2, count: 2}}, 18 | }, 19 | { 20 | lhs: []rowRange{{from: 0, count: 4}}, 21 | rhs: []rowRange{{from: 6, count: 8}}, 22 | expect: []rowRange{}, 23 | }, 24 | { 25 | lhs: []rowRange{{from: 0, count: 4}}, 26 | rhs: []rowRange{{from: 0, count: 4}}, 27 | expect: []rowRange{{from: 0, count: 4}}, 28 | }, 29 | { 30 | lhs: []rowRange{{from: 0, count: 4}, {from: 8, count: 2}}, 31 | rhs: []rowRange{{from: 2, count: 9}}, 32 | expect: []rowRange{{from: 2, count: 2}, {from: 8, count: 2}}, 33 | }, 34 | { 35 | lhs: []rowRange{{from: 0, count: 1}, {from: 4, count: 1}}, 36 | rhs: []rowRange{{from: 2, count: 1}, {from: 6, count: 1}}, 37 | expect: []rowRange{}, 38 | }, 39 | { 40 | lhs: []rowRange{{from: 0, count: 2}, {from: 2, count: 2}}, 41 | rhs: []rowRange{{from: 1, count: 2}, {from: 3, count: 2}}, 42 | expect: []rowRange{{from: 1, count: 3}}, 43 | }, 44 | { 45 | lhs: []rowRange{{from: 0, count: 2}, {from: 5, count: 2}}, 46 | rhs: []rowRange{{from: 0, count: 10}}, 47 | expect: []rowRange{{from: 0, count: 2}, {from: 5, count: 2}}, 48 | }, 49 | { 50 | lhs: []rowRange{{from: 0, count: 2}, {from: 3, count: 1}, {from: 5, count: 2}, {from: 12, count: 10}}, 51 | rhs: []rowRange{{from: 0, count: 10}, {from: 15, count: 32}}, 52 | expect: []rowRange{{from: 0, count: 2}, {from: 3, count: 1}, {from: 5, count: 2}, {from: 15, count: 7}}, 53 | }, 54 | { 55 | lhs: []rowRange{}, 56 | rhs: []rowRange{{from: 0, count: 10}}, 57 | expect: []rowRange{}, 58 | }, 59 | { 
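// an empty side always intersects to nothing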
60 | lhs: []rowRange{{from: 0, count: 10}}, 61 | rhs: []rowRange{}, 62 | expect: []rowRange{}, 63 | }, 64 | { 65 | lhs: []rowRange{}, 66 | rhs: []rowRange{}, 67 | expect: []rowRange{}, 68 | }, 69 | { 70 | lhs: []rowRange{{from: 0, count: 2}}, 71 | rhs: []rowRange{{from: 0, count: 1}, {from: 1, count: 1}, {from: 2, count: 1}}, 72 | expect: []rowRange{{from: 0, count: 2}}, 73 | }, 74 | } { 75 | t.Run("", func(t *testing.T) { 76 | if res := intersectRowRanges(tt.lhs, tt.rhs); !slices.Equal(res, tt.expect) { 77 | t.Fatalf("Expected %q to match %q", res, tt.expect) 78 | } 79 | }) 80 | } 81 | } 82 | 83 | func TestSimplify(t *testing.T) { 84 | for _, tt := range []struct{ in, expect []rowRange }{ 85 | { 86 | in: []rowRange{ 87 | {from: 0, count: 15}, 88 | {from: 4, count: 4}, 89 | }, 90 | expect: []rowRange{ 91 | {from: 0, count: 15}, 92 | }, 93 | }, 94 | { 95 | in: []rowRange{ 96 | {from: 4, count: 4}, 97 | {from: 4, count: 2}, 98 | }, 99 | expect: []rowRange{ 100 | {from: 4, count: 4}, 101 | }, 102 | }, 103 | { 104 | in: []rowRange{ 105 | {from: 0, count: 4}, 106 | {from: 1, count: 5}, 107 | {from: 8, count: 10}, 108 | }, 109 | expect: []rowRange{ 110 | {from: 0, count: 6}, 111 | {from: 8, count: 10}, 112 | }, 113 | }, 114 | } { 115 | t.Run("", func(t *testing.T) { 116 | if res := simplify(tt.in); !slices.Equal(res, tt.expect) { 117 | t.Fatalf("Expected %v to match %v", res, tt.expect) 118 | } 119 | }) 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /search/search.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package search 6 | 7 | import ( 8 | "context" 9 | "errors" 10 | "fmt" 11 | "io" 12 | 13 | "github.com/hashicorp/go-multierror" 14 | "github.com/parquet-go/parquet-go" 15 | 16 | "github.com/cloudflare/parquet-tsdb-poc/internal/tracing" 17 | "github.com/cloudflare/parquet-tsdb-poc/schema" 18 | ) 19 | 20 | type Constraint interface { 21 | // rowRanges returns a set of non-overlapping increasing row indexes that may satisfy the constraint. 22 | rowRanges(rg parquet.RowGroup) ([]rowRange, error) 23 | // accept returns if this constraint is satisfied by the row. 24 | accept(r parquet.Row) bool 25 | // init initializes the constraint with respect to the file schema and projections. 
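	// Match calls init once with the joined label+chunk schema before any
	// rowRanges or accept calls, so implementations can cache column indexes.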
26 | init(s *parquet.Schema) error 27 | } 28 | 29 | type RowReaderCloser interface { 30 | parquet.RowReader 31 | io.Closer 32 | } 33 | 34 | func Match( 35 | ctx context.Context, 36 | c Constraint, 37 | labelPfile *parquet.File, 38 | labelSchema *parquet.Schema, 39 | chunkPfile *parquet.File, 40 | chunkSchema *parquet.Schema, 41 | ) (RowReaderCloser, error) { 42 | ctx, span := tracing.Tracer().Start(ctx, "Match") 43 | defer span.End() 44 | 45 | labelRowGroups := labelPfile.RowGroups() 46 | chunkRowGroups := chunkPfile.RowGroups() 47 | 48 | joinedSchema := schema.Joined(labelSchema, chunkSchema) 49 | if err := c.init(joinedSchema); err != nil { 50 | return nil, fmt.Errorf("unable to initialize constraints: %w", err) 51 | } 52 | 53 | // label and chunk files have same number of rows and rowgroups, just pick either 54 | numRowGroups := len(labelRowGroups) 55 | 56 | rrs := make([]RowReaderCloser, 0, numRowGroups) 57 | for i := 0; i != numRowGroups; i++ { 58 | ranges, err := c.rowRanges(labelRowGroups[i]) 59 | if err != nil { 60 | return nil, fmt.Errorf("unable to compute ranges for row group: %w", err) 61 | } 62 | if len(ranges) == 0 { 63 | continue 64 | } 65 | 66 | columnChunks := make([]parquet.ColumnChunk, 0, len(joinedSchema.Columns())) 67 | for _, p := range joinedSchema.Columns() { 68 | if col, ok := labelRowGroups[i].Schema().Lookup(p...); ok { 69 | columnChunks = append(columnChunks, labelRowGroups[i].ColumnChunks()[col.ColumnIndex]) 70 | } else if col, ok := chunkRowGroups[i].Schema().Lookup(p...); ok { 71 | columnChunks = append(columnChunks, chunkRowGroups[i].ColumnChunks()[col.ColumnIndex]) 72 | } else { 73 | // nothing to read here really 74 | continue 75 | } 76 | } 77 | rrs = append(rrs, newRangesRowReader(ranges, newRowGroupRows(joinedSchema, columnChunks))) 78 | } 79 | return newFilterRowReader(newConcatRowReader(rrs), c.accept), nil 80 | } 81 | 82 | type rangesRowReader struct { 83 | ranges []rowRange 84 | rows parquet.Rows 85 | 86 | n int 87 | rMaxRow int 88 | rCurRow int 89 | } 90 | 91 | func newRangesRowReader(ranges []rowRange, rows parquet.Rows) *rangesRowReader { 92 | return &rangesRowReader{ranges: ranges, rows: rows, n: -1} 93 | } 94 | 95 | func (r *rangesRowReader) next() error { 96 | if r.n == len(r.ranges)-1 { 97 | return io.EOF 98 | } 99 | r.n++ 100 | r.rMaxRow = int(r.ranges[r.n].count) 101 | r.rCurRow = 0 102 | return r.rows.SeekToRow(r.ranges[r.n].from) 103 | } 104 | 105 | func (r *rangesRowReader) ReadRows(buf []parquet.Row) (int, error) { 106 | canRead := r.rMaxRow - r.rCurRow 107 | if canRead == 0 { 108 | if err := r.next(); err != nil { 109 | return 0, err 110 | } 111 | canRead = r.rMaxRow - r.rCurRow 112 | } 113 | buf = buf[:min(len(buf), canRead)] 114 | 115 | n, err := r.rows.ReadRows(buf) 116 | if err != nil { 117 | return n, err 118 | } 119 | r.rCurRow += n 120 | return n, err 121 | 122 | } 123 | 124 | func (r *rangesRowReader) Close() error { 125 | return r.rows.Close() 126 | } 127 | 128 | type concatRowReader struct { 129 | idx int 130 | rrs []RowReaderCloser 131 | } 132 | 133 | func newConcatRowReader(rrs []RowReaderCloser) *concatRowReader { 134 | return &concatRowReader{rrs: rrs} 135 | } 136 | 137 | func (f *concatRowReader) ReadRows(r []parquet.Row) (int, error) { 138 | if f.idx >= len(f.rrs) { 139 | return 0, io.EOF 140 | } 141 | n := 0 142 | for n != len(r) && f.idx != len(f.rrs) { 143 | m, err := f.rrs[f.idx].ReadRows(r[n:]) 144 | n += m 145 | if err != nil { 146 | if err == io.EOF { 147 | f.idx++ 148 | } else { 149 | return n, err 150 | } 151 | 
} 152 | } 153 | if n != len(r) { 154 | return n, io.EOF 155 | } 156 | return n, nil 157 | } 158 | 159 | func (f *concatRowReader) Close() error { 160 | var err *multierror.Error 161 | for i := range f.rrs { 162 | err = multierror.Append(err, f.rrs[i].Close()) 163 | } 164 | return err.ErrorOrNil() 165 | } 166 | 167 | type filterRowReader struct { 168 | rr parquet.RowReader 169 | closer io.Closer 170 | } 171 | 172 | func newFilterRowReader(rr RowReaderCloser, accept func(r parquet.Row) bool) *filterRowReader { 173 | return &filterRowReader{rr: parquet.FilterRowReader(rr, accept), closer: rr} 174 | } 175 | 176 | func (f *filterRowReader) ReadRows(r []parquet.Row) (int, error) { 177 | return f.rr.ReadRows(r) 178 | } 179 | 180 | func (f *filterRowReader) Close() error { 181 | return f.closer.Close() 182 | } 183 | 184 | // Copied from parquet-go https://github.com/parquet-go/parquet-go/blob/main/row_group.go 185 | // Needs to be upstreamed eventually; Adapted to work with column chunks and joined schema 186 | 187 | type columnChunkValueReader struct { 188 | pages parquet.Pages 189 | page parquet.Page 190 | values parquet.ValueReader 191 | release func(parquet.Page) 192 | } 193 | 194 | func (r *columnChunkValueReader) clear() { 195 | if r.page != nil { 196 | r.release(r.page) 197 | r.page = nil 198 | r.values = nil 199 | } 200 | } 201 | 202 | func (r *columnChunkValueReader) Reset() { 203 | if r.pages != nil { 204 | // Ignore errors because we are resetting the reader, if the error 205 | // persists we will see it on the next read, and otherwise we can 206 | // read back from the beginning. 207 | r.pages.SeekToRow(0) 208 | } 209 | r.clear() 210 | } 211 | 212 | func (r *columnChunkValueReader) Close() error { 213 | var err error 214 | if r.pages != nil { 215 | err = r.pages.Close() 216 | r.pages = nil 217 | } 218 | r.clear() 219 | return err 220 | } 221 | 222 | func (r *columnChunkValueReader) ReadValues(values []parquet.Value) (int, error) { 223 | if r.pages == nil { 224 | return 0, io.EOF 225 | } 226 | 227 | for { 228 | if r.values == nil { 229 | p, err := r.pages.ReadPage() 230 | if err != nil { 231 | return 0, err 232 | } 233 | r.page = p 234 | r.values = p.Values() 235 | } 236 | 237 | n, err := r.values.ReadValues(values) 238 | if n > 0 { 239 | return n, nil 240 | } 241 | if err == nil { 242 | return 0, io.ErrNoProgress 243 | } 244 | if err != io.EOF { 245 | return 0, err 246 | } 247 | r.clear() 248 | } 249 | } 250 | 251 | func (r *columnChunkValueReader) SeekToRow(rowIndex int64) error { 252 | if r.pages == nil { 253 | return io.ErrClosedPipe 254 | } 255 | if err := r.pages.SeekToRow(rowIndex); err != nil { 256 | return err 257 | } 258 | r.clear() 259 | return nil 260 | } 261 | 262 | type rowGroupRows struct { 263 | schema *parquet.Schema 264 | bufsize int 265 | buffers []parquet.Value 266 | columns []columnChunkRows 267 | closed bool 268 | rowIndex int64 269 | } 270 | 271 | type columnChunkRows struct { 272 | offset int32 273 | length int32 274 | reader columnChunkValueReader 275 | } 276 | 277 | func (r *rowGroupRows) buffer(i int) []parquet.Value { 278 | j := (i + 0) * r.bufsize 279 | k := (i + 1) * r.bufsize 280 | return r.buffers[j:k:k] 281 | } 282 | 283 | func newRowGroupRows(schema *parquet.Schema, columns []parquet.ColumnChunk) *rowGroupRows { 284 | bufferSize := 64 285 | r := &rowGroupRows{ 286 | schema: schema, 287 | bufsize: bufferSize, 288 | buffers: make([]parquet.Value, len(columns)*bufferSize), 289 | columns: make([]columnChunkRows, len(columns)), 290 | rowIndex: -1, 291 | } 
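	// buffers is a single flat allocation shared by all columns: buffer(i)
	// above carves out the i-th bufsize-sized window as column i's scratch space.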
292 | 293 | for i, column := range columns { 294 | var release func(parquet.Page) 295 | // Only release pages that are not byte array because the values 296 | // that were read from the page might be retained by the program 297 | // after calls to ReadRows. 298 | switch column.Type().Kind() { 299 | case parquet.ByteArray, parquet.FixedLenByteArray: 300 | release = func(parquet.Page) {} 301 | default: 302 | release = parquet.Release 303 | } 304 | r.columns[i].reader.release = release 305 | r.columns[i].reader.pages = column.Pages() 306 | } 307 | return r 308 | } 309 | 310 | func (r *rowGroupRows) clear() { 311 | for i, c := range r.columns { 312 | r.columns[i] = columnChunkRows{reader: c.reader} 313 | } 314 | clear(r.buffers) 315 | } 316 | 317 | func (r *rowGroupRows) Reset() { 318 | for i := range r.columns { 319 | r.columns[i].reader.Reset() 320 | } 321 | r.clear() 322 | } 323 | 324 | func (r *rowGroupRows) Close() error { 325 | var errs []error 326 | for i := range r.columns { 327 | c := &r.columns[i] 328 | c.offset = 0 329 | c.length = 0 330 | if err := c.reader.Close(); err != nil { 331 | errs = append(errs, err) 332 | } 333 | } 334 | r.clear() 335 | r.closed = true 336 | return errors.Join(errs...) 337 | } 338 | 339 | func (r *rowGroupRows) SeekToRow(rowIndex int64) error { 340 | if r.closed { 341 | return io.ErrClosedPipe 342 | } 343 | if rowIndex != r.rowIndex { 344 | for i := range r.columns { 345 | if err := r.columns[i].reader.SeekToRow(rowIndex); err != nil { 346 | return err 347 | } 348 | } 349 | r.clear() 350 | r.rowIndex = rowIndex 351 | } 352 | return nil 353 | } 354 | 355 | func (r *rowGroupRows) ReadRows(rows []parquet.Row) (int, error) { 356 | if r.closed { 357 | return 0, io.EOF 358 | } 359 | 360 | for rowIndex := range rows { 361 | rows[rowIndex] = rows[rowIndex][:0] 362 | } 363 | 364 | // When this is the first call to ReadRows, we issue a seek to the first row 365 | // because this starts prefetching pages asynchronously on columns. 366 | // 367 | // This condition does not apply if SeekToRow was called before ReadRows, 368 | // only when ReadRows is the very first method called on the row reader. 369 | if r.rowIndex < 0 { 370 | if err := r.SeekToRow(0); err != nil { 371 | return 0, err 372 | } 373 | } 374 | 375 | eofCount := 0 376 | rowCount := 0 377 | 378 | readColumnValues: 379 | for columnIndex := range r.columns { 380 | c := &r.columns[columnIndex] 381 | b := r.buffer(columnIndex) 382 | eof := false 383 | 384 | for rowIndex := range rows { 385 | numValuesInRow := 1 386 | 387 | for { 388 | if c.offset == c.length { 389 | n, err := c.reader.ReadValues(b) 390 | c.offset = 0 391 | c.length = int32(n) 392 | 393 | if n == 0 { 394 | if err == io.EOF { 395 | eof = true 396 | eofCount++ 397 | break 398 | } 399 | return 0, err 400 | } 401 | } 402 | 403 | values := b[c.offset:c.length:c.length] 404 | for numValuesInRow < len(values) && values[numValuesInRow].RepetitionLevel() != 0 { 405 | numValuesInRow++ 406 | } 407 | if numValuesInRow == 0 { 408 | break 409 | } 410 | 411 | rows[rowIndex] = append(rows[rowIndex], values[:numValuesInRow]...) 
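				// grow the filled-row watermark to cover this row and consume
				// the values we just copied out of the column buffer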
412 | rowCount = max(rowCount, rowIndex+1) 413 | c.offset += int32(numValuesInRow) 414 | 415 | if numValuesInRow != len(values) { 416 | break 417 | } 418 | if eof { 419 | continue readColumnValues 420 | } 421 | numValuesInRow = 0 422 | } 423 | } 424 | } 425 | 426 | var err error 427 | if eofCount > 0 { 428 | err = io.EOF 429 | } 430 | r.rowIndex += int64(rowCount) 431 | return rowCount, err 432 | } 433 | 434 | func (r *rowGroupRows) Schema() *parquet.Schema { 435 | return r.schema 436 | } 437 | -------------------------------------------------------------------------------- /search/search_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | package search 6 | 7 | import ( 8 | "bytes" 9 | "context" 10 | "io" 11 | "slices" 12 | "testing" 13 | 14 | "github.com/parquet-go/parquet-go" 15 | 16 | "github.com/cloudflare/parquet-tsdb-poc/schema" 17 | ) 18 | 19 | func TestSearch(t *testing.T) { 20 | ctx := context.Background() 21 | t.Run("", func(t *testing.T) { 22 | type S struct { 23 | A int64 24 | B int64 25 | C string 26 | } 27 | srows := []S{ 28 | { 29 | A: 1, 30 | B: 2, 31 | C: "a", 32 | }, 33 | { 34 | A: 3, 35 | B: 4, 36 | C: "b", 37 | }, 38 | { 39 | A: 7, 40 | B: 12, 41 | C: "c", 42 | }, 43 | { 44 | A: 9, 45 | B: 22, 46 | C: "d", 47 | }, 48 | { 49 | A: 0, 50 | B: 1, 51 | C: "e", 52 | }, 53 | { 54 | A: 0, 55 | B: 1, 56 | C: "f", 57 | }, 58 | { 59 | A: 0, 60 | B: 1, 61 | C: "g", 62 | }, 63 | { 64 | A: 0, 65 | B: 1, 66 | C: "h", 67 | }, 68 | } 69 | type T struct { 70 | D string 71 | } 72 | 73 | trows := []T{ 74 | { 75 | D: "h", 76 | }, 77 | { 78 | D: "g", 79 | }, 80 | { 81 | D: "f", 82 | }, 83 | { 84 | D: "e", 85 | }, 86 | { 87 | D: "d", 88 | }, 89 | { 90 | D: "c", 91 | }, 92 | { 93 | D: "b", 94 | }, 95 | { 96 | D: "a", 97 | }, 98 | } 99 | sfile := buildFile(t, srows) 100 | tfile := buildFile(t, trows) 101 | 102 | t.Run("", func(t *testing.T) { 103 | constraint := And( 104 | Not(Equal("B", parquet.ValueOf(3))), 105 | Or( 106 | Equal("C", parquet.ValueOf("f")), 107 | Equal("C", parquet.ValueOf("g")), 108 | ), 109 | ) 110 | 111 | sproj := schema.Projection(sfile.Schema(), []string{"B", "C"}) 112 | tproj := schema.Projection(tfile.Schema(), []string{"D"}) 113 | 114 | rr, err := Match( 115 | ctx, 116 | constraint, 117 | sfile, sproj, 118 | tfile, tproj, 119 | ) 120 | if err != nil { 121 | t.Fatal(err) 122 | } 123 | defer rr.Close() 124 | 125 | got := readAll(t, rr) 126 | expect := []parquet.Row{ 127 | {parquet.ValueOf(1), parquet.ValueOf("f"), parquet.ValueOf("c")}, 128 | {parquet.ValueOf(1), parquet.ValueOf("g"), parquet.ValueOf("b")}, 129 | } 130 | 131 | if !equalRows(got, expect) { 132 | t.Fatalf("expected %q to equal %q", got, expect) 133 | } 134 | }) 135 | }) 136 | } 137 | 138 | func equalRows(l, r []parquet.Row) bool { 139 | return slices.EqualFunc(l, r, func(ll, rr parquet.Row) bool { 140 | return equalRow(ll, rr) 141 | }) 142 | } 143 | 144 | func equalRow(l, r parquet.Row) bool { 145 | return slices.EqualFunc(l, r, func(lv, rv parquet.Value) bool { 146 | return lv.String() == rv.String() 147 | }) 148 | } 149 | 150 | func buildFile[T any](t testing.TB, rows []T) *parquet.File { 151 | buf := bytes.NewBuffer(nil) 152 | w := parquet.NewGenericWriter[T](buf, parquet.PageBufferSize(12), parquet.WriteBufferSize(0)) 153 | for _, row := range rows { 154 | if _, err := w.Write([]T{row}); 
err != nil { 155 | t.Fatal(err) 156 | } 157 | } 158 | if err := w.Close(); err != nil { 159 | t.Fatal(err) 160 | } 161 | reader := bytes.NewReader(buf.Bytes()) 162 | file, err := parquet.OpenFile(reader, reader.Size()) 163 | if err != nil { 164 | t.Fatal(err) 165 | } 166 | return file 167 | } 168 | 169 | type testRows struct { 170 | p int 171 | rows []parquet.Row 172 | } 173 | 174 | // the parts of the parquet.Rows interface we care about 175 | func (trs *testRows) Close() error { return nil } 176 | 177 | func (trs *testRows) ReadRows(r []parquet.Row) (int, error) { 178 | if trs.p >= len(trs.rows) { 179 | return 0, io.EOF 180 | } 181 | n := copy(r, trs.rows[trs.p:]) 182 | trs.p += n 183 | 184 | if trs.p == len(trs.rows) { 185 | return n, io.EOF 186 | } 187 | return n, nil 188 | } 189 | 190 | func TestConcatRowReader(t *testing.T) { 191 | ccr := newConcatRowReader([]RowReaderCloser{ 192 | &testRows{ 193 | rows: []parquet.Row{{parquet.ValueOf(0), parquet.ValueOf(1)}, {parquet.ValueOf(3)}}, 194 | }, 195 | &testRows{ 196 | rows: []parquet.Row{{parquet.ValueOf(4), parquet.ValueOf(5)}, {parquet.ValueOf(6)}}, 197 | }, 198 | }) 199 | 200 | got := readAll(t, ccr) 201 | expect := []parquet.Row{ 202 | {parquet.ValueOf(0), parquet.ValueOf(1)}, 203 | {parquet.ValueOf(3)}, 204 | {parquet.ValueOf(4), parquet.ValueOf(5)}, 205 | {parquet.ValueOf(6)}, 206 | } 207 | 208 | if !equalRows(got, expect) { 209 | t.Fatalf("expected %q to equal %q", got, expect) 210 | } 211 | } 212 | 213 | func readAll(t *testing.T, rr parquet.RowReader) []parquet.Row { 214 | res := make([]parquet.Row, 0) 215 | 216 | rw := parquet.RowWriterFunc(func(rs []parquet.Row) (int, error) { 217 | res = slices.Grow(res, len(rs)) 218 | for _, r := range rs { 219 | res = append(res, r.Clone()) 220 | } 221 | return len(rs), nil 222 | }) 223 | if _, err := parquet.CopyRows(rw, rr); err != nil { 224 | t.Fatal(err) 225 | } 226 | 227 | return res 228 | } 229 | -------------------------------------------------------------------------------- /shell.nix: -------------------------------------------------------------------------------- 1 | let 2 | pkgs = import <nixpkgs> { }; 3 | in 4 | pkgs.mkShell { 5 | name = "env"; 6 | hardeningDisable = [ "fortify" ]; 7 | buildInputs = with pkgs; [ 8 | go_1_23 9 | gotools 10 | delve 11 | revive 12 | duckdb 13 | protobuf 14 | protoc-gen-go 15 | ]; 16 | shellHook = '' 17 | export PATH="$(go env GOPATH)/bin:$PATH" 18 | export GOPATH="$(go env GOPATH)" 19 | ''; 20 | } 21 | 22 | --------------------------------------------------------------------------------