├── .cargo └── config ├── .github └── workflows │ └── actions.yml ├── .gitignore ├── .travis.yml ├── Cargo.lock ├── Cargo.toml ├── Dockerfile ├── LICENSE ├── README.md ├── statsrelay-protobuf ├── Cargo.toml ├── build.rs ├── proto │ └── ext │ │ └── github.com │ │ ├── gogo │ │ └── protobuf │ │ │ └── gogoproto │ │ │ └── gogo.proto │ │ └── prometheus │ │ └── prometheus │ │ └── prompb │ │ ├── remote.proto │ │ └── types.proto └── src │ └── lib.rs └── statsrelay ├── Cargo.toml ├── benches └── statsd_benchmark.rs ├── build.rs ├── examples ├── processors-basic.json ├── source-example1.json ├── tugboat-discovery.json └── tugboat-legacy-basic.json └── src ├── admin.rs ├── backend.rs ├── backend_client ├── mod.rs ├── prom_client.rs └── statsd_client.rs ├── backends.rs ├── cmd ├── loadgen.rs └── statsrelay.rs ├── config.rs ├── cuckoofilter ├── LICENSE ├── bucket.rs ├── mod.rs └── util.rs ├── discovery.rs ├── lib.rs ├── processors ├── cardinality.rs ├── mod.rs ├── regex_filter.rs ├── sampler.rs └── tag.rs ├── shard.rs ├── stats.rs ├── statsd_proto.rs └── statsd_server.rs /.cargo/config: -------------------------------------------------------------------------------- 1 | [profile.release] 2 | lto = true 3 | -------------------------------------------------------------------------------- /.github/workflows/actions.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | jobs: 3 | build: 4 | runs-on: ubuntu-20.04 5 | steps: 6 | - uses: actions/checkout@v2 7 | - run: docker build . 8 | 9 | 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | *.o 3 | *.py[co] 4 | *.swp 5 | *.trs 6 | .deps 7 | .idea 8 | .dirstamp 9 | /target 10 | .vscode/settings.json 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | dist: bionic 3 | services: 4 | - docker 5 | before_install: 6 | - docker build . 7 | 8 | 9 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "statsrelay", 4 | "statsrelay-protobuf" 5 | ] 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM fedora:33 2 | RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 3 | 4 | ENV PATH="$PATH:$HOME/.cargo/bin" 5 | RUN dnf install -y gcc make g++ openssl-devel 6 | 7 | COPY . /code 8 | WORKDIR /code 9 | 10 | RUN $HOME/.cargo/bin/cargo test --release && \ 11 | $HOME/.cargo/bin/cargo build --release 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | tugboat 2 | Copyright (c) 2020 Lyft Inc. 
3 | The MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to 7 | deal in the Software without restriction, including without limitation the 8 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 9 | sell copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 | IN THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # statsrelay (3.0) 2 | A veritable toolkit, sidecar, or daemon(set) for sharding, aggregating, relaying, 3 | and working with statsd and Prometheus-based metrics sources, at scale. 4 | 5 | ## License 6 | MIT License 7 | Copyright (c) 2015-2020 Lyft Inc. 8 | 9 | Originally based on statsrelay: 10 | Copyright (c) 2014 Uber Technologies, Inc. 11 | 12 | ## What's different in this version 13 | 14 | Statsrelay 3.0 is a port of the original C statsrelay to Rust, with a number of 15 | new features designed to improve the operability and scalability of the 16 | original daemon, moving beyond pure "relaying" and focusing instead on both 17 | sharding and cascading aggregation. The original C daemon in this fork 18 | featured sampling support, but was limited by statsd being its only 19 | output format. 20 | 21 | ## Build 22 | 23 | Dependencies: 24 | - Rust (stable, 1.46+) 25 | 26 | ## Use 27 | 28 | ``` 29 | statsrelay 3.1.0 30 | 31 | USAGE: 32 | statsrelay [OPTIONS] 33 | 34 | FLAGS: 35 | -h, --help Prints help information 36 | -V, --version Prints version information 37 | 38 | OPTIONS: 39 | -c, --config <config> [default: /etc/statsrelay.json] 40 | ``` 41 | 42 | Statsrelay logging is handled by the env_logger crate, which inherits a number of 43 | logging options from the environment. Consult the [crate 44 | documentation](https://docs.rs/env_logger/0.8.1/env_logger/#enabling-logging) 45 | for more information on options you can set. 46 | 47 | ### Protocols 48 | 49 | Statsrelay understands: 50 | 51 | - statsd text line protocol 52 | - with sampling support (`@sampling`) 53 | - with extended data types (map, kv, sets, etc.) 54 | - with "DogStatsD" extended tags (`|#tags`) 55 | - with Lyft internal tags (`metric.__tag=value`) 56 | 57 | ### Configuration file 58 | 59 | The configuration file is a JSON file originating from the original statsrelay 60 | structure. The original statsrelay configuration contract has been broken as of 61 | version 3.1 in order to fix a number of features.
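For example, a typical invocation pointing at the default configuration path, with debug logging enabled through env_logger's standard `RUST_LOG` variable (the values here are only illustrative), might look like:

```
RUST_LOG=debug statsrelay --config /etc/statsrelay.json
```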
62 | 63 | #### Basic structure 64 | 65 | ```json 66 | { 67 | "statsd": { 68 | "bind": "127.0.0.1:8129", 69 | "validate": true, 70 | "backends": { 71 | "b1": { 72 | "shard_map": ["127.0.0.1:1234"], 73 | "prefix": "myapp.", 74 | "suffix": ".suffix" 75 | } 76 | } 77 | }, 78 | "discovery": { 79 | "sources": { 80 | "source1": { 81 | "type": "s3", 82 | "bucket": "my-bucket", 83 | "key": "file.json", 84 | "interval": 10 85 | } 86 | } 87 | } 88 | } 89 | ``` 90 | 91 | Statsd inputs and routing are defined in the outer `statsd` block. 92 | 93 | - `bind`: sets the server bind address to accept statsd protocol messages. 94 | Statsrelay will bind on both UDP and TCP ports. 95 | - `validate`: turns on extended, more expensive validation of statsd line 96 | protocol messages, such as parsing of numerical fields, which may not be 97 | required for a pure relaying case. 98 | - `backends`: forks the incoming statsd metrics down a number of parallel 99 | processing pipelines. By default, all incoming protocol lines from the statsd 100 | server are sent to all backends. 101 | 102 | #### `backends` options 103 | 104 | Each backend is named and can accept a number of options and rewrite steps for 105 | sending and processing StatsD messages. 106 | 107 | - `shard_map`: list of socket addresses that defines where to send statsd 108 | output. The same server can be specified more than once (allowing for 109 | virtual sharding). Output statsd lines are consistently hashed and sent to 110 | the corresponding server based on a standard hash ring, in a format 111 | compatible with the original statsrelay code (Murmur3 hash). This list 112 | can be empty to not relay statsd messages. 113 | - `shard_map_source`: string value which defines a discovery source to use 114 | in lieu of `shard_map`. 115 | - `prefix`: prepend this string to every metric/statsd line before 116 | forwarding it to the `shard_map` servers. Useful for tagging metrics coming 117 | from a sidecar. 118 | - `suffix`: append a suffix. Works like `prefix`, just at the end. 119 | - `max_queue`: maximum number of messages to queue before dropping. Allows 120 | the sender to keep making overall progress when one backend is down. 121 | Defaults to 10,000. 122 | 123 | #### `discovery` options 124 | 125 | Each key in the discovery sources section defines a source which can be used by 126 | most backends to locate servers, shards, or other network resources to 127 | communicate with. Each named discovery source is listed in the `sources` subkey: 128 | 129 | ```json 130 | { 131 | "sources": { 132 | "source_name_1": { 133 | "type": "static_file" 134 | }, 135 | "source_name_2": { 136 | "type": "s3" 137 | } 138 | } 139 | } 140 | ``` 141 | 142 | For sources supporting a file input (s3, static_file), the following schema is 143 | assumed: 144 | 145 | ```json 146 | { 147 | "hosts": ["host:port", "host:port"] 148 | } 149 | ``` 150 | 151 | Some sources may support rewriting to transform the input string into an output 152 | string (e.g., to add a port). 153 | 154 | ##### s3 source 155 | 156 | An S3 source represents an AWS S3-compatible source. Statsrelay uses `rusoto_s3` 157 | to access S3 and supports the vast majority of metadata sources, configuration, 158 | and environment variables in order to locate credentials.
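As a sketch combining the `shard_map_source` backend option with the s3 keys documented below (the source name, bucket, key, interval, and port here are placeholders, not values from this repository):

```json
{
  "statsd": {
    "bind": "127.0.0.1:8129",
    "backends": {
      "b1": { "shard_map_source": "s3_shards" }
    }
  },
  "discovery": {
    "sources": {
      "s3_shards": {
        "type": "s3",
        "bucket": "example-bucket",
        "key": "statsd/shards.json",
        "interval": 30,
        "format": "{}:8125"
      }
    }
  }
}
```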
159 | 160 | The following keys are supported for the S3 source: 161 | 162 | - `bucket` - The S3 bucket where the file lives 163 | - `key` - The key/path inside the S3 bucket 164 | - `interval` - An integer number of seconds to wait before re-polling the 165 | contents of the S3 key to detect changes. 166 | - `format` - A simple text substitution to run on the incoming text, where `{}` is 167 | replaced by the value of each host entry. Useful for appending information, such 168 | as a port number, by specifying `"format": "{}:8125"` 169 | -------------------------------------------------------------------------------- /statsrelay-protobuf/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "statsrelay-protobuf" 3 | version = "3.1.0-dev" 4 | edition = "2018" 5 | 6 | [dependencies] 7 | tonic = "0.6" 8 | prost = "0.9" 9 | 10 | [build-dependencies] 11 | tonic-build = "0.6" 12 | -------------------------------------------------------------------------------- /statsrelay-protobuf/build.rs: -------------------------------------------------------------------------------- 1 | fn main() -> Result<(), Box<dyn std::error::Error>> { 2 | tonic_build::configure() 3 | .protoc_arg("--experimental_allow_proto3_optional") 4 | .compile( 5 | &["proto/ext/github.com/prometheus/prometheus/prompb/remote.proto"], 6 | &[ 7 | "proto/", 8 | "proto/ext/github.com/prometheus/prometheus/prompb", 9 | "proto/ext/github.com/gogo/protobuf/", 10 | ], 11 | )?; 12 | 13 | Ok(()) 14 | } 15 | -------------------------------------------------------------------------------- /statsrelay-protobuf/proto/ext/github.com/gogo/protobuf/gogoproto/gogo.proto: -------------------------------------------------------------------------------- 1 | // Protocol Buffers for Go with Gadgets 2 | // 3 | // Copyright (c) 2013, The GoGo Authors. All rights reserved. 4 | // http://github.com/gogo/protobuf 5 | // 6 | // Redistribution and use in source and binary forms, with or without 7 | // modification, are permitted provided that the following conditions are 8 | // met: 9 | // 10 | // * Redistributions of source code must retain the above copyright 11 | // notice, this list of conditions and the following disclaimer. 12 | // * Redistributions in binary form must reproduce the above 13 | // copyright notice, this list of conditions and the following disclaimer 14 | // in the documentation and/or other materials provided with the 15 | // distribution. 16 | // 17 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 29 | syntax = "proto2"; 30 | package gogoproto; 31 | 32 | import "google/protobuf/descriptor.proto"; 33 | 34 | option java_package = "com.google.protobuf"; 35 | option java_outer_classname = "GoGoProtos"; 36 | option go_package = "github.com/gogo/protobuf/gogoproto"; 37 | 38 | extend google.protobuf.EnumOptions { 39 | optional bool goproto_enum_prefix = 62001; 40 | optional bool goproto_enum_stringer = 62021; 41 | optional bool enum_stringer = 62022; 42 | optional string enum_customname = 62023; 43 | optional bool enumdecl = 62024; 44 | } 45 | 46 | extend google.protobuf.EnumValueOptions { 47 | optional string enumvalue_customname = 66001; 48 | } 49 | 50 | extend google.protobuf.FileOptions { 51 | optional bool goproto_getters_all = 63001; 52 | optional bool goproto_enum_prefix_all = 63002; 53 | optional bool goproto_stringer_all = 63003; 54 | optional bool verbose_equal_all = 63004; 55 | optional bool face_all = 63005; 56 | optional bool gostring_all = 63006; 57 | optional bool populate_all = 63007; 58 | optional bool stringer_all = 63008; 59 | optional bool onlyone_all = 63009; 60 | 61 | optional bool equal_all = 63013; 62 | optional bool description_all = 63014; 63 | optional bool testgen_all = 63015; 64 | optional bool benchgen_all = 63016; 65 | optional bool marshaler_all = 63017; 66 | optional bool unmarshaler_all = 63018; 67 | optional bool stable_marshaler_all = 63019; 68 | 69 | optional bool sizer_all = 63020; 70 | 71 | optional bool goproto_enum_stringer_all = 63021; 72 | optional bool enum_stringer_all = 63022; 73 | 74 | optional bool unsafe_marshaler_all = 63023; 75 | optional bool unsafe_unmarshaler_all = 63024; 76 | 77 | optional bool goproto_extensions_map_all = 63025; 78 | optional bool goproto_unrecognized_all = 63026; 79 | optional bool gogoproto_import = 63027; 80 | optional bool protosizer_all = 63028; 81 | optional bool compare_all = 63029; 82 | optional bool typedecl_all = 63030; 83 | optional bool enumdecl_all = 63031; 84 | 85 | optional bool goproto_registration = 63032; 86 | optional bool messagename_all = 63033; 87 | 88 | optional bool goproto_sizecache_all = 63034; 89 | optional bool goproto_unkeyed_all = 63035; 90 | } 91 | 92 | extend google.protobuf.MessageOptions { 93 | optional bool goproto_getters = 64001; 94 | optional bool goproto_stringer = 64003; 95 | optional bool verbose_equal = 64004; 96 | optional bool face = 64005; 97 | optional bool gostring = 64006; 98 | optional bool populate = 64007; 99 | optional bool stringer = 67008; 100 | optional bool onlyone = 64009; 101 | 102 | optional bool equal = 64013; 103 | optional bool description = 64014; 104 | optional bool testgen = 64015; 105 | optional bool benchgen = 64016; 106 | optional bool marshaler = 64017; 107 | optional bool unmarshaler = 64018; 108 | optional bool stable_marshaler = 64019; 109 | 110 | optional bool sizer = 64020; 111 | 112 | optional bool unsafe_marshaler = 64023; 113 | optional bool unsafe_unmarshaler = 64024; 114 | 115 | optional bool goproto_extensions_map = 64025; 116 | optional bool goproto_unrecognized = 64026; 117 | 118 | optional bool protosizer = 64028; 119 | optional bool compare = 64029; 120 | 121 | optional bool typedecl = 64030; 122 | 123 | optional bool messagename = 64033; 124 | 125 | optional bool goproto_sizecache = 64034; 126 | optional bool goproto_unkeyed = 64035; 127 | } 128 | 129 | extend google.protobuf.FieldOptions { 130 | optional bool nullable = 65001; 131 | optional bool embed = 65002; 132 | optional string customtype = 65003; 133 | optional string 
customname = 65004; 134 | optional string jsontag = 65005; 135 | optional string moretags = 65006; 136 | optional string casttype = 65007; 137 | optional string castkey = 65008; 138 | optional string castvalue = 65009; 139 | 140 | optional bool stdtime = 65010; 141 | optional bool stdduration = 65011; 142 | optional bool wktpointer = 65012; 143 | 144 | } -------------------------------------------------------------------------------- /statsrelay-protobuf/proto/ext/github.com/prometheus/prometheus/prompb/remote.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Prometheus Team 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | syntax = "proto3"; 15 | package prometheus; 16 | 17 | option go_package = "prompb"; 18 | 19 | import "types.proto"; 20 | import "gogoproto/gogo.proto"; 21 | 22 | message WriteRequest { 23 | repeated prometheus.TimeSeries timeseries = 1 [(gogoproto.nullable) = false]; 24 | // Cortex uses this field to determine the source of the write request. 25 | // We reserve it to avoid any compatibility issues. 26 | reserved 2; 27 | repeated prometheus.MetricMetadata metadata = 3 [(gogoproto.nullable) = false]; 28 | 29 | // Added as an extension, this field allows us to forward statsd PDUs that fail to parse. 30 | repeated string failthrough_statsd_lines = 17; 31 | } 32 | 33 | // ReadRequest represents a remote read request. 34 | message ReadRequest { 35 | repeated Query queries = 1; 36 | 37 | enum ResponseType { 38 | // Server will return a single ReadResponse message with matched series that includes list of raw samples. 39 | // It's recommended to use streamed response types instead. 40 | // 41 | // Response headers: 42 | // Content-Type: "application/x-protobuf" 43 | // Content-Encoding: "snappy" 44 | SAMPLES = 0; 45 | // Server will stream a delimited ChunkedReadResponse message that contains XOR encoded chunks for a single series. 46 | // Each message is following varint size and fixed size bigendian uint32 for CRC32 Castagnoli checksum. 47 | // 48 | // Response headers: 49 | // Content-Type: "application/x-streamed-protobuf; proto=prometheus.ChunkedReadResponse" 50 | // Content-Encoding: "" 51 | STREAMED_XOR_CHUNKS = 1; 52 | } 53 | 54 | // accepted_response_types allows negotiating the content type of the response. 55 | // 56 | // Response types are taken from the list in the FIFO order. If no response type in `accepted_response_types` is 57 | // implemented by server, error is returned. 58 | // For request that do not contain `accepted_response_types` field the SAMPLES response type will be used. 59 | repeated ResponseType accepted_response_types = 2; 60 | } 61 | 62 | // ReadResponse is a response when response_type equals SAMPLES. 63 | message ReadResponse { 64 | // In same order as the request's queries. 
65 | repeated QueryResult results = 1; 66 | } 67 | 68 | message Query { 69 | int64 start_timestamp_ms = 1; 70 | int64 end_timestamp_ms = 2; 71 | repeated prometheus.LabelMatcher matchers = 3; 72 | prometheus.ReadHints hints = 4; 73 | } 74 | 75 | message QueryResult { 76 | // Samples within a time series must be ordered by time. 77 | repeated prometheus.TimeSeries timeseries = 1; 78 | } 79 | 80 | // ChunkedReadResponse is a response when response_type equals STREAMED_XOR_CHUNKS. 81 | // We strictly stream full series after series, optionally split by time. This means that a single frame can contain 82 | // partition of the single series, but once a new series is started to be streamed it means that no more chunks will 83 | // be sent for previous one. Series are returned sorted in the same way TSDB block are internally. 84 | message ChunkedReadResponse { 85 | repeated prometheus.ChunkedSeries chunked_series = 1; 86 | 87 | // query_index represents an index of the query from ReadRequest.queries these chunks relates to. 88 | int64 query_index = 2; 89 | } -------------------------------------------------------------------------------- /statsrelay-protobuf/proto/ext/github.com/prometheus/prometheus/prompb/types.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Prometheus Team 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | syntax = "proto3"; 15 | package prometheus; 16 | 17 | option go_package = "prompb"; 18 | 19 | import "gogoproto/gogo.proto"; 20 | 21 | message MetricMetadata { 22 | enum MetricType { 23 | UNKNOWN = 0; 24 | COUNTER = 1; 25 | GAUGE = 2; 26 | HISTOGRAM = 3; 27 | GAUGEHISTOGRAM = 4; 28 | SUMMARY = 5; 29 | INFO = 6; 30 | STATESET = 7; 31 | 32 | // added as extensions 33 | DIRECTGAUGE = 100; 34 | DELTAGAUGE = 101; 35 | } 36 | 37 | // Represents the metric type, these match the set from Prometheus. 38 | // Refer to model/textparse/interface.go for details. 39 | MetricType type = 1; 40 | string metric_family_name = 2; 41 | string help = 4; 42 | string unit = 5; 43 | } 44 | 45 | message Sample { 46 | double value = 1; 47 | // timestamp is in ms format, see model/timestamp/timestamp.go for 48 | // conversion from time.Time to Prometheus timestamp. 49 | int64 timestamp = 2; 50 | 51 | // added as an extension, sample_rate is a ratio in the range (0,1]. 52 | double sample_rate = 100; 53 | } 54 | 55 | message Exemplar { 56 | // Optional, can be empty. 57 | repeated Label labels = 1 [(gogoproto.nullable) = false]; 58 | double value = 2; 59 | // timestamp is in ms format, see model/timestamp/timestamp.go for 60 | // conversion from time.Time to Prometheus timestamp. 61 | int64 timestamp = 3; 62 | } 63 | 64 | // TimeSeries represents samples and labels for a single time series. 65 | message TimeSeries { 66 | // For a timeseries to be valid, and for the samples and exemplars 67 | // to be ingested by the remote system properly, the labels field is required. 
68 | repeated Label labels = 1 [(gogoproto.nullable) = false]; 69 | repeated Sample samples = 2 [(gogoproto.nullable) = false]; 70 | repeated Exemplar exemplars = 3 [(gogoproto.nullable) = false]; 71 | } 72 | 73 | message Label { 74 | string name = 1; 75 | string value = 2; 76 | } 77 | 78 | message Labels { 79 | repeated Label labels = 1 [(gogoproto.nullable) = false]; 80 | } 81 | 82 | // Matcher specifies a rule, which can match or set of labels or not. 83 | message LabelMatcher { 84 | enum Type { 85 | EQ = 0; 86 | NEQ = 1; 87 | RE = 2; 88 | NRE = 3; 89 | } 90 | Type type = 1; 91 | string name = 2; 92 | string value = 3; 93 | } 94 | 95 | message ReadHints { 96 | int64 step_ms = 1; // Query step size in milliseconds. 97 | string func = 2; // String representation of surrounding function or aggregation. 98 | int64 start_ms = 3; // Start time in milliseconds. 99 | int64 end_ms = 4; // End time in milliseconds. 100 | repeated string grouping = 5; // List of label names used in aggregation. 101 | bool by = 6; // Indicate whether it is without or by. 102 | int64 range_ms = 7; // Range vector selector range in milliseconds. 103 | } 104 | 105 | // Chunk represents a TSDB chunk. 106 | // Time range [min, max] is inclusive. 107 | message Chunk { 108 | int64 min_time_ms = 1; 109 | int64 max_time_ms = 2; 110 | 111 | // We require this to match chunkenc.Encoding. 112 | enum Encoding { 113 | UNKNOWN = 0; 114 | XOR = 1; 115 | } 116 | Encoding type = 3; 117 | bytes data = 4; 118 | } 119 | 120 | // ChunkedSeries represents single, encoded time series. 121 | message ChunkedSeries { 122 | // Labels should be sorted. 123 | repeated Label labels = 1 [(gogoproto.nullable) = false]; 124 | // Chunks will be in start time order and may overlap. 125 | repeated Chunk chunks = 2 [(gogoproto.nullable) = false]; 126 | } -------------------------------------------------------------------------------- /statsrelay-protobuf/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod prometheus { 2 | tonic::include_proto!("prometheus"); 3 | } 4 | 5 | #[cfg(test)] 6 | mod tests { 7 | use crate::prometheus; 8 | 9 | #[test] 10 | fn compiles_protobufs() { 11 | // Just build some default protobuf objects from the .proto file to check 12 | // everything was compiled by build.rs. 
While its not a runtime error, 13 | // maybe reduces some hair pulling later 14 | let _write_request = prometheus::WriteRequest::default(); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /statsrelay/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "statsrelay" 3 | version = "3.1.0-dev" 4 | authors = ["Yann Ramin "] 5 | edition = "2018" 6 | description = "Swiss army knife for aggregation, filtering, relaying, sharding of statsd, prometheus, and other metrics systems" 7 | license = "MIT" 8 | keywords = ["statsd", "metrics", "aggregation"] 9 | categories = ["development-tools"] 10 | repository = "https://github.com/lyft/statsrelay" 11 | 12 | [[bin]] 13 | name = "statsrelay" 14 | path = "src/cmd/statsrelay.rs" 15 | 16 | [[bin]] 17 | name = "sr-loadgen" 18 | path = "src/cmd/loadgen.rs" 19 | 20 | [dependencies] 21 | murmur3 = "0.5" 22 | tokio = { version = "1", features = ["full", "parking_lot"] } 23 | tokio-stream = "0" 24 | futures = "0.3" 25 | hyper = { version = "0.14", features = ["server", "client", "runtime", "http2", "stream", "http1"] } 26 | structopt = "0.3" 27 | serde = { version = "1.0", features = ["derive"] } 28 | serde_json = "1.0" 29 | anyhow = "1.0" 30 | thiserror = "1.0" 31 | memchr = "2" 32 | stream-cancel = "0.8" 33 | bytes = "1" 34 | parking_lot = "0.11" 35 | regex = "1" 36 | chrono = "0.4" 37 | dashmap = "4" 38 | async-stream = "0.3" 39 | lexical = "5" 40 | prost = "0.9" 41 | reqwest = "0.11" 42 | smallvec = "1" 43 | snap = "1" 44 | statsrelay-protobuf = { path = "../statsrelay-protobuf" } 45 | async-trait = "0.1.56" 46 | backoff = { version = "0.4.0", features = ["tokio"] } 47 | 48 | # For discovery 49 | rusoto_core = "0.46" 50 | rusoto_credential = "0.46.0" 51 | rusoto_s3 = "0.46.0" 52 | rusoto_sts = "0.46.0" 53 | 54 | log = "0.4" 55 | env_logger = "0.8" 56 | 57 | # Internal stats 58 | prometheus = "0.11" 59 | 60 | # malloc 61 | jemallocator = "0.3.0" 62 | 63 | # Samplers 64 | byteorder = "1" 65 | hyperloglog = "1" 66 | ahash = "0.7" 67 | fastrand = "1" 68 | rand = { version = "0.8", features = ["small_rng"] } 69 | 70 | [[bench]] 71 | name = "statsd_benchmark" 72 | harness = false 73 | 74 | [dev-dependencies] 75 | criterion = { version = "0.3", features = ["html_reports"] } 76 | tempfile = "3.1" 77 | quickcheck = "1" 78 | quickcheck_async = "0.1.1" 79 | quickcheck_macros = "1" 80 | 81 | [build-dependencies] 82 | built = { version = "0.4", features = ["git2"] } 83 | -------------------------------------------------------------------------------- /statsrelay/benches/statsd_benchmark.rs: -------------------------------------------------------------------------------- 1 | use bytes::Bytes; 2 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 3 | use std::convert::TryInto; 4 | 5 | fn parse( 6 | line: &Bytes, 7 | ) -> Result { 8 | statsrelay::statsd_proto::Pdu::parse(line.clone()) 9 | } 10 | 11 | fn criterion_benchmark(c: &mut Criterion) { 12 | let by = Bytes::from_static( 13 | b"hello_world.worldworld_i_am_a_pumpkin:3|c|@1.0|#tags:tags,tags:tags,tags:tags,tags:tags", 14 | ); 15 | c.bench_function("statsd pdu parsing", |b| b.iter(|| parse(black_box(&by)))); 16 | c.bench_function("statsd pdu conversion", |b| { 17 | b.iter(|| { 18 | let _: statsrelay::statsd_proto::Owned = 19 | parse(black_box(&by)).unwrap().try_into().unwrap(); 20 | }) 21 | }); 22 | } 23 | 24 | criterion_group!(benches, criterion_benchmark); 25 | 
criterion_main!(benches); 26 | -------------------------------------------------------------------------------- /statsrelay/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path; 3 | 4 | fn main() { 5 | let src = env::var("CARGO_MANIFEST_DIR").unwrap(); 6 | let dst = path::Path::new(&env::var("OUT_DIR").unwrap()).join("built.rs"); 7 | let mut options = built::Options::default(); 8 | options.set_git(true); 9 | built::write_built_file_with_opts(&options, src.as_ref(), &dst) 10 | .expect("Failed to acquire build-time information"); 11 | } 12 | -------------------------------------------------------------------------------- /statsrelay/examples/processors-basic.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "admin": { 4 | "port": 9001 5 | }, 6 | "processors": { 7 | "tag": { 8 | "type": "tag_converter", 9 | "route": ["processor:cardinality"] 10 | }, 11 | "sampler": { 12 | "type": "sampler", 13 | "window": 5, 14 | "timer_reservoir_size": 2, 15 | "route": ["statsd:simple"] 16 | }, 17 | "cardinality": { 18 | "type": "cardinality", 19 | "route": ["processor:sampler"], 20 | "size_limit": 100, 21 | "rotate_after_seconds": 10, 22 | "buckets": 3 23 | } 24 | }, 25 | "statsd": { 26 | "servers": { 27 | "default": { 28 | "bind": "127.0.0.1:8129", 29 | "route": ["processor:tag"] 30 | } 31 | }, 32 | "backends": { 33 | "simple": { 34 | "type": "statsd", 35 | "shard_map": [ 36 | "127.0.0.1:8122" 37 | ] 38 | } 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /statsrelay/examples/source-example1.json: -------------------------------------------------------------------------------- 1 | { 2 | "hosts": ["127.0.0.1:9000", "127.0.0.1:9002"] 3 | } 4 | -------------------------------------------------------------------------------- /statsrelay/examples/tugboat-discovery.json: -------------------------------------------------------------------------------- 1 | { 2 | "statsd": { 3 | "servers": { 4 | "default": { 5 | "bind": "127.0.0.1:8129", 6 | "socket": "/tmp/statsrelay", 7 | "route": ["statsd:simple", "statsd:path_discovery"] 8 | } 9 | }, 10 | "backends": { 11 | "simple": { 12 | "shard_map": [ 13 | "127.0.0.1:8122" 14 | ] 15 | }, 16 | "path_discovery": { 17 | "shard_map_source": "path_discovery" 18 | } 19 | } 20 | }, 21 | "discovery": { 22 | "sources": { 23 | "path_discovery": { 24 | "type": "static_file", 25 | "path": "examples/source-example1.json", 26 | "interval": 5, 27 | "transforms": [ 28 | { 29 | "type": "repeat", 30 | "count": 2 31 | }, 32 | { 33 | "type": "format", 34 | "pattern": "{}:cheese" 35 | }, 36 | { 37 | "type": "format", 38 | "pattern": "{}:cheese2" 39 | } 40 | ] 41 | } 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /statsrelay/examples/tugboat-legacy-basic.json: -------------------------------------------------------------------------------- 1 | { 2 | "admin": { 3 | "port": 9001 4 | }, 5 | "statsd": { 6 | "servers": { 7 | "default": { 8 | "bind": "127.0.0.1:8129", 9 | "route": ["statsd:simple"] 10 | } 11 | }, 12 | "backends": { 13 | "simple": { 14 | "shard_map": [ 15 | "127.0.0.1:8122" 16 | ] 17 | } 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /statsrelay/src/admin.rs: -------------------------------------------------------------------------------- 1 | use log::info; 2 | 3 | use 
hyper::service::{make_service_fn, service_fn}; 4 | use hyper::{Body, Method, Request, Response, Server}; 5 | use tokio::runtime; 6 | 7 | use std::boxed::Box; 8 | use std::convert::Infallible; 9 | 10 | use crate::stats::Collector; 11 | 12 | #[derive(Clone)] 13 | struct AdminState { 14 | collector: Collector, 15 | } 16 | 17 | async fn metric_response( 18 | state: AdminState, 19 | _req: Request, 20 | ) -> Result, Infallible> { 21 | let buffer = state.collector.prometheus_output().unwrap(); 22 | Ok(Response::builder() 23 | .header(hyper::header::CONTENT_TYPE, prometheus::TEXT_FORMAT) 24 | .body(Body::from(buffer)) 25 | .unwrap()) 26 | } 27 | 28 | async fn request_handler( 29 | state: AdminState, 30 | req: Request, 31 | ) -> Result, Infallible> { 32 | match (req.method(), req.uri().path()) { 33 | (&Method::GET, "/") => Ok(Response::builder() 34 | .body(Body::from("statsrelay admin server")) 35 | .unwrap()), 36 | (&Method::GET, "/healthcheck") => Ok(Response::builder().body(Body::from("OK")).unwrap()), 37 | (&Method::GET, "/metrics") => metric_response(state, req).await, 38 | _ => Ok(Response::builder() 39 | .status(404) 40 | .body(Body::from("not found")) 41 | .unwrap()), 42 | } 43 | } 44 | 45 | async fn hyper_server(port: u16, collector: Collector) -> Result<(), Box> { 46 | let addr = format!("[::]:{}", port).parse().unwrap(); 47 | let admin_state = AdminState { collector }; 48 | let make_svc = make_service_fn(move |_conn| { 49 | let service_capture = admin_state.clone(); 50 | async { 51 | Ok::<_, Infallible>(service_fn(move |req| { 52 | request_handler(service_capture.clone(), req) 53 | })) 54 | } 55 | }); 56 | info!("admin server starting on port {}", port); 57 | Server::bind(&addr).serve(make_svc).await?; 58 | Ok(()) 59 | } 60 | 61 | pub fn spawn_admin_server(port: u16, collector: Collector) { 62 | let rt = runtime::Builder::new_current_thread() 63 | .enable_all() 64 | .build() 65 | .unwrap(); 66 | std::thread::spawn(move || rt.block_on(hyper_server(port, collector)).unwrap()); 67 | } 68 | -------------------------------------------------------------------------------- /statsrelay/src/backend.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::sync::atomic::AtomicU64; 3 | use std::sync::Arc; 4 | 5 | use regex::bytes::RegexSet; 6 | use tokio::sync::mpsc; 7 | 8 | use crate::backend_client::prom_client::PromClient; 9 | use crate::backend_client::statsd_client::StatsdClient; 10 | use crate::backend_client::BackendClient; 11 | use crate::config; 12 | use crate::config::BackendClientConfig; 13 | use crate::discovery; 14 | use crate::shard::{statsrelay_compat_hash, Ring}; 15 | use crate::stats; 16 | use crate::statsd_proto::Event; 17 | use crate::statsd_proto::{self, Pdu}; 18 | 19 | use log::warn; 20 | 21 | type ArcBackendClient = 22 | Arc> + Send + Sync>; 23 | 24 | pub struct Backend { 25 | conf: config::BackendConfig, 26 | ring: Ring, 27 | input_filter: Option, 28 | warning_log: AtomicU64, 29 | backend_sends: stats::Counter, 30 | backend_fails: stats::Counter, 31 | } 32 | 33 | impl Backend { 34 | pub fn new( 35 | stats: stats::Scope, 36 | conf: &config::BackendConfig, 37 | client_ref: Option<&Backend>, 38 | discovery_update: Option<&discovery::Update>, 39 | shutdown_send: Option>, 40 | ) -> anyhow::Result { 41 | let input_filter = conf.generate_input_filter(); 42 | let mut ring: Ring = Ring::new(); 43 | 44 | // Use the same backend for the same endpoint address, caching the lookup locally 45 | let mut memoize: 
HashMap = 46 | client_ref.map_or_else(HashMap::new, |b| b.clients()); 47 | 48 | let use_endpoints = discovery_update 49 | .map(|u| u.sources()) 50 | .unwrap_or(&conf.shard_map); 51 | for endpoint in use_endpoints { 52 | if endpoint.is_empty() { 53 | continue; 54 | } 55 | if let Some(client) = memoize.get(endpoint) { 56 | ring.push(client.clone()) 57 | } else { 58 | let client: ArcBackendClient = match &conf.client_config { 59 | BackendClientConfig::Statsd(client_config) => Arc::new(StatsdClient::new( 60 | stats.scope("statsd_client"), 61 | endpoint.as_str(), 62 | shutdown_send.clone(), 63 | client_config.clone(), 64 | )), 65 | BackendClientConfig::PromRemoteWrite(client_config) => { 66 | Arc::new(PromClient::new( 67 | stats.scope("prom_client"), 68 | endpoint, 69 | client_config.clone(), 70 | shutdown_send.clone(), 71 | )) 72 | } 73 | }; 74 | memoize.insert(endpoint.clone(), client.clone()); 75 | ring.push(client); 76 | } 77 | } 78 | 79 | let backend = Backend { 80 | conf: conf.clone(), 81 | ring, 82 | input_filter, 83 | warning_log: AtomicU64::new(0), 84 | backend_fails: stats.counter("backend_fails").unwrap(), 85 | backend_sends: stats.counter("backend_sends").unwrap(), 86 | }; 87 | 88 | Ok(backend) 89 | } 90 | 91 | // Capture the old ring contents into a memoization map by endpoint, 92 | // letting us re-use any old client connections and buffers. Note we 93 | // won't start tearing down connections until the memoization buffer and 94 | // old ring are both dropped. 95 | fn clients(&self) -> HashMap { 96 | let mut memoize: HashMap = HashMap::new(); 97 | for i in 0..self.ring.len() { 98 | let client = self.ring.pick_from(i as u32); 99 | memoize.insert(String::from(client.endpoint()), client.clone()); 100 | } 101 | memoize 102 | } 103 | 104 | pub fn provide_statsd(&self, input: &Event) { 105 | let pdu: statsd_proto::Pdu = input.into(); 106 | if !self 107 | .input_filter 108 | .as_ref() 109 | .map_or(true, |inf| inf.is_match(pdu.name())) 110 | { 111 | return; 112 | } 113 | 114 | let ring_read = &self.ring; 115 | let code = match ring_read.len() { 116 | 0 => return, // In case of nothing to send, do nothing 117 | 1 => 1_u32, 118 | _ => statsrelay_compat_hash(&pdu), 119 | }; 120 | let client = ring_read.pick_from(code); 121 | 122 | // Assign prefix and/or suffix 123 | let pdu_clone = if self.conf.prefix.is_some() || self.conf.suffix.is_some() { 124 | pdu.with_prefix_suffix( 125 | self.conf 126 | .prefix 127 | .as_ref() 128 | .map(|p| p.as_bytes()) 129 | .unwrap_or_default(), 130 | self.conf 131 | .suffix 132 | .as_ref() 133 | .map(|s| s.as_bytes()) 134 | .unwrap_or_default(), 135 | ) 136 | } else { 137 | pdu 138 | }; 139 | match client.try_send(pdu_clone) { 140 | Err(_e) => { 141 | self.backend_fails.inc(); 142 | let count = self 143 | .warning_log 144 | .fetch_add(1, std::sync::atomic::Ordering::Relaxed); 145 | if count % 1000 == 0 { 146 | warn!( 147 | "error pushing to queue full (endpoint {}, total failures {})", 148 | client.endpoint(), 149 | count 150 | ); 151 | } 152 | } 153 | Ok(_) => { 154 | self.backend_sends.inc(); 155 | } 156 | } 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /statsrelay/src/backend_client/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::statsd_proto::Pdu; 2 | 3 | pub mod prom_client; 4 | pub mod statsd_client; 5 | 6 | pub trait BackendClient { 7 | type Error; 8 | 9 | fn try_send(&self, pdu: Pdu) -> Result<(), Self::Error>; 10 | 11 | fn endpoint(&self) -> 
&str; 12 | } 13 | -------------------------------------------------------------------------------- /statsrelay/src/backend_client/prom_client.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryFrom; 2 | use std::sync::{Arc, Mutex}; 3 | use std::time::Duration; 4 | 5 | use async_trait::async_trait; 6 | use backoff::future::retry_notify; 7 | use backoff::ExponentialBackoff; 8 | use log::{error, info, warn}; 9 | use prost::Message; 10 | use reqwest::Client; 11 | use statsrelay_protobuf::prometheus::WriteRequest; 12 | use tokio::select; 13 | use tokio::sync::{mpsc, watch, Semaphore}; 14 | use tokio::time::sleep; 15 | 16 | use crate::config::PromBackendClientConfig; 17 | use crate::stats::{self, Counter, Gauge, Scope}; 18 | use crate::statsd_proto::{Owned, Pdu}; 19 | 20 | use super::BackendClient; 21 | 22 | #[derive(Clone, Debug)] 23 | struct Stats { 24 | bytes_sent_total: Counter, 25 | requests_compression_fail: Counter, 26 | requests_fail: Counter, 27 | requests_in_flight: Gauge, 28 | requests_retry: Counter, 29 | requests_total: Counter, 30 | samples_parse_fail: Counter, 31 | samples_send_fail: Counter, 32 | samples_total: Counter, 33 | } 34 | 35 | impl Stats { 36 | fn new(scope: Scope) -> Self { 37 | Self { 38 | bytes_sent_total: scope.counter("bytes_sent_total").unwrap(), 39 | requests_compression_fail: scope.counter("requests_compression_fail").unwrap(), 40 | requests_fail: scope.counter("requests_fail").unwrap(), 41 | requests_in_flight: scope.gauge("requests_in_flight").unwrap(), 42 | requests_retry: scope.counter("requests_retry").unwrap(), 43 | requests_total: scope.counter("requests_total").unwrap(), 44 | samples_parse_fail: scope.counter("samples_parse_fail").unwrap(), 45 | samples_send_fail: scope.counter("samples_send_fail").unwrap(), 46 | samples_total: scope.counter("samples_total").unwrap(), 47 | } 48 | } 49 | } 50 | 51 | pub struct PromClient { 52 | endpoint: String, 53 | _inner: Arc>, 54 | sender: mpsc::Sender, 55 | } 56 | 57 | impl PromClient { 58 | /// Constructs a new [PromClient]. 59 | /// 60 | /// This method panics if a [reqwest::Client] cannot be initialized. 
61 | pub fn new( 62 | scope: stats::Scope, 63 | endpoint: &str, 64 | config: PromBackendClientConfig, 65 | shutdown_send: Option>, 66 | ) -> Self { 67 | let inner = Client::builder() 68 | .timeout(Duration::from_millis(config.request_timeout_ms)) 69 | .build() 70 | .expect("failed to build PromClient"); 71 | let client = ReqwestPromWriteRequestClient { 72 | inner, 73 | endpoint: String::from(endpoint), 74 | }; 75 | Self::new_with_client(scope, client, endpoint, config, shutdown_send) 76 | } 77 | 78 | fn new_with_client( 79 | scope: stats::Scope, 80 | client: T, 81 | endpoint: &str, 82 | config: PromBackendClientConfig, 83 | shutdown_send: Option>, 84 | ) -> Self { 85 | let client = Arc::new(client); 86 | let stats = Stats::new(scope); 87 | let (sender, pdu_rx) = mpsc::channel::(config.max_queue); 88 | let (abort_retries_tx, abort_retries_rx) = watch::channel(false); 89 | tokio::spawn(client_task( 90 | stats, 91 | client, 92 | pdu_rx, 93 | abort_retries_rx, 94 | config, 95 | shutdown_send, 96 | )); 97 | 98 | let inner = Arc::new(Mutex::new(PromClientInner { abort_retries_tx })); 99 | 100 | Self { 101 | endpoint: String::from(endpoint), 102 | _inner: inner, 103 | sender, 104 | } 105 | } 106 | } 107 | 108 | impl BackendClient for PromClient { 109 | type Error = mpsc::error::TrySendError; 110 | 111 | fn try_send(&self, pdu: Pdu) -> Result<(), Self::Error> { 112 | self.sender.try_send(pdu) 113 | } 114 | 115 | fn endpoint(&self) -> &str { 116 | self.endpoint.as_str() 117 | } 118 | } 119 | 120 | struct PromClientInner { 121 | abort_retries_tx: watch::Sender, 122 | } 123 | 124 | impl Drop for PromClientInner { 125 | fn drop(&mut self) { 126 | let _ = self.abort_retries_tx.send(true); 127 | } 128 | } 129 | 130 | /// A thin client wrapper used for mocking in tests 131 | #[async_trait] 132 | trait PromWriteRequestClient { 133 | async fn send_write_request( 134 | &self, 135 | compressed_write_request: bytes::Bytes, 136 | ) -> Result; 137 | } 138 | 139 | #[derive(Clone, Debug)] 140 | struct ReqwestPromWriteRequestClient { 141 | inner: Client, 142 | endpoint: String, 143 | } 144 | 145 | #[async_trait] 146 | impl PromWriteRequestClient for ReqwestPromWriteRequestClient { 147 | async fn send_write_request( 148 | &self, 149 | compressed_write_request: bytes::Bytes, 150 | ) -> Result { 151 | self.inner 152 | .post(self.endpoint.clone()) 153 | .header("Content-Type", "application/x-protobuf") 154 | .header("Content-Encoding", "snappy") 155 | .header("X-Prometheus-Remote-Write-Version", "0.1.0") 156 | .body(compressed_write_request) 157 | .send() 158 | .await 159 | } 160 | } 161 | 162 | async fn client_task( 163 | stats: Stats, 164 | client: Arc, 165 | mut pdu_rx: mpsc::Receiver, 166 | abort_retries_rx: watch::Receiver, 167 | config: PromBackendClientConfig, 168 | shutdown_send: Option>, 169 | ) { 170 | let semaphore = Arc::new(Semaphore::new(config.max_in_flight)); 171 | // The receiving end of the flush channel blocks until all spawned tasks have terminated. 
172 | let (flush_tx, mut flush_rx) = mpsc::channel::<()>(1); 173 | 174 | loop { 175 | let (batch, shutdown) = construct_batch( 176 | &mut pdu_rx, 177 | Duration::from_millis(config.batch_creation_timeout_ms), 178 | config.max_batch_size, 179 | ) 180 | .await; 181 | 182 | if !batch.is_empty() { 183 | let permit = semaphore.clone().acquire_owned().await.unwrap(); 184 | stats 185 | .requests_in_flight 186 | .set((config.max_in_flight - semaphore.available_permits()) as f64); 187 | 188 | let stats = stats.clone(); 189 | let client = client.clone(); 190 | let flush_tx = flush_tx.clone(); 191 | let abort_retries_rx = abort_retries_rx.clone(); 192 | 193 | tokio::spawn(async move { 194 | let samples_in_batch = batch.len() as f64; 195 | stats.requests_total.inc(); 196 | stats.samples_total.inc_by(samples_in_batch); 197 | 198 | let mut owned_batch: Vec = Vec::with_capacity(batch.len()); 199 | let mut failthrough_statsd_lines: Vec = Vec::new(); 200 | for pdu in batch { 201 | match Owned::try_from(&pdu) { 202 | Ok(owned) => owned_batch.push(owned), 203 | Err(e) => { 204 | info!("unparsable line ({:?}): {:?}", pdu.as_bytes(), e); 205 | stats.samples_parse_fail.inc(); 206 | if let Ok(line) = String::from_utf8(pdu.as_bytes().to_vec()) { 207 | failthrough_statsd_lines.push(line); 208 | } 209 | } 210 | } 211 | } 212 | 213 | let mut write_request = Owned::to_write_request(&owned_batch); 214 | write_request 215 | .failthrough_statsd_lines 216 | .append(&mut failthrough_statsd_lines); 217 | // N.B. According to the snap documentation, this compession should never fail for 218 | // our use case. 219 | let compressed_write_request = match compress_write_request(write_request) { 220 | Ok(c) => c, 221 | Err(e) => { 222 | stats.requests_compression_fail.inc(); 223 | error!( 224 | "failed to compress prometheus WriteRequest, dropping samples: {:?}", 225 | e 226 | ); 227 | return; 228 | } 229 | }; 230 | stats 231 | .bytes_sent_total 232 | .inc_by(compressed_write_request.len() as f64); 233 | 234 | let res = retry_notify( 235 | ExponentialBackoff::default(), 236 | || async { 237 | client 238 | .send_write_request(compressed_write_request.clone()) 239 | .await 240 | .map_err(|e| { 241 | if *abort_retries_rx.borrow() { 242 | return backoff::Error::permanent(e); 243 | } 244 | match should_retry(&e) { 245 | true => backoff::Error::transient(e), 246 | false => backoff::Error::permanent(e), 247 | } 248 | }) 249 | }, 250 | |_, _| { 251 | stats.requests_retry.inc(); 252 | }, 253 | ) 254 | .await; 255 | 256 | if let Err(e) = res { 257 | stats.requests_fail.inc(); 258 | stats.samples_send_fail.inc_by(samples_in_batch); 259 | warn!( 260 | "prometheus remote write request failed, dropping samples: {:?}", 261 | e 262 | ); 263 | } 264 | 265 | // Force move permit and flush_tx into this closure. 266 | drop(permit); 267 | drop(flush_tx); 268 | }); 269 | } 270 | 271 | if shutdown { 272 | // Wait for all spawned tasks to terminate. 273 | drop(flush_tx); 274 | let _ = flush_rx.recv().await; 275 | drop(shutdown_send); 276 | return; 277 | } 278 | } 279 | } 280 | 281 | async fn construct_batch( 282 | pdu_rx: &mut mpsc::Receiver, 283 | timeout: Duration, 284 | max_batch_size: usize, 285 | ) -> (Vec, bool) { 286 | let mut batch = Vec::::with_capacity(max_batch_size); 287 | let sleep = sleep(timeout); 288 | tokio::pin!(sleep); 289 | loop { 290 | select! 
{ 291 | pdu = pdu_rx.recv() => { 292 | if let Some(pdu) = pdu { 293 | batch.push(pdu); 294 | if batch.len() >= max_batch_size { 295 | return (batch, false); 296 | } 297 | } else { 298 | return (batch, true) 299 | } 300 | } 301 | _ = &mut sleep => return (batch, false) 302 | } 303 | } 304 | } 305 | 306 | fn compress_write_request(write_request: WriteRequest) -> Result { 307 | let proto_encoded = write_request.encode_to_vec(); 308 | let max_size = snap::raw::max_compress_len(proto_encoded.len()); 309 | let mut buf = bytes::BytesMut::new(); 310 | buf.resize(max_size, 0x0); 311 | // This compression shouldn't fail given our use case. An error is returned if 312 | // the total number of bytes to compress exceeds 2^32 - 1, but we limit the size 313 | // of the request. 314 | let acutal_len = snap::raw::Encoder::new().compress(&proto_encoded, &mut buf)?; 315 | buf.truncate(acutal_len); 316 | Ok(buf.freeze()) 317 | } 318 | 319 | fn should_retry(e: &reqwest::Error) -> bool { 320 | match e.status() { 321 | Some(s) => s.is_server_error(), 322 | None => false, 323 | } 324 | } 325 | 326 | #[cfg(test)] 327 | mod tests { 328 | use std::{cmp, io::Cursor}; 329 | 330 | use bytes::Bytes; 331 | use hyper::http::response::Builder; 332 | 333 | use crate::{stats::Collector, statsd_proto::test::compare_owned_pdu_vecs}; 334 | 335 | use super::*; 336 | 337 | pub mod arbitraries { 338 | use quickcheck::Arbitrary; 339 | 340 | use crate::config::PromBackendClientConfig; 341 | 342 | impl Arbitrary for PromBackendClientConfig { 343 | fn arbitrary(g: &mut quickcheck::Gen) -> Self { 344 | let max_batch_size = u8::arbitrary(g) as usize + 1; 345 | 346 | PromBackendClientConfig { 347 | max_queue: 1000, 348 | request_timeout_ms: 1000, 349 | batch_creation_timeout_ms: 1000, 350 | max_batch_size, 351 | max_in_flight: 1, 352 | } 353 | } 354 | } 355 | } 356 | 357 | #[derive(Clone, Default)] 358 | struct MockPromWriteRequestClient { 359 | status: hyper::StatusCode, 360 | requests: Arc>>, 361 | } 362 | 363 | impl MockPromWriteRequestClient { 364 | fn new(status: hyper::StatusCode) -> Self { 365 | MockPromWriteRequestClient { 366 | status, 367 | requests: Arc::default(), 368 | } 369 | } 370 | } 371 | 372 | #[async_trait] 373 | impl PromWriteRequestClient for MockPromWriteRequestClient { 374 | async fn send_write_request( 375 | &self, 376 | compressed_write_request: bytes::Bytes, 377 | ) -> Result { 378 | self.requests.lock().unwrap().push(compressed_write_request); 379 | let r: reqwest::Response = Builder::new() 380 | .status(self.status) 381 | .body("{}") 382 | .unwrap() 383 | .into(); 384 | r.error_for_status() 385 | } 386 | } 387 | 388 | fn decode_response(compressed_write_request: &bytes::Bytes) -> anyhow::Result { 389 | let decompressed = snap::raw::Decoder::new().decompress_vec(compressed_write_request)?; 390 | let r = WriteRequest::decode(Cursor::new(decompressed))?; 391 | Ok(r) 392 | } 393 | 394 | #[quickcheck_async::tokio] 395 | async fn prom_client_no_failures( 396 | config: PromBackendClientConfig, 397 | input: Vec, 398 | ) -> anyhow::Result<()> { 399 | let mock_client = MockPromWriteRequestClient::new(hyper::StatusCode::OK); 400 | let scope = Collector::default().scope("stats"); 401 | let endpoint = "endpoint"; 402 | let (shutdown_send, mut shutdown_recv) = mpsc::channel::<()>(1); 403 | 404 | let prom_client = PromClient::new_with_client( 405 | scope, 406 | mock_client.clone(), 407 | endpoint, 408 | config.clone(), 409 | Some(shutdown_send), 410 | ); 411 | 412 | let pdus: Vec = input.iter().map(Pdu::from).collect(); 413 
| for pdu in pdus { 414 | prom_client.try_send(pdu)?; 415 | } 416 | 417 | drop(prom_client); 418 | shutdown_recv.recv().await; 419 | 420 | let requests = mock_client.requests.lock().unwrap(); 421 | 422 | if requests.len() != (input.len() as f64 / config.max_batch_size as f64).ceil() as usize { 423 | return Err(anyhow::anyhow!( 424 | "unexpected number of requests: {:?}", 425 | requests 426 | )); 427 | } 428 | 429 | for (i, request) in requests.iter().enumerate() { 430 | let write_request = decode_response(request)?; 431 | let actual = Owned::try_from_write_request(&write_request)?; 432 | 433 | let start_idx = i * config.max_batch_size; 434 | let end_idx = cmp::min(start_idx + config.max_batch_size, input.len()); 435 | 436 | let expected = &input[start_idx..end_idx]; 437 | 438 | compare_owned_pdu_vecs(expected, &actual)?; 439 | } 440 | 441 | Ok(()) 442 | } 443 | 444 | #[tokio::test] 445 | async fn prom_client_abort_retries() { 446 | let config: PromBackendClientConfig = PromBackendClientConfig { 447 | max_queue: 10000, 448 | request_timeout_ms: 10000, 449 | batch_creation_timeout_ms: 500, 450 | max_batch_size: 5, 451 | max_in_flight: 1, 452 | }; 453 | let mock_client = MockPromWriteRequestClient::new(hyper::StatusCode::INTERNAL_SERVER_ERROR); 454 | let scope = Collector::default().scope("stats"); 455 | let endpoint = "endpoint"; 456 | let (shutdown_send, mut shutdown_recv) = mpsc::channel::<()>(1); 457 | 458 | let prom_client = PromClient::new_with_client( 459 | scope, 460 | mock_client.clone(), 461 | endpoint, 462 | config.clone(), 463 | Some(shutdown_send), 464 | ); 465 | 466 | let pdu = Pdu::parse(Bytes::from_static(b"foo.bar:3|c|@1.0|#tags")).unwrap(); 467 | 468 | prom_client.try_send(pdu).expect("failed to send pdu"); 469 | 470 | drop(prom_client); 471 | 472 | let sleep = sleep(Duration::from_secs(1)); 473 | tokio::pin!(sleep); 474 | tokio::select! { 475 | // shutdown_recv.recv() should return almost immediately if we successfully abort retries. 
476 | _ = shutdown_recv.recv() => {} 477 | _ = &mut sleep => { 478 | panic!("client did not shutdown") 479 | } 480 | } 481 | } 482 | } 483 | -------------------------------------------------------------------------------- /statsrelay/src/backend_client/statsd_client.rs: -------------------------------------------------------------------------------- 1 | use bytes::{BufMut, Bytes, BytesMut}; 2 | use memchr::memchr; 3 | use stream_cancel::{Trigger, Tripwire}; 4 | use tokio::io::AsyncWriteExt; 5 | use tokio::net::TcpStream; 6 | use tokio::select; 7 | use tokio::sync::mpsc; 8 | use tokio::time::{sleep, timeout}; 9 | 10 | use std::sync::Arc; 11 | use std::sync::Mutex; 12 | use std::time::Duration; 13 | 14 | use crate::stats; 15 | use crate::statsd_proto::Pdu; 16 | 17 | use crate::config::StatsdBackendClientConfig; 18 | use log::{debug, info, warn}; 19 | use tokio::sync::mpsc::error::TryRecvError; 20 | use tokio::time::error::Elapsed; 21 | 22 | use super::BackendClient; 23 | 24 | pub struct StatsdClient { 25 | sender: mpsc::Sender, 26 | endpoint: String, 27 | inner: Arc>, 28 | } 29 | 30 | struct StatsdClientInner { 31 | _sender: mpsc::Sender, 32 | _trig: Trigger, 33 | } 34 | 35 | const RECONNECT_DELAY: Duration = Duration::from_secs(5); 36 | const CONNECT_TIMEOUT: Duration = Duration::from_secs(15); 37 | const BUFFER_FILL_WAIT: Duration = Duration::from_millis(50); 38 | const TCP_IDLE_DISCONNECT: Duration = Duration::from_secs(60); 39 | 40 | // SEND_THRESHOLD should correspond with the statsd_server.READ_BUFFER = 8KiB 41 | // We set the threshold around 1 metric worth of bytes (256) lower because 42 | // the receiver waits to exceed the threshold before sending rather than 43 | // cutting a new buffer for the metric that exceeds the threshold 44 | const SEND_THRESHOLD: usize = 8192 - 256; 45 | const INITIAL_BUF_CAPACITY: usize = SEND_THRESHOLD + 1024; 46 | 47 | impl StatsdClient { 48 | pub fn new( 49 | stats: stats::Scope, 50 | endpoint: &str, 51 | shutdown_send: Option>, 52 | config: StatsdBackendClientConfig, 53 | ) -> Self { 54 | // Currently, we need this tripwire to abort connection looping. This can probably be refactored 55 | let (trig, trip) = Tripwire::new(); 56 | let (sender_chan, recv_chan) = mpsc::channel::(config.max_queue); 57 | 58 | let eps = String::from(endpoint); 59 | let (ticker_sender, ticker_recv) = mpsc::channel::(1); 60 | tokio::spawn(ticker(eps.clone(), ticker_sender)); 61 | tokio::spawn(client_task( 62 | stats, 63 | eps.clone(), 64 | trip, 65 | recv_chan, 66 | ticker_recv, 67 | shutdown_send, 68 | config.trim_on_disconnect, 69 | )); 70 | let inner = StatsdClientInner { 71 | _sender: sender_chan.clone(), 72 | _trig: trig, 73 | }; 74 | 75 | StatsdClient { 76 | endpoint: eps, 77 | inner: Arc::new(Mutex::new(inner)), 78 | sender: sender_chan, 79 | } 80 | } 81 | } 82 | 83 | impl BackendClient for StatsdClient { 84 | type Error = mpsc::error::TrySendError; 85 | 86 | fn try_send(&self, pdu: Pdu) -> Result<(), Self::Error> { 87 | self.sender.try_send(pdu) 88 | } 89 | 90 | fn endpoint(&self) -> &str { 91 | self.endpoint.as_str() 92 | } 93 | } 94 | 95 | impl Clone for StatsdClient { 96 | fn clone(&self) -> Self { 97 | StatsdClient { 98 | endpoint: self.endpoint.clone(), 99 | inner: self.inner.clone(), 100 | sender: self.sender.clone(), 101 | } 102 | } 103 | } 104 | 105 | /// Repeatedly try to form a connection to and endpoint with backoff. If the 106 | /// tripwire is set, this function will then abort and return none. 
107 | async fn form_connection( 108 | stats: stats::Scope, 109 | endpoint: &str, 110 | mut connect_tripwire: Tripwire, 111 | ) -> Option { 112 | let connections_made = stats.counter("connections_made").unwrap(); 113 | let connections_failed = stats.counter("connections_failed").unwrap(); 114 | loop { 115 | let connect_attempt = timeout(CONNECT_TIMEOUT, TcpStream::connect(endpoint)); 116 | 117 | let stream = match select!( 118 | connect = connect_attempt => connect, 119 | _ = (&mut connect_tripwire) => { 120 | debug!("aborting connection attempts to {:?}", endpoint); 121 | return None; 122 | }, 123 | ) { 124 | Err(_e) => { 125 | warn!("connect timeout to {:?}", endpoint); 126 | connections_failed.inc(); 127 | tokio::time::sleep(RECONNECT_DELAY).await; 128 | continue; 129 | } 130 | Ok(Err(e)) => { 131 | warn!("connect error to {:?} error {:?}", endpoint, e); 132 | connections_failed.inc(); 133 | tokio::time::sleep(RECONNECT_DELAY).await; 134 | continue; 135 | } 136 | Ok(Ok(s)) => { 137 | info!("statsd client connect {:?}", endpoint); 138 | s 139 | } 140 | }; 141 | connections_made.inc(); 142 | return Some(stream); 143 | } 144 | } 145 | 146 | // Since statsd has no notion of when a message is actually received, we have to 147 | // assume a buffer write is incomplete and just drop it here. This simply 148 | // advances to the next newline in the buffer if found. 149 | fn trim_to_next_newline(buf: &mut Bytes) { 150 | match memchr(b'\n', buf) { 151 | None => (), 152 | Some(pos) => { 153 | let _b = buf.split_to(pos + 1); 154 | } 155 | } 156 | } 157 | 158 | async fn client_sender( 159 | stats: stats::Scope, 160 | endpoint: String, 161 | connect_tripwire: Tripwire, 162 | mut recv: mpsc::Receiver, 163 | trim_on_disconnect: bool, 164 | ) { 165 | let bytes_sent = stats.counter("bytes_sent").unwrap(); 166 | let connections_aborted = stats.counter("connections_aborted").unwrap(); 167 | let send_error = stats.counter("send_error").unwrap(); 168 | 169 | let first_connect_tripwire = connect_tripwire.clone(); 170 | let mut lazy_connect: Option = 171 | form_connection(stats.clone(), endpoint.as_str(), first_connect_tripwire).await; 172 | 173 | loop { 174 | // We have a connection but this could get stale before we receive data. 175 | // First check if there's data ready and if not, give time for it to appear 176 | // before disconnecting the current connection so the next recv() forces a reconnect. 177 | // This shouldn't be necessary but there's an issue with some of the buffer 178 | // going missing when we try to write to a stale connection and get a broken pipe. 179 | // Ultimately we should use a tcp keepalive here. 180 | // See: https://users.rust-lang.org/t/tcpstream-write-silently-loses-one-message/38206 181 | let mut buf = match recv.try_recv() { 182 | Ok(b) => b, 183 | Err(TryRecvError::Empty) => { 184 | // We could add a branch to recv() without a timeout here 185 | // if the connection is already closed but the code gets messy. 186 | // This will just recreate the timer every ~10s instead. 
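// The idle window used below is TCP_IDLE_DISCONNECT; when it elapses with no
// data we drop the current connection (lazy_connect = None) and loop, so the
// next buffer that arrives lazily re-forms a connection.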
187 | match timeout(TCP_IDLE_DISCONNECT, recv.recv()).await { 188 | Ok(Some(b)) => b, 189 | Ok(None) => { 190 | debug!("recv() is None - sender task {} exiting", endpoint); 191 | return; 192 | } 193 | Err(_) => { 194 | lazy_connect = None; 195 | continue; 196 | } 197 | } 198 | } 199 | Err(TryRecvError::Disconnected) => { 200 | debug!("recv disconnect - sender task {} exiting", endpoint); 201 | return; 202 | } 203 | }; 204 | 205 | loop { 206 | // keep flushing to the network until buffer is empty 207 | if buf.is_empty() { 208 | break; 209 | } 210 | let connect = match lazy_connect.as_mut() { 211 | None => { 212 | let reconnect_tripwire = connect_tripwire.clone(); 213 | lazy_connect = 214 | form_connection(stats.clone(), endpoint.as_str(), reconnect_tripwire).await; 215 | if lazy_connect.is_none() { 216 | // Early check to see if the tripwire is set and bail 217 | debug!("sender task {} exiting", endpoint); 218 | return; 219 | } 220 | lazy_connect.as_mut().unwrap() 221 | } 222 | Some(c) => c, 223 | }; 224 | // Write the buffer until success 225 | let result = connect.write_buf(&mut buf).await; 226 | match result { 227 | Ok(0) if !buf.is_empty() => { 228 | // Write 0 error, abort the connection and try again 229 | lazy_connect = None; 230 | 231 | if trim_on_disconnect { 232 | trim_to_next_newline(&mut buf); 233 | } 234 | connections_aborted.inc(); 235 | continue; 236 | } 237 | Ok(bytes) if buf.is_empty() => { 238 | bytes_sent.inc_by(bytes as f64); 239 | drop(buf); 240 | break; 241 | } 242 | Ok(bytes) => { 243 | bytes_sent.inc_by(bytes as f64); 244 | continue; 245 | } 246 | Err(e) => { 247 | warn!( 248 | "write error {} - {:?}, reforming a connection with this buffer", 249 | endpoint, e 250 | ); 251 | if trim_on_disconnect { 252 | trim_to_next_newline(&mut buf); 253 | } 254 | lazy_connect = None; 255 | send_error.inc(); 256 | continue; 257 | } 258 | }; 259 | } 260 | } 261 | } 262 | 263 | /// 264 | /// Ticker is responsible for making sure the statsd channel emits a payload at 265 | /// a particular rate (allowing for write combining). Due to an issue with 266 | /// non-async mpsc try_send being used to trigger the primary sender queue, the 267 | /// ticker is needed as opposed to a timeout() wrapper over a queue.recv, which 268 | /// does not reliably get woken by try_send. The upside of this we also form one 269 | /// less short lived timer, not that its really a major advantage. 
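/// The ticker simply sleeps for `BUFFER_FILL_WAIT` between sends and exits
/// once the receiving side of the channel has gone away.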
270 | async fn ticker(endpoint: String, sender: mpsc::Sender) { 271 | loop { 272 | sleep(BUFFER_FILL_WAIT).await; 273 | if sender.send(true).await.is_err() { 274 | debug!("ticker task {} exiting", endpoint); 275 | return; 276 | } 277 | } 278 | } 279 | 280 | /// Receive metrics until the buffer fills up or timeout happens 281 | /// then send the buffer over the network with client_sender above 282 | async fn client_task( 283 | stats: stats::Scope, 284 | endpoint: String, 285 | connect_tripwire: Tripwire, 286 | mut recv: mpsc::Receiver, 287 | mut ticker_recv: mpsc::Receiver, 288 | shutdown_send: Option>, 289 | trim_on_disconnect: bool, 290 | ) { 291 | let partial_buffer_send = stats.counter("partial_buffer_send").unwrap(); 292 | let messages_queued = stats.counter("messages_queued").unwrap(); 293 | 294 | let mut buf = BytesMut::with_capacity(INITIAL_BUF_CAPACITY); 295 | let (buf_sender, buf_recv) = mpsc::channel(10); 296 | let sender_join = tokio::spawn(client_sender( 297 | stats, 298 | endpoint.clone(), 299 | connect_tripwire, 300 | buf_recv, 301 | trim_on_disconnect, 302 | )); 303 | 304 | loop { 305 | let (pdu, timeout) = select! { 306 | p = recv.recv() => (p, false), 307 | _ = ticker_recv.recv() => (None, true), 308 | }; 309 | 310 | match (pdu, timeout) { 311 | (Some(pdu), _) => { 312 | let pdu_bytes = pdu.as_bytes(); 313 | if buf.remaining_mut() < pdu_bytes.len() { 314 | buf.reserve(pdu_bytes.len() + 10); 315 | } 316 | buf.put(pdu_bytes); 317 | buf.put(b"\n".as_ref()); 318 | messages_queued.inc(); 319 | if buf.len() < SEND_THRESHOLD { 320 | continue; // Do not send yet 321 | } 322 | } 323 | (None, false) => { 324 | if buf.is_empty() { 325 | // No more queue, no more bytes, exit 326 | drop(buf_sender); 327 | let _ = sender_join.await; 328 | debug!("client task {} exiting", endpoint); 329 | drop(shutdown_send); 330 | return; 331 | } 332 | } 333 | (None, true) if buf.is_empty() => { 334 | continue; 335 | } 336 | (None, true) => { 337 | partial_buffer_send.inc(); 338 | // Timeout! 
Just go ahead and send whats in the buf now 339 | } 340 | }; 341 | 342 | // Flush the buffer to the mpsc channel for the client_sender to send over the network 343 | if let Err(send_error) = buf_sender.send(buf.freeze()).await { 344 | debug!("client task {} exiting", endpoint); 345 | warn!("error flushing buffer to client_sender {}", send_error); 346 | let _ = sender_join.await; 347 | drop(shutdown_send); 348 | return; 349 | } 350 | buf = BytesMut::with_capacity(INITIAL_BUF_CAPACITY); 351 | } 352 | } 353 | -------------------------------------------------------------------------------- /statsrelay/src/backends.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{HashMap, HashSet}; 2 | use std::sync::Arc; 3 | 4 | use parking_lot::RwLock; 5 | use stream_cancel::Tripwire; 6 | use thiserror::Error; 7 | use tokio::sync::mpsc; 8 | 9 | use crate::backend::Backend; 10 | use crate::discovery; 11 | use crate::stats; 12 | use crate::statsd_proto::Event; 13 | use crate::{config, processors}; 14 | 15 | #[derive(Error, Debug)] 16 | pub enum BackendError { 17 | #[error("Index not valid for backend {0}")] 18 | InvalidIndex(usize), 19 | } 20 | 21 | struct BackendsInner { 22 | backends: HashMap, 23 | processors: HashMap>, 24 | stats: stats::Scope, 25 | shutdown_recv: Option>, 26 | shutdown_send: Option>, 27 | } 28 | 29 | impl BackendsInner { 30 | fn new(stats: stats::Scope) -> Self { 31 | let (shutdown_send, shutdown_recv) = mpsc::channel::<()>(1); 32 | 33 | BackendsInner { 34 | backends: HashMap::new(), 35 | processors: HashMap::new(), 36 | stats, 37 | shutdown_recv: Some(shutdown_recv), 38 | shutdown_send: Some(shutdown_send), 39 | } 40 | } 41 | 42 | fn replace_processor( 43 | &mut self, 44 | name: &str, 45 | processor: Box, 46 | ) -> anyhow::Result<()> { 47 | self.processors.insert(name.to_owned(), processor); 48 | Ok(()) 49 | } 50 | 51 | fn replace_backend( 52 | &mut self, 53 | name: &str, 54 | c: &config::BackendConfig, 55 | discovery_update: Option<&discovery::Update>, 56 | ) -> anyhow::Result<()> { 57 | let previous = self.backends.get(name); 58 | let backend = Backend::new( 59 | self.stats.scope(name), 60 | c, 61 | previous, 62 | discovery_update, 63 | self.shutdown_send.clone(), 64 | )?; 65 | self.backends.insert(name.to_owned(), backend); 66 | Ok(()) 67 | } 68 | 69 | fn len(&self) -> usize { 70 | self.backends.len() 71 | } 72 | 73 | fn remove_backend(&mut self, name: &str) -> anyhow::Result<()> { 74 | self.backends.remove(name); 75 | // With the backend removed should we ensure it's flushed? 76 | Ok(()) 77 | } 78 | 79 | fn backend_names(&self) -> HashSet<&String> { 80 | self.backends.keys().collect() 81 | } 82 | 83 | fn provide_statsd(&self, pdu: &Event, route: &[config::Route]) { 84 | for dest in route { 85 | match dest.route_type { 86 | config::RouteType::Statsd => { 87 | if let Some(backend) = self.backends.get(dest.route_to.as_str()) { 88 | backend.provide_statsd(pdu) 89 | } 90 | } 91 | config::RouteType::Processor => { 92 | if let Some(chain) = self 93 | .processors 94 | .get(dest.route_to.as_str()) 95 | .and_then(|proc| proc.provide_statsd(pdu)) 96 | { 97 | match chain.new_events { 98 | None => self.provide_statsd(pdu, chain.route), 99 | Some(sv) => { 100 | for pdu in sv.as_ref() { 101 | self.provide_statsd(pdu, chain.route); 102 | } 103 | } 104 | } 105 | } 106 | } 107 | } 108 | } 109 | } 110 | 111 | /// Provide a periodic "tick" function to drive processors background 112 | /// housekeeping tasks asynchronously. 
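/// This is driven by the `ticker` task later in this module, which invokes
/// `Backends::processor_tick` roughly once per second on a blocking worker
/// thread with the current `SystemTime`.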
113 | fn processor_tick(&self, now: std::time::SystemTime, backends: &Backends) { 114 | for (_, proc) in self.processors.iter() { 115 | proc.tick(now, backends); 116 | } 117 | } 118 | 119 | /// Perform a flush operation on the processors, forcing an immediate state 120 | /// transfer. Each processor is flushed n_processors times to ensure no 121 | /// processor maintains state in a cascading pipeline. 122 | fn flush(&self, backends: &Backends) { 123 | for _i in 0..self.processors.len() { 124 | for (_, proc) in self.processors.iter() { 125 | proc.flush(backends); 126 | } 127 | } 128 | } 129 | 130 | /// Drain all backends, and return a receiver to ensure they complete 131 | fn drain_backends(&mut self) -> Option> { 132 | self.backends.clear(); 133 | drop(self.shutdown_send.take()); 134 | self.shutdown_recv.take() 135 | } 136 | } 137 | 138 | /// 139 | /// Backends provides a cloneable container for various protocol backends, 140 | /// handling logic like sharding, sampling, and other detectors. 141 | /// 142 | #[derive(Clone)] 143 | pub struct Backends { 144 | inner: Arc>, 145 | } 146 | 147 | impl Backends { 148 | pub fn new(stats: stats::Scope) -> Self { 149 | Backends { 150 | inner: Arc::new(RwLock::new(BackendsInner::new(stats))), 151 | } 152 | } 153 | 154 | pub fn replace_processor( 155 | &self, 156 | name: &str, 157 | processor: Box, 158 | ) -> anyhow::Result<()> { 159 | self.inner.write().replace_processor(name, processor) 160 | } 161 | 162 | pub fn replace_backend( 163 | &self, 164 | name: &str, 165 | c: &config::BackendConfig, 166 | discovery_update: Option<&discovery::Update>, 167 | ) -> anyhow::Result<()> { 168 | self.inner 169 | .write() 170 | .replace_backend(name, c, discovery_update) 171 | } 172 | 173 | pub fn remove_backend(&self, name: &str) -> anyhow::Result<()> { 174 | self.inner.write().remove_backend(name) 175 | } 176 | 177 | pub fn backend_names(&self) -> HashSet { 178 | self.inner 179 | .read() 180 | .backend_names() 181 | .iter() 182 | .map(|s| (*s).clone()) 183 | .collect() 184 | } 185 | 186 | pub fn len(&self) -> usize { 187 | self.inner.read().len() 188 | } 189 | 190 | pub fn is_empty(&self) -> bool { 191 | self.len() == 0 192 | } 193 | 194 | pub fn provide_statsd(&self, pdu: &Event, route: &[config::Route]) { 195 | self.inner.read().provide_statsd(pdu, route) 196 | } 197 | 198 | pub fn provide_statsd_slice(&self, pdu: &[Event], route: &[config::Route]) { 199 | let lock = self.inner.read(); 200 | for p in pdu { 201 | lock.provide_statsd(p, route); 202 | } 203 | } 204 | 205 | pub fn processor_tick(&self, now: std::time::SystemTime) { 206 | self.inner.read().processor_tick(now, self); 207 | } 208 | 209 | pub fn flush(&self) { 210 | self.inner.read().flush(self); 211 | } 212 | 213 | pub fn drain_backends(&mut self) -> Option> { 214 | self.inner.write().drain_backends() 215 | } 216 | } 217 | 218 | pub async fn ticker(tripwire: Tripwire, backends: Backends) { 219 | let mut ticker = tokio::time::interval_at( 220 | tokio::time::Instant::now(), 221 | tokio::time::Duration::from_secs(1), 222 | ); 223 | loop { 224 | tokio::select! 
{ 225 | _ = tripwire.clone() => { return; } 226 | _ = ticker.tick() => { 227 | let back = backends.clone(); 228 | tokio::task::spawn_blocking(move || { 229 | back.processor_tick(std::time::SystemTime::now()) 230 | }).await.unwrap(); 231 | } 232 | } 233 | } 234 | } 235 | 236 | #[cfg(test)] 237 | pub mod test { 238 | 239 | use super::*; 240 | use crate::processors::{self, Processor}; 241 | use crate::statsd_proto; 242 | use crate::statsd_proto::Parsed; 243 | 244 | use std::convert::TryInto; 245 | use std::sync::atomic::AtomicU32; 246 | use std::sync::atomic::Ordering; 247 | use std::sync::Arc; 248 | 249 | struct AssertProc 250 | where 251 | T: Fn(&Event), 252 | { 253 | proc: T, 254 | count: Arc, 255 | } 256 | 257 | impl processors::Processor for AssertProc { 258 | fn provide_statsd(&self, sample: &Event) -> Option { 259 | (self.proc)(sample); 260 | self.count.fetch_add(1, Ordering::Acquire); 261 | None 262 | } 263 | } 264 | 265 | #[test] 266 | fn simple_nil_backend() { 267 | let scope = crate::stats::Collector::default().scope("prefix"); 268 | let _backend = Backends::new(scope); 269 | } 270 | 271 | fn make_counting_mock() -> (Arc, Box) { 272 | let counter = Arc::new(AtomicU32::new(0)); 273 | let proc = Box::new(AssertProc { 274 | proc: |_| {}, 275 | count: counter.clone(), 276 | }); 277 | (counter, proc) 278 | } 279 | 280 | fn make_asserting_mock( 281 | t: T, 282 | ) -> (Arc, Box) { 283 | let counter = Arc::new(AtomicU32::new(0)); 284 | let proc = Box::new(AssertProc { 285 | proc: t, 286 | count: counter.clone(), 287 | }); 288 | (counter, proc) 289 | } 290 | 291 | fn insert_proc(backend: &Backends, name: &str, proc: Box) { 292 | backend 293 | .inner 294 | .write() 295 | .processors 296 | .insert(name.to_owned(), proc); 297 | } 298 | 299 | #[test] 300 | fn processor_tag_test() { 301 | // Create the backend 302 | let scope = crate::stats::Collector::default().scope("prefix"); 303 | let backend = Backends::new(scope); 304 | 305 | // Create a mock processor to receive all messages 306 | let route_final = vec![config::Route { 307 | route_type: config::RouteType::Processor, 308 | route_to: "final".to_owned(), 309 | }]; 310 | let (counter, proc) = make_asserting_mock(|sample| { 311 | let owned: statsd_proto::Owned = sample.try_into().unwrap(); 312 | assert_eq!(owned.name(), b"foo.bar.__tags=value"); 313 | }); 314 | 315 | // Insert the assert processors 316 | insert_proc(&backend, "final", proc); 317 | 318 | // Create the processor under test 319 | let tn = processors::tag::Normalizer::new(&route_final); 320 | insert_proc(&backend, "tag", Box::new(tn)); 321 | 322 | let pdu = 323 | statsd_proto::Pdu::parse(bytes::Bytes::from_static(b"foo.bar:3|c|#tags:value|@1.0")) 324 | .unwrap(); 325 | let route = vec![config::Route { 326 | route_type: config::RouteType::Processor, 327 | route_to: "tag".to_owned(), 328 | }]; 329 | backend.provide_statsd(&Event::Pdu(pdu), &route); 330 | 331 | // Check how many messages the mock has received 332 | let actual_count = counter.load(Ordering::Acquire); 333 | assert_eq!(1, actual_count); 334 | } 335 | 336 | #[test] 337 | fn processor_fanout_test() { 338 | // Create the backend 339 | let scope = crate::stats::Collector::default().scope("prefix"); 340 | let backend = Backends::new(scope); 341 | 342 | // Create a mock processor to receive all messages, 2x over 343 | let route_final = vec![ 344 | config::Route { 345 | route_type: config::RouteType::Processor, 346 | route_to: "final1".to_owned(), 347 | }, 348 | config::Route { 349 | route_type: config::RouteType::Processor, 
350 | route_to: "final2".to_owned(), 351 | }, 352 | ]; 353 | let (counter1, proc1) = make_counting_mock(); 354 | let (counter2, proc2) = make_counting_mock(); 355 | 356 | // Insert the assert processors 357 | insert_proc(&backend, "final1", proc1); 358 | insert_proc(&backend, "final2", proc2); 359 | 360 | // Create the processor under test 361 | let tn = processors::tag::Normalizer::new(&route_final); 362 | insert_proc(&backend, "tag", Box::new(tn)); 363 | 364 | let pdu = 365 | statsd_proto::Pdu::parse(bytes::Bytes::from_static(b"foo.bar:3|c|#tags:value|@1.0")) 366 | .unwrap(); 367 | let route = vec![config::Route { 368 | route_type: config::RouteType::Processor, 369 | route_to: "tag".to_owned(), 370 | }]; 371 | backend.provide_statsd(&Event::Pdu(pdu), &route); 372 | 373 | // Check how many messages the mock has received 374 | let actual_count = counter1.load(Ordering::Acquire); 375 | assert_eq!(1, actual_count); 376 | let actual_count2 = counter2.load(Ordering::Acquire); 377 | assert_eq!(1, actual_count2); 378 | } 379 | } 380 | -------------------------------------------------------------------------------- /statsrelay/src/cmd/loadgen.rs: -------------------------------------------------------------------------------- 1 | use bytes::{BufMut, BytesMut}; 2 | use chrono::prelude::*; 3 | use std::time::Duration; 4 | use structopt::StructOpt; 5 | use tokio::io::AsyncWriteExt; 6 | use tokio::net::TcpStream; 7 | 8 | const PRINT_INTERVAL: u64 = 100000; 9 | 10 | #[derive(StructOpt, Debug)] 11 | struct Options { 12 | #[structopt(short = "e", long = "--endpoint", default_value = "localhost:8129")] 13 | pub endpoint: String, 14 | } 15 | 16 | #[tokio::main] 17 | async fn main() { 18 | let options = Options::from_args(); 19 | let mut stream = TcpStream::connect(options.endpoint).await.unwrap(); 20 | let mut buf = BytesMut::with_capacity(131072); 21 | let mut counter = 0_u64; 22 | let mut last_time = Local::now(); 23 | loop { 24 | for _ in 0..1 { 25 | buf.put( 26 | format!( 27 | "hello.hello.hello.hello.hello.hello.hello.hello.hello:{}|c\n", 28 | counter 29 | ) 30 | .as_bytes() 31 | .as_ref(), 32 | ); 33 | } 34 | stream.write_buf(&mut buf).await.unwrap(); 35 | counter += 1; 36 | 37 | if counter % PRINT_INTERVAL == 0 { 38 | let now_time = Local::now(); 39 | let diff = now_time - last_time; 40 | last_time = now_time; 41 | println!( 42 | "{}: sent {:15} lines in {:5}ms ({:.0} l/s)", 43 | now_time.format("%H:%M:%S"), 44 | counter, 45 | diff.num_milliseconds(), 46 | PRINT_INTERVAL as f64 / (diff.num_milliseconds() as f64 / 1000.0) 47 | ); 48 | tokio::time::sleep(Duration::from_millis(40)).await; 49 | }; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /statsrelay/src/cmd/statsrelay.rs: -------------------------------------------------------------------------------- 1 | extern crate jemallocator; 2 | 3 | #[global_allocator] 4 | static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; 5 | 6 | use anyhow::Context; 7 | use futures::StreamExt; 8 | use futures::{stream::FuturesUnordered, FutureExt}; 9 | use stream_cancel::Tripwire; 10 | use structopt::StructOpt; 11 | 12 | use std::collections::HashMap; 13 | use std::collections::HashSet; 14 | 15 | use tokio::runtime; 16 | use tokio::select; 17 | use tokio::signal::unix::{signal, SignalKind}; 18 | use tokio::time::{sleep_until, Duration, Instant}; 19 | 20 | use env_logger::Env; 21 | use log::{debug, error, info, warn}; 22 | 23 | use statsrelay::config; 24 | use statsrelay::discovery; 25 | use 
statsrelay::processors; 26 | use statsrelay::stats; 27 | use statsrelay::statsd_server; 28 | use statsrelay::{admin, config::Config}; 29 | use statsrelay::{backends, stats::Scope}; 30 | 31 | #[derive(StructOpt, Debug, Clone)] 32 | struct Options { 33 | #[structopt(short = "c", long = "--config", default_value = "/etc/statsrelay.json")] 34 | pub config: String, 35 | 36 | #[structopt(long = "--config-check-and-exit")] 37 | pub config_check: bool, 38 | 39 | #[structopt(long = "--shutdown-delay", default_value = "0")] 40 | pub shutdown_delay: u32, 41 | 42 | #[structopt(long = "--shutdown-wait", default_value = "5")] 43 | pub shutdown_wait: u32, 44 | 45 | #[structopt(short = "t", long = "--threaded")] 46 | pub threaded: bool, 47 | 48 | #[structopt(long = "--version")] 49 | pub version: bool, 50 | } 51 | 52 | /// The main server invocation, for a given configuration, options and stats 53 | /// scope. The server will spawn any listeners, initialize a backend 54 | /// configuration update loop, as well as register signal handlers. 55 | async fn server(scope: stats::Scope, config: Config, opts: Options) { 56 | let backend_reloads = scope.counter("backend_reloads").unwrap(); 57 | let config_load_failures = scope.counter("backend_reloads_failure").unwrap(); 58 | let mut backends = backends::Backends::new(scope.scope("backends")); 59 | 60 | // Load processors 61 | if let Some(processors) = config.processors.as_ref() { 62 | load_processors(scope.scope("processors"), &backends, processors) 63 | .await 64 | .unwrap(); 65 | } 66 | 67 | // Build a pair of unrelated tripwires, which can shut down the input and 68 | // output portions of the stack at different times. 69 | let (server_cancel, server_tripwire) = Tripwire::new(); 70 | let (backend_ticker_cancel, backend_ticker_tripwire) = Tripwire::new(); 71 | 72 | let mut servers: FuturesUnordered<_> = config 73 | .statsd 74 | .servers 75 | .iter() 76 | .map({ 77 | |(server_name, server_config)| { 78 | let name = server_name.clone(); 79 | statsd_server::run( 80 | scope.scope("statsd_server").scope(server_name), 81 | server_tripwire.clone(), 82 | server_config.clone(), 83 | backends.clone(), 84 | ) 85 | .map(|_| name) 86 | } 87 | }) 88 | .collect(); 89 | 90 | // Trap ctrl+c and sigterm messages and perform a clean shutdown 91 | let mut sigint = signal(SignalKind::interrupt()).unwrap(); 92 | let mut sigterm = signal(SignalKind::terminate()).unwrap(); 93 | let shutdown_delay = opts.shutdown_delay; 94 | tokio::spawn(async move { 95 | select! { 96 | _ = sigint.recv() => info!("received sigint"), 97 | _ = sigterm.recv() => info!("received sigterm"), 98 | } 99 | if shutdown_delay > 0 { 100 | info!( 101 | "waiting {} seconds before shutting down (--shutdown-delay set", 102 | shutdown_delay 103 | ); 104 | tokio::time::sleep(Duration::from_secs(shutdown_delay as u64)).await; 105 | } 106 | // We only want to abort the server/input side of the system, let it 107 | // drain, then work on shutting down the output side. Cancelling the 108 | // server will complete the server future, which then allow the runtime 109 | // loop to proceed with an orderly shutdown. 110 | server_cancel.cancel(); 111 | }); 112 | 113 | // Trap sighup to support manual file reloading 114 | let mut sighup = signal(SignalKind::hangup()).unwrap(); 115 | // This task is designed to asynchronously build backend configurations, 116 | // which may in turn come from other data sources or discovery sources. 
117 | // This inherently races with bringing up servers, to the point where a 118 | // server may not have any backends to dispatch to yet, if discovery is 119 | // very slow. This is the intended state, as configuration of processors 120 | // and any buffers should have already been performed. 121 | // 122 | // SIGHUP will attempt to reload backend configurations as well as any 123 | // discovery changes. 124 | let discovery_backends = backends.clone(); 125 | let options_reload = opts.clone(); 126 | tokio::spawn(async move { 127 | let mut last_config = config.clone(); 128 | let dconfig = config.discovery.unwrap_or_default(); 129 | let discovery_cache = discovery::Cache::new(); 130 | let mut discovery_stream = 131 | discovery::reflector(discovery_cache.clone(), discovery::as_stream(&dconfig)); 132 | loop { 133 | info!("loading configuration and updating backends"); 134 | backend_reloads.inc(); 135 | let config = match load_backend_configs( 136 | &discovery_cache, 137 | &discovery_backends, 138 | options_reload.config.as_ref(), 139 | ) 140 | .await 141 | { 142 | Ok(config) => { 143 | last_config = config.clone(); 144 | config 145 | } 146 | Err(e) => { 147 | config_load_failures.inc(); 148 | error!("error reloading configuration from disk, using original configuration: {:?}", e); 149 | last_config.clone() 150 | } 151 | }; 152 | let dconfig = config.discovery.unwrap_or_default(); 153 | 154 | tokio::select! { 155 | _ = sighup.recv() => { 156 | info!("received sighup"); 157 | discovery_stream = discovery::reflector(discovery_cache.clone(), discovery::as_stream(&dconfig)); 158 | info!("reloaded discovery stream"); 159 | } 160 | Some(event) = discovery_stream.next() => { 161 | info!("updating discovery for map {}", event.0); 162 | } 163 | }; 164 | } 165 | }); 166 | 167 | // Start processing processor tickers 168 | let ticker_backends = backends.clone(); 169 | tokio::spawn(backends::ticker( 170 | backend_ticker_tripwire.clone(), 171 | ticker_backends, 172 | )); 173 | 174 | // Wait for the server to finish 175 | while let Some(name) = servers.next().await { 176 | debug!("server {} exited", name) 177 | } 178 | debug!("forcing processor tick to flush"); 179 | backends.flush(); 180 | debug!("stopping backend ticker"); 181 | backend_ticker_cancel.cancel(); 182 | let shutdown_recv = backends.drain_backends(); 183 | match shutdown_recv { 184 | Some(mut shutdown_recv) => { 185 | let drain_timeout = Instant::now() + Duration::from_secs(opts.shutdown_wait as u64); 186 | select! 
{ 187 | _ = sleep_until(drain_timeout) => { 188 | info!("backends didn't finish draining within {:?} seconds, aborting", opts.shutdown_wait); 189 | } 190 | _ = shutdown_recv.recv() => { 191 | info!("all backends finished"); 192 | } 193 | } 194 | } 195 | None => { 196 | warn!("backends already drained"); 197 | } 198 | } 199 | } 200 | 201 | fn main() -> anyhow::Result<()> { 202 | env_logger::Builder::from_env(Env::default().default_filter_or("info")).init(); 203 | let opts = Options::from_args(); 204 | 205 | if opts.version { 206 | println!( 207 | "statsrelay - {} - {}", 208 | statsrelay::built_info::PKG_VERSION, 209 | statsrelay::built_info::GIT_COMMIT_HASH.unwrap_or("unknown") 210 | ); 211 | return Ok(()); 212 | } 213 | info!( 214 | "statsrelay loading - {} - {}", 215 | statsrelay::built_info::PKG_VERSION, 216 | statsrelay::built_info::GIT_COMMIT_HASH.unwrap_or("unknown") 217 | ); 218 | 219 | let config = statsrelay::config::load(opts.config.as_ref()) 220 | .with_context(|| format!("can't load config file from {}", opts.config))?; 221 | info!("loaded config file {}", opts.config); 222 | debug!("servers defined: {:?}", config.statsd.servers); 223 | if opts.config_check { 224 | info!("--config-check-and-exit set, exiting"); 225 | return Ok(()); 226 | } 227 | 228 | let collector = stats::Collector::default(); 229 | 230 | if let Some(admin) = &config.admin { 231 | admin::spawn_admin_server(admin.port, collector.clone()); 232 | info!("spawned admin server on port {}", admin.port); 233 | } 234 | debug!("installed metrics receiver"); 235 | 236 | let mut builder = match opts.threaded { 237 | true => runtime::Builder::new_multi_thread(), 238 | false => runtime::Builder::new_current_thread(), 239 | }; 240 | 241 | let runtime = builder.enable_all().build().unwrap(); 242 | info!("tokio runtime built, threaded: {}", opts.threaded); 243 | 244 | let scope = collector.scope("statsrelay"); 245 | 246 | runtime.block_on(server(scope, config, opts)); 247 | 248 | drop(runtime); 249 | info!("runtime terminated"); 250 | Ok(()) 251 | } 252 | 253 | /// Load processors from a given config structure and pack them into the given 254 | /// backend set. Currently processors can't be reloaded at runtime. 255 | async fn load_processors( 256 | scope: Scope, 257 | backends: &backends::Backends, 258 | processors: &HashMap, 259 | ) -> anyhow::Result<()> { 260 | for (name, cp) in processors.iter() { 261 | let proc: Box = match cp { 262 | config::Processor::TagConverter(tc) => { 263 | info!("processor tag_converter: {:?}", tc); 264 | Box::new(processors::tag::Normalizer::new(tc.route.as_ref())) 265 | } 266 | config::Processor::Sampler(sampler) => { 267 | info!("processor sampler: {:?}", sampler); 268 | Box::new(processors::sampler::Sampler::new(sampler)?) 269 | } 270 | config::Processor::Cardinality(cardinality) => { 271 | info!("processor cardinality: {:?}", cardinality); 272 | Box::new(processors::cardinality::Cardinality::new( 273 | scope.scope(name), 274 | cardinality, 275 | )) 276 | } 277 | config::Processor::RegexFilter(regex) => { 278 | info!("processor regex_filter: {:?}", regex); 279 | Box::new(processors::regex_filter::RegexFilter::new( 280 | scope.scope(name), 281 | regex, 282 | )?) 
283 | } 284 | }; 285 | backends.replace_processor(name.as_str(), proc)?; 286 | } 287 | Ok(()) 288 | } 289 | 290 | async fn load_backend_configs( 291 | discovery_cache: &discovery::Cache, 292 | backends: &backends::Backends, 293 | path: &str, 294 | ) -> anyhow::Result { 295 | // Check if we have to load the configuration file 296 | let config = match statsrelay::config::load(path) 297 | .with_context(|| format!("can't load config file from {}", path)) 298 | { 299 | Err(e) => { 300 | error!("failed to reload configuration: {}", e); 301 | return Err(e).context("failed to reload configuration file"); 302 | } 303 | Ok(ok) => ok, 304 | }; 305 | 306 | let duplicate = &config.statsd.backends; 307 | for (name, dp) in duplicate.iter() { 308 | let discovery_data = if let Some(discovery_name) = &dp.shard_map_source { 309 | discovery_cache.get(discovery_name) 310 | } else { 311 | None 312 | }; 313 | if let Err(e) = backends.replace_backend(name, dp, discovery_data.as_ref()) { 314 | error!("failed to replace backend index {} error {}", name, e); 315 | continue; 316 | } 317 | } 318 | let existing_backends = backends.backend_names(); 319 | let config_backends: HashSet = duplicate.keys().cloned().collect(); 320 | let difference = existing_backends.difference(&config_backends); 321 | for remove in difference { 322 | if let Err(e) = backends.remove_backend(remove) { 323 | error!("failed to remove backend {} with error {:?}", remove, e); 324 | } 325 | } 326 | 327 | info!("backends reloaded"); 328 | Ok(config) 329 | } 330 | -------------------------------------------------------------------------------- /statsrelay/src/config.rs: -------------------------------------------------------------------------------- 1 | use regex::bytes::RegexSet; 2 | use serde::{Deserialize, Deserializer, Serialize, Serializer}; 3 | use std::collections::HashMap; 4 | use std::convert::{AsRef, TryFrom, TryInto}; 5 | use std::fmt; 6 | use thiserror::Error; 7 | 8 | #[derive(Debug, Clone, PartialEq)] 9 | pub enum RouteType { 10 | Statsd, 11 | Processor, 12 | } 13 | 14 | impl TryFrom<&str> for RouteType { 15 | type Error = Error; 16 | 17 | fn try_from(value: &str) -> Result { 18 | match value { 19 | "statsd" => Ok(RouteType::Statsd), 20 | "processor" => Ok(RouteType::Processor), 21 | _ => Err(Error::UnknownRouteType(value.to_string())), 22 | } 23 | } 24 | } 25 | 26 | impl From<&RouteType> for &str { 27 | fn from(t: &RouteType) -> Self { 28 | match t { 29 | RouteType::Statsd => "statsd", 30 | RouteType::Processor => "processor", 31 | } 32 | } 33 | } 34 | 35 | impl fmt::Display for RouteType { 36 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 37 | let s: &str = self.into(); 38 | write!(f, "{}", s) 39 | } 40 | } 41 | 42 | #[derive(Debug, Clone, PartialEq)] 43 | pub struct Route { 44 | pub route_type: RouteType, 45 | pub route_to: String, 46 | } 47 | 48 | impl fmt::Display for Route { 49 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 50 | write!(f, "{}:{}", self.route_type, self.route_to) 51 | } 52 | } 53 | 54 | impl<'de> Deserialize<'de> for Route { 55 | fn deserialize(deserializer: D) -> Result 56 | where 57 | D: Deserializer<'de>, 58 | { 59 | let s: &str = Deserialize::deserialize(deserializer)?; 60 | let parts: Vec<&str> = s.split(':').collect(); 61 | if let [ty, to] = &parts[..] 
{ 62 | Ok(Route { 63 | route_type: (*ty).try_into().map_err(serde::de::Error::custom)?, 64 | route_to: (*to).into(), 65 | }) 66 | } else { 67 | Err(Error::MalformedRoute(s.to_string())).map_err(serde::de::Error::custom) 68 | } 69 | } 70 | } 71 | 72 | impl Serialize for Route { 73 | fn serialize(&self, serializer: S) -> Result 74 | where 75 | S: Serializer, 76 | { 77 | serializer.serialize_str(format!("{}:{}", self.route_type, self.route_to).as_str()) 78 | } 79 | } 80 | 81 | pub mod processor { 82 | use super::*; 83 | 84 | #[derive(Debug, Serialize, Deserialize, Clone)] 85 | pub struct Sampler { 86 | pub window: u32, 87 | pub timer_reservoir_size: Option, 88 | 89 | pub route: Vec, 90 | } 91 | 92 | #[derive(Serialize, Deserialize, Debug, Clone)] 93 | pub struct TagConverter { 94 | pub route: Vec, 95 | } 96 | 97 | #[derive(Serialize, Deserialize, Debug, Clone)] 98 | pub struct Cardinality { 99 | pub size_limit: usize, 100 | pub rotate_after_seconds: u64, 101 | pub buckets: usize, 102 | pub route: Vec, 103 | } 104 | 105 | #[derive(Serialize, Deserialize, Debug, Clone)] 106 | pub struct RegexFilter { 107 | pub remove: Option>, 108 | pub allow: Option>, 109 | pub route: Vec, 110 | } 111 | } 112 | 113 | #[derive(Serialize, Deserialize, Debug, Clone)] 114 | #[serde(tag = "type", rename_all = "snake_case")] 115 | pub enum Processor { 116 | Sampler(processor::Sampler), 117 | TagConverter(processor::TagConverter), 118 | Cardinality(processor::Cardinality), 119 | RegexFilter(processor::RegexFilter), 120 | } 121 | 122 | #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] 123 | pub struct BackendConfig { 124 | #[serde(flatten)] 125 | pub client_config: BackendClientConfig, 126 | #[serde(default)] 127 | pub shard_map: Vec, 128 | pub shard_map_source: Option, 129 | pub suffix: Option, 130 | pub prefix: Option, 131 | pub input_blocklist: Option, 132 | pub input_filter: Option, 133 | } 134 | 135 | #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] 136 | #[serde(tag = "type", rename_all = "snake_case")] 137 | pub enum BackendClientConfig { 138 | Statsd(StatsdBackendClientConfig), 139 | PromRemoteWrite(PromBackendClientConfig), 140 | } 141 | 142 | #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] 143 | pub struct StatsdBackendClientConfig { 144 | #[serde(default = "default_max_queue")] 145 | pub max_queue: usize, 146 | #[serde(default = "default_trim_on_disconnect")] 147 | pub trim_on_disconnect: bool, 148 | } 149 | 150 | #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] 151 | pub struct PromBackendClientConfig { 152 | #[serde(default = "default_max_queue")] 153 | pub max_queue: usize, 154 | #[serde(default = "default_request_timeout_ms")] 155 | pub request_timeout_ms: u64, 156 | #[serde(default = "default_batch_creation_timeout_ms")] 157 | pub batch_creation_timeout_ms: u64, 158 | #[serde(default = "default_max_batch_size")] 159 | pub max_batch_size: usize, 160 | #[serde(default = "default_max_in_flight")] 161 | pub max_in_flight: usize, 162 | } 163 | 164 | fn default_max_queue() -> usize { 165 | 100000 166 | } 167 | fn default_request_timeout_ms() -> u64 { 168 | 10000 169 | } 170 | fn default_batch_creation_timeout_ms() -> u64 { 171 | 50 172 | } 173 | fn default_max_batch_size() -> usize { 174 | 500 175 | } 176 | fn default_max_in_flight() -> usize { 177 | 16 178 | } 179 | fn default_trim_on_disconnect() -> bool { 180 | false 181 | } 182 | 183 | impl BackendConfig { 184 | pub fn generate_input_filter(&self) -> Option { 185 | let mut filters: Vec = Vec::new(); 186 | 187 | if 
let Some(ref blocklist) = self.input_blocklist { 188 | filters.push(blocklist.clone()); 189 | } 190 | if let Some(ref filter) = self.input_filter { 191 | filters.push(filter.clone()); 192 | } 193 | 194 | if !filters.is_empty() { 195 | Some(RegexSet::new(filters).unwrap()) 196 | } else { 197 | None 198 | } 199 | } 200 | } 201 | 202 | #[derive(Serialize, Deserialize, Debug, Clone)] 203 | pub struct StatsdServerConfig { 204 | pub bind: String, 205 | pub socket: Option, 206 | pub read_buffer: Option, 207 | pub route: Vec, 208 | pub read_timeout_secs: Option, 209 | } 210 | 211 | #[derive(Serialize, Deserialize, Debug, Clone)] 212 | pub struct StatsdConfig { 213 | pub servers: HashMap, 214 | pub backends: HashMap, 215 | } 216 | 217 | #[derive(Debug, Serialize, Deserialize, Clone)] 218 | #[serde(tag = "type", rename_all = "snake_case")] 219 | pub enum DiscoveryTransform { 220 | Format { pattern: String }, 221 | Repeat { count: u32 }, 222 | } 223 | 224 | #[derive(Serialize, Deserialize, Debug, Clone)] 225 | pub struct S3DiscoverySource { 226 | pub bucket: String, 227 | pub key: String, 228 | pub interval: u32, 229 | pub transforms: Option>, 230 | } 231 | 232 | #[derive(Serialize, Deserialize, Debug, Clone)] 233 | pub struct PathDiscoverySource { 234 | pub path: String, 235 | pub interval: u32, 236 | pub transforms: Option>, 237 | } 238 | 239 | #[derive(Serialize, Deserialize, Debug, Clone)] 240 | #[serde(tag = "type", rename_all = "snake_case")] 241 | pub enum DiscoverySource { 242 | StaticFile(PathDiscoverySource), 243 | S3(S3DiscoverySource), 244 | } 245 | 246 | #[derive(Debug, Serialize, Deserialize, Clone, Default)] 247 | pub struct Discovery { 248 | pub sources: HashMap, 249 | } 250 | 251 | #[derive(Debug, Serialize, Deserialize, Clone)] 252 | pub struct AdminConfig { 253 | pub port: u16, 254 | } 255 | 256 | #[derive(Serialize, Deserialize, Debug, Clone)] 257 | pub struct Config { 258 | pub admin: Option, 259 | pub statsd: StatsdConfig, 260 | pub discovery: Option, 261 | pub processors: Option>, 262 | } 263 | 264 | #[derive(Error, Debug)] 265 | pub enum Error { 266 | #[error("could not locate discovery source {0}")] 267 | UnknownDiscoverySource(String), 268 | #[error("malformed route {0}")] 269 | MalformedRoute(String), 270 | #[error("invalid route type {0}")] 271 | UnknownRouteType(String), 272 | #[error("invalid routing destination {0}")] 273 | UnknownRoutingDestination(Route), 274 | } 275 | 276 | fn check_routes(config: &Config, routes: &[Route]) -> Result<(), Error> { 277 | let result: Result, Error> = routes 278 | .iter() 279 | .map(|route| match route.route_type { 280 | RouteType::Statsd => config 281 | .statsd 282 | .backends 283 | .get(route.route_to.as_str()) 284 | .ok_or_else(|| Error::UnknownRoutingDestination(route.clone())) 285 | .map(|_| ()), 286 | RouteType::Processor => { 287 | if let Some(procs) = &config.processors { 288 | return procs 289 | .get(route.route_to.as_str()) 290 | .ok_or_else(|| Error::UnknownRoutingDestination(route.clone())) 291 | .map(|_| ()); 292 | } else { 293 | Err(Error::UnknownRoutingDestination(route.clone())) 294 | } 295 | } 296 | }) 297 | .collect(); 298 | result.map(|_| ()) 299 | } 300 | 301 | fn check_config_route(config: &Config) -> Result<(), Error> { 302 | for (_, statsd) in config.statsd.servers.iter() { 303 | check_routes(config, statsd.route.as_ref())?; 304 | } 305 | let routes: Result, Error> = config 306 | .clone() 307 | .processors 308 | .unwrap_or_default() 309 | .iter() 310 | .map(|(_, proc)| match proc { 311 | Processor::Sampler(sampler) 
=> check_routes(config, sampler.route.as_ref()), 312 | Processor::TagConverter(tc) => check_routes(config, tc.route.as_ref()), 313 | Processor::Cardinality(c) => check_routes(config, c.route.as_ref()), 314 | Processor::RegexFilter(filter) => check_routes(config, filter.route.as_ref()), 315 | }) 316 | .collect(); 317 | routes.map(|_| ()) 318 | } 319 | 320 | fn check_config_discovery(config: &Config, discovery: &Discovery) -> anyhow::Result<()> { 321 | for (_, statsd_dupl) in config.statsd.backends.iter() { 322 | if let Some(source) = &statsd_dupl.shard_map_source { 323 | if discovery.sources.get(source).is_none() { 324 | return Err(Error::UnknownDiscoverySource(source.clone()).into()); 325 | } 326 | } 327 | } 328 | Ok(()) 329 | } 330 | 331 | fn check_config(config: &Config) -> anyhow::Result<()> { 332 | let default = Discovery::default(); 333 | let discovery = &config.discovery.as_ref().unwrap_or(&default); 334 | // Every reference to a shard_map needs a reference to a valid discovery block 335 | check_config_discovery(config, discovery)?; 336 | check_config_route(config)?; 337 | Ok(()) 338 | } 339 | 340 | pub fn load(path: &str) -> anyhow::Result { 341 | let input = std::fs::read_to_string(path)?; 342 | let config: Config = serde_json::from_str(input.as_ref())?; 343 | // Perform some high level validation 344 | check_config(&config)?; 345 | Ok(config) 346 | } 347 | 348 | #[cfg(test)] 349 | pub mod test { 350 | use super::*; 351 | use std::io::Write; 352 | use tempfile::NamedTempFile; 353 | 354 | #[test] 355 | fn load_example_config() { 356 | let config = r#" 357 | { 358 | "statsd": { 359 | "servers": { 360 | "default": 361 | { 362 | "bind": "127.0.0.1:BIND_STATSD_PORT", 363 | "route": ["statsd:test1"], 364 | "read_buffer": 65535 365 | } 366 | }, 367 | "backends": { 368 | "test1": 369 | { 370 | "type": "statsd", 371 | "prefix": "test-1.", 372 | "shard_map": [ 373 | "127.0.0.1:SEND_STATSD_PORT" 374 | ], 375 | "suffix": ".suffix" 376 | }, 377 | "mapsource": 378 | { 379 | "type": "statsd", 380 | "input_filter": "^(?=dontmatchme)", 381 | "prefix": "test-2.", 382 | "shard_map_source": "my_s3" 383 | }, 384 | "prom": 385 | { 386 | "type": "prom_remote_write", 387 | "request_timeout_ms": 60000, 388 | "max_batch_size": 200000 389 | } 390 | } 391 | }, 392 | "processors": { 393 | "tag1": { 394 | "type": "tag_converter", 395 | "route": ["statsd:test1"] 396 | }, 397 | "regex": { 398 | "type": "regex_filter", 399 | "allow": [".*"], 400 | "route": ["statsd:test1"] 401 | } 402 | }, 403 | "discovery": { 404 | "sources": { 405 | "file": { 406 | "type":"static_file", 407 | "path":"/tmp/file", 408 | "interval":5 409 | }, 410 | "my_s3": { 411 | "type": "s3", 412 | "bucket": "foo", 413 | "key": "bar", 414 | "interval": 3, 415 | "transforms": [ 416 | { 417 | "type": "repeat", 418 | "count": 3 419 | }, 420 | { 421 | "type": "format", 422 | "pattern": "{}:123" 423 | } 424 | ] 425 | } 426 | } 427 | } 428 | } 429 | "#; 430 | let mut tf = NamedTempFile::new().unwrap(); 431 | tf.write_all(config.as_bytes()).unwrap(); 432 | let config = load(tf.path().to_str().unwrap()).unwrap(); 433 | // Check servers 434 | let default_server = config.statsd.servers.get("default").unwrap(); 435 | assert_eq!( 436 | default_server.bind, 437 | "127.0.0.1:BIND_STATSD_PORT".to_string() 438 | ); 439 | // Check backends 440 | let statsd_backend = config.statsd.backends.get("test1").unwrap(); 441 | assert_eq!( 442 | statsd_backend, 443 | &BackendConfig { 444 | client_config: BackendClientConfig::Statsd(StatsdBackendClientConfig { 445 | 
max_queue: 100000, 446 | trim_on_disconnect: false, 447 | },), 448 | prefix: Some("test-1.".to_string()), 449 | shard_map: vec!["127.0.0.1:SEND_STATSD_PORT".to_string()], 450 | suffix: Some(".suffix".to_string()), 451 | shard_map_source: None, 452 | input_blocklist: None, 453 | input_filter: None, 454 | }, 455 | ); 456 | let prom_backend = config.statsd.backends.get("prom").unwrap(); 457 | assert_eq!( 458 | prom_backend, 459 | &BackendConfig { 460 | client_config: BackendClientConfig::PromRemoteWrite(PromBackendClientConfig { 461 | max_queue: 100000, 462 | request_timeout_ms: 60000, 463 | batch_creation_timeout_ms: 50, 464 | max_batch_size: 200000, 465 | max_in_flight: 16, 466 | }), 467 | prefix: None, 468 | shard_map: vec![], 469 | suffix: None, 470 | shard_map_source: None, 471 | input_blocklist: None, 472 | input_filter: None, 473 | }, 474 | ); 475 | // Check processors 476 | assert_eq!(2, config.clone().processors.unwrap_or_default().len()); 477 | // Check discovery 478 | let discovery = config.discovery.unwrap(); 479 | assert_eq!(2, discovery.sources.len()); 480 | let s3_source = discovery.sources.get("my_s3").unwrap(); 481 | match s3_source { 482 | DiscoverySource::S3(source) => { 483 | assert!(source.bucket == "foo"); 484 | } 485 | _ => panic!("not an s3 source"), 486 | }; 487 | } 488 | } 489 | -------------------------------------------------------------------------------- /statsrelay/src/cuckoofilter/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Seif Lotfy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /statsrelay/src/cuckoofilter/bucket.rs: -------------------------------------------------------------------------------- 1 | pub const FINGERPRINT_SIZE: usize = 1; 2 | pub const BUCKET_SIZE: usize = 4; 3 | const EMPTY_FINGERPRINT_DATA: [u8; FINGERPRINT_SIZE] = [100; FINGERPRINT_SIZE]; 4 | 5 | // Fingerprint Size is 1 byte so lets remove the Vec 6 | #[derive(PartialEq, Copy, Clone, Hash)] 7 | pub struct Fingerprint { 8 | pub data: [u8; FINGERPRINT_SIZE], 9 | } 10 | 11 | impl Fingerprint { 12 | /// Attempts to create a new Fingerprint based on the given 13 | /// number. If the created Fingerprint would be equal to the 14 | /// empty Fingerprint, None is returned. 
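/// With `FINGERPRINT_SIZE == 1` the empty pattern is `[100]`, so for example
/// `from_data([100])` yields `None` while `from_data([7])` yields `Some(_)`.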
15 | pub fn from_data(data: [u8; FINGERPRINT_SIZE]) -> Option { 16 | let result = Self { data }; 17 | if result.is_empty() { 18 | None 19 | } else { 20 | Some(result) 21 | } 22 | } 23 | 24 | /// Returns the empty Fingerprint. 25 | pub fn empty() -> Self { 26 | Self { 27 | data: EMPTY_FINGERPRINT_DATA, 28 | } 29 | } 30 | 31 | /// Checks if this is the empty Fingerprint. 32 | pub fn is_empty(&self) -> bool { 33 | self.data == EMPTY_FINGERPRINT_DATA 34 | } 35 | 36 | /// Sets the fingerprint value to a previously exported one via an in-memory copy. 37 | fn slice_copy(&mut self, fingerprint: &[u8]) { 38 | self.data.copy_from_slice(fingerprint); 39 | } 40 | } 41 | 42 | /// Manages `BUCKET_SIZE` fingerprints at most. 43 | #[derive(Clone)] 44 | pub struct Bucket { 45 | pub buffer: [Fingerprint; BUCKET_SIZE], 46 | } 47 | 48 | impl Bucket { 49 | /// Creates a new bucket with a pre-allocated buffer. 50 | pub fn new() -> Self { 51 | Self { 52 | buffer: [Fingerprint::empty(); BUCKET_SIZE], 53 | } 54 | } 55 | 56 | /// Inserts the fingerprint into the buffer if the buffer is not full. 57 | /// This operation is O(1). 58 | pub fn insert(&mut self, fp: Fingerprint) -> bool { 59 | for entry in &mut self.buffer { 60 | if entry.is_empty() { 61 | *entry = fp; 62 | return true; 63 | } 64 | } 65 | false 66 | } 67 | 68 | /// Deletes the given fingerprint from the bucket. This operation is O(1). 69 | pub fn delete(&mut self, fp: Fingerprint) -> bool { 70 | match self.get_fingerprint_index(fp) { 71 | Some(index) => { 72 | self.buffer[index] = Fingerprint::empty(); 73 | true 74 | } 75 | None => false, 76 | } 77 | } 78 | 79 | /// Returns the index of the given fingerprint, if its found. O(1) 80 | pub fn get_fingerprint_index(&self, fp: Fingerprint) -> Option { 81 | self.buffer.iter().position(|e| *e == fp) 82 | } 83 | 84 | /// Returns all current fingerprint data of the current buffer for storage. 85 | pub fn get_fingerprint_data(&self) -> Vec { 86 | self.buffer 87 | .iter() 88 | .flat_map(|f| f.data.iter()) 89 | .cloned() 90 | .collect() 91 | } 92 | 93 | /// Empties the bucket by setting each used entry to Fingerprint::empty(). Returns the number of entries that were modified. 94 | #[inline(always)] 95 | pub fn clear(&mut self) { 96 | *self = Self::new() 97 | } 98 | } 99 | 100 | impl From<&[u8]> for Bucket { 101 | /// Constructs a buffer of fingerprints from a set of previously exported fingerprints. 102 | fn from(fingerprints: &[u8]) -> Self { 103 | let mut buffer = [Fingerprint::empty(); BUCKET_SIZE]; 104 | for (idx, value) in fingerprints.chunks(FINGERPRINT_SIZE).enumerate() { 105 | buffer[idx].slice_copy(value); 106 | } 107 | Self { buffer } 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /statsrelay/src/cuckoofilter/mod.rs: -------------------------------------------------------------------------------- 1 | //! Cuckoo filter probabilistic data structure for membership testing and cardinality counting. 2 | //! 3 | //! # Usage 4 | //! 5 | //! This crate is [on crates.io](https://crates.io/crates/cuckoofilter) and can be 6 | //! used by adding `cuckoofilter` to the dependencies in your project's `Cargo.toml`. 7 | //! 8 | //! ```toml 9 | //! [dependencies] 10 | //! cuckoofilter = "0.3" 11 | //! ``` 12 | //! 13 | //! And this in your crate root: 14 | //! 15 | //! ```rust 16 | //! use statsrelay::cuckoofilter; 17 | //! 
``` 18 | 19 | mod bucket; 20 | mod util; 21 | 22 | use bucket::{Bucket, Fingerprint, BUCKET_SIZE, FINGERPRINT_SIZE}; 23 | use util::{get_alt_index, get_fai, FaI}; 24 | 25 | use std::cmp; 26 | use std::collections::hash_map::DefaultHasher; 27 | use std::error::Error as StdError; 28 | use std::fmt; 29 | use std::hash::{Hash, Hasher}; 30 | use std::iter::repeat; 31 | use std::marker::PhantomData; 32 | use std::mem; 33 | 34 | use rand::{Rng, SeedableRng}; 35 | #[cfg(feature = "serde_support")] 36 | use serde_derive::{Deserialize, Serialize}; 37 | 38 | /// If insertion fails, we will retry this many times. 39 | pub const MAX_REBUCKET: u32 = 10; 40 | 41 | /// The default number of buckets. 42 | pub const DEFAULT_CAPACITY: usize = (1 << 20) - 1; 43 | 44 | #[derive(Debug)] 45 | pub enum CuckooError { 46 | NotEnoughSpace, 47 | } 48 | 49 | impl fmt::Display for CuckooError { 50 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 51 | f.write_str("NotEnoughSpace") 52 | } 53 | } 54 | 55 | impl StdError for CuckooError { 56 | fn description(&self) -> &str { 57 | "Not enough space to store this item, rebucketing failed." 58 | } 59 | } 60 | 61 | /// A cuckoo filter class exposes a Bloomier filter interface, 62 | /// providing methods of add, delete, contains. 63 | /// 64 | /// # Examples 65 | /// 66 | /// ``` 67 | /// use statsrelay::cuckoofilter; 68 | /// 69 | /// let words = vec!["foo", "bar", "xylophone", "milagro"]; 70 | /// let mut cf = cuckoofilter::CuckooFilter::new(); 71 | /// 72 | /// let mut insertions = 0; 73 | /// for s in &words { 74 | /// if cf.test_and_add(s).unwrap() { 75 | /// insertions += 1; 76 | /// } 77 | /// } 78 | /// 79 | /// assert_eq!(insertions, words.len()); 80 | /// assert_eq!(cf.len(), words.len()); 81 | /// 82 | /// // Re-add the first element. 83 | /// cf.add(words[0]); 84 | /// 85 | /// assert_eq!(cf.len(), words.len() + 1); 86 | /// 87 | /// for s in &words { 88 | /// cf.delete(s); 89 | /// } 90 | /// 91 | /// assert_eq!(cf.len(), 1); 92 | /// assert!(!cf.is_empty()); 93 | /// 94 | /// cf.delete(words[0]); 95 | /// 96 | /// assert_eq!(cf.len(), 0); 97 | /// assert!(cf.is_empty()); 98 | /// 99 | /// for s in &words { 100 | /// if cf.test_and_add(s).unwrap() { 101 | /// insertions += 1; 102 | /// } 103 | /// } 104 | /// 105 | /// cf.clear(); 106 | /// 107 | /// assert!(cf.is_empty()); 108 | /// 109 | /// ``` 110 | pub struct CuckooFilter { 111 | buckets: Box<[Bucket]>, 112 | len: usize, 113 | rng: rand::rngs::SmallRng, 114 | _hasher: std::marker::PhantomData, 115 | } 116 | 117 | impl Default for CuckooFilter { 118 | fn default() -> Self { 119 | Self::new() 120 | } 121 | } 122 | 123 | impl CuckooFilter { 124 | /// Construct a CuckooFilter with default capacity and hasher. 125 | pub fn new() -> Self { 126 | Self::with_capacity(DEFAULT_CAPACITY) 127 | } 128 | } 129 | 130 | impl CuckooFilter 131 | where 132 | H: Hasher + Default, 133 | { 134 | /// Constructs a Cuckoo Filter with a given max capacity 135 | pub fn with_capacity(cap: usize) -> Self { 136 | let capacity = cmp::max(1, cap.next_power_of_two() / BUCKET_SIZE); 137 | 138 | Self { 139 | buckets: repeat(Bucket::new()) 140 | .take(capacity) 141 | .collect::>() 142 | .into_boxed_slice(), 143 | len: 0, 144 | rng: rand::rngs::SmallRng::from_entropy(), 145 | _hasher: PhantomData, 146 | } 147 | } 148 | 149 | /// Checks if `data` is in the filter. 
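/// A minimal usage sketch (mirroring the module-level example above):
///
/// ```
/// use statsrelay::cuckoofilter;
///
/// let mut cf = cuckoofilter::CuckooFilter::new();
/// cf.add(&"foo").unwrap();
/// assert!(cf.contains(&"foo"));
/// ```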
150 | pub fn contains(&self, data: &T) -> bool { 151 | let FaI { fp, i1, i2 } = get_fai::(data); 152 | let len = self.buckets.len(); 153 | self.buckets[i1 % len] 154 | .get_fingerprint_index(fp) 155 | .or_else(|| self.buckets[i2 % len].get_fingerprint_index(fp)) 156 | .is_some() 157 | } 158 | 159 | /// Adds `data` to the filter. Returns `Ok` if the insertion was successful, 160 | /// but could fail with a `NotEnoughSpace` error, especially when the filter 161 | /// is nearing its capacity. 162 | /// Note that while you can put any hashable type in the same filter, beware 163 | /// for side effects like that the same number can have diferent hashes 164 | /// depending on the type. 165 | /// So for the filter, 4711i64 isn't the same as 4711u64. 166 | /// 167 | /// **Note:** When this returns `NotEnoughSpace`, the element given was 168 | /// actually added to the filter, but some random *other* element was 169 | /// removed. This might improve in the future. 170 | pub fn add(&mut self, data: &T) -> Result<(), CuckooError> { 171 | let fai = get_fai::(data); 172 | if self.put(fai.fp, fai.i1) || self.put(fai.fp, fai.i2) { 173 | return Ok(()); 174 | } 175 | let len = self.buckets.len(); 176 | let mut i = fai.random_index(&mut self.rng); 177 | let mut fp = fai.fp; 178 | for _ in 0..MAX_REBUCKET { 179 | let other_fp; 180 | { 181 | let loc = &mut self.buckets[i % len].buffer[self.rng.gen_range(0..BUCKET_SIZE)]; 182 | other_fp = *loc; 183 | *loc = fp; 184 | i = get_alt_index::(other_fp, i); 185 | } 186 | if self.put(other_fp, i) { 187 | return Ok(()); 188 | } 189 | fp = other_fp; 190 | } 191 | // fp is dropped here, which means that the last item that was 192 | // rebucketed gets removed from the filter. 193 | // TODO: One could introduce a single-item cache for this element, 194 | // check this cache in all methods additionally to the actual filter, 195 | // and return NotEnoughSpace if that cache is already in use. 196 | // This would complicate the code, but stop random elements from 197 | // getting removed and result in nicer behaviour for the user. 198 | Err(CuckooError::NotEnoughSpace) 199 | } 200 | 201 | /// Adds `data` to the filter if it does not exist in the filter yet. 202 | /// Returns `Ok(true)` if `data` was not yet present in the filter and added 203 | /// successfully. 204 | pub fn test_and_add(&mut self, data: &T) -> Result { 205 | if self.contains(data) { 206 | Ok(false) 207 | } else { 208 | self.add(data).map(|_| true) 209 | } 210 | } 211 | 212 | /// Number of items in the filter. 213 | pub fn len(&self) -> usize { 214 | self.len 215 | } 216 | 217 | /// Exports fingerprints in all buckets, along with the filter's length for storage. 218 | /// The filter can be recovered by passing the `ExportedCuckooFilter` struct to the 219 | /// `from` method of `CuckooFilter`. 220 | pub fn export(&self) -> ExportedCuckooFilter { 221 | self.into() 222 | } 223 | 224 | /// Number of bytes the filter occupies in memory 225 | pub fn memory_usage(&self) -> usize { 226 | mem::size_of_val(self) + self.buckets.len() * mem::size_of::() 227 | } 228 | 229 | /// Check if filter is empty 230 | pub fn is_empty(&self) -> bool { 231 | self.len == 0 232 | } 233 | 234 | /// Deletes `data` from the filter. Returns true if `data` existed in the 235 | /// filter before. 236 | pub fn delete(&mut self, data: &T) -> bool { 237 | let FaI { fp, i1, i2 } = get_fai::(data); 238 | self.remove(fp, i1) || self.remove(fp, i2) 239 | } 240 | 241 | /// Empty all the buckets in a filter and reset the number of items. 
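/// Afterwards `len()` returns 0 and `is_empty()` is true; calling it on an
/// already-empty filter is a no-op.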
242 | pub fn clear(&mut self) { 243 | if self.is_empty() { 244 | return; 245 | } 246 | 247 | for bucket in self.buckets.iter_mut() { 248 | bucket.clear(); 249 | } 250 | self.len = 0; 251 | } 252 | 253 | /// Extracts fingerprint values from all buckets, used for exporting the filters data. 254 | fn values(&self) -> Vec { 255 | self.buckets 256 | .iter() 257 | .flat_map(|b| b.get_fingerprint_data().into_iter()) 258 | .collect() 259 | } 260 | 261 | /// Removes the item with the given fingerprint from the bucket indexed by i. 262 | fn remove(&mut self, fp: Fingerprint, i: usize) -> bool { 263 | let len = self.buckets.len(); 264 | if self.buckets[i % len].delete(fp) { 265 | self.len -= 1; 266 | true 267 | } else { 268 | false 269 | } 270 | } 271 | 272 | fn put(&mut self, fp: Fingerprint, i: usize) -> bool { 273 | let len = self.buckets.len(); 274 | if self.buckets[i % len].insert(fp) { 275 | self.len += 1; 276 | true 277 | } else { 278 | false 279 | } 280 | } 281 | } 282 | 283 | /// A minimal representation of the CuckooFilter which can be transfered or stored, then recovered at a later stage. 284 | #[derive(Debug)] 285 | #[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))] 286 | pub struct ExportedCuckooFilter { 287 | #[cfg_attr(feature = "serde_support", serde(with = "serde_bytes"))] 288 | pub values: Vec, 289 | pub length: usize, 290 | } 291 | 292 | impl From for CuckooFilter { 293 | /// Converts a simplified representation of a filter used for export to a 294 | /// fully functioning version. 295 | /// 296 | /// # Contents 297 | /// 298 | /// * `values` - A serialized version of the `CuckooFilter`'s memory, where the 299 | /// fingerprints in each bucket are chained one after another, then in turn all 300 | /// buckets are chained together. 301 | /// * `length` - The number of valid fingerprints inside the `CuckooFilter`. 302 | /// This value is used as a time saving method, otherwise all fingerprints 303 | /// would need to be checked for equivalence against the null pattern. 304 | fn from(exported: ExportedCuckooFilter) -> Self { 305 | // Assumes that the `BUCKET_SIZE` and `FINGERPRINT_SIZE` constants do not change. 306 | Self { 307 | buckets: exported 308 | .values 309 | .chunks(BUCKET_SIZE * FINGERPRINT_SIZE) 310 | .map(Bucket::from) 311 | .collect::>() 312 | .into_boxed_slice(), 313 | len: exported.length, 314 | rng: rand::rngs::SmallRng::from_entropy(), 315 | _hasher: PhantomData, 316 | } 317 | } 318 | } 319 | 320 | impl From<&CuckooFilter> for ExportedCuckooFilter 321 | where 322 | H: Hasher + Default, 323 | { 324 | /// Converts a `CuckooFilter` into a simplified version which can be serialized and stored 325 | /// for later use. 326 | fn from(cuckoo: &CuckooFilter) -> Self { 327 | Self { 328 | values: cuckoo.values(), 329 | length: cuckoo.len(), 330 | } 331 | } 332 | } 333 | -------------------------------------------------------------------------------- /statsrelay/src/cuckoofilter/util.rs: -------------------------------------------------------------------------------- 1 | use super::bucket::{Fingerprint, FINGERPRINT_SIZE}; 2 | 3 | use std::hash::{Hash, Hasher}; 4 | 5 | use byteorder::{BigEndian, WriteBytesExt}; 6 | 7 | // A struct combining *F*ingerprint *a*nd *I*ndexes, 8 | // to have a return type with named fields 9 | // instead of a tuple with unnamed fields. 
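// The two candidate bucket indexes are related by `i2 == i1 ^ hash(fp)` (see
// `get_alt_index` below). Since XOR is self-inverse, applying `get_alt_index`
// to (fp, i2) yields i1 again; this is how `CuckooFilter::add` recovers the
// alternate bucket of a displaced fingerprint without re-hashing the original
// item. The `test_fp_and_index` test at the bottom of this file checks exactly
// that round trip.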
10 | pub struct FaI { 11 | pub fp: Fingerprint, 12 | pub i1: usize, 13 | pub i2: usize, 14 | } 15 | 16 | fn get_hash(data: &T) -> (u32, u32) { 17 | let mut hasher = ::default(); 18 | data.hash(&mut hasher); 19 | let result = hasher.finish(); 20 | 21 | // split 64bit hash value in the upper and the lower 32bit parts, 22 | // one used for the fingerprint, the other used for the indexes. 23 | ((result >> 32) as u32, result as u32) 24 | } 25 | 26 | pub fn get_alt_index(fp: Fingerprint, i: usize) -> usize { 27 | let (_, index_hash) = get_hash::<_, H>(&fp.data); 28 | let alt_i = index_hash as usize; 29 | (i ^ alt_i) as usize 30 | } 31 | 32 | impl FaI { 33 | fn from_data(data: &T) -> Self { 34 | let (fp_hash, index_hash) = get_hash::<_, H>(data); 35 | 36 | let mut fp_hash_arr = [0; FINGERPRINT_SIZE]; 37 | let _ = (&mut fp_hash_arr[..]).write_u32::(fp_hash); 38 | let mut valid_fp_hash: [u8; FINGERPRINT_SIZE] = [0; FINGERPRINT_SIZE]; 39 | let mut n = 0; 40 | let fp; 41 | 42 | // increment every byte of the hash until we find one that is a valid fingerprint 43 | loop { 44 | for i in 0..FINGERPRINT_SIZE { 45 | valid_fp_hash[i] = fp_hash_arr[i] + n; 46 | } 47 | 48 | if let Some(val) = Fingerprint::from_data(valid_fp_hash) { 49 | fp = val; 50 | break; 51 | } 52 | n += 1; 53 | } 54 | 55 | let i1 = index_hash as usize; 56 | let i2 = get_alt_index::(fp, i1); 57 | Self { fp, i1, i2 } 58 | } 59 | 60 | pub fn random_index(&self, r: &mut R) -> usize { 61 | if r.gen() { 62 | self.i1 63 | } else { 64 | self.i2 65 | } 66 | } 67 | } 68 | 69 | pub fn get_fai(data: &T) -> FaI { 70 | FaI::from_data::<_, H>(data) 71 | } 72 | 73 | #[cfg(test)] 74 | mod tests { 75 | use super::*; 76 | 77 | #[test] 78 | fn test_fp_and_index() { 79 | use std::collections::hash_map::DefaultHasher; 80 | let data = "seif"; 81 | let fai = get_fai::<_, DefaultHasher>(data); 82 | let FaI { fp, i1, i2 } = fai; 83 | let i11 = get_alt_index::(fp, i2); 84 | assert_eq!(i11, i1); 85 | 86 | let i22 = get_alt_index::(fp, i11); 87 | assert_eq!(i22, i2); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /statsrelay/src/discovery.rs: -------------------------------------------------------------------------------- 1 | use crate::config::{ 2 | Discovery, DiscoverySource, DiscoveryTransform, PathDiscoverySource, S3DiscoverySource, 3 | }; 4 | 5 | use std::sync::Arc; 6 | use std::time::Duration; 7 | use std::{fs::File, ops::Add}; 8 | use std::{io::BufReader, pin::Pin}; 9 | 10 | use async_stream::stream; 11 | use dashmap::DashMap; 12 | use futures::{stream::Stream, StreamExt}; 13 | use log::warn; 14 | use rusoto_s3::S3; 15 | use serde::{Deserialize, Serialize}; 16 | use tokio::io::AsyncReadExt; 17 | use tokio::time::Instant; 18 | use tokio_stream::StreamMap; 19 | 20 | // Transformer is a set of transformations to apply to a discovery set, for 21 | // example formatting output or repeating elements 22 | trait Transformer { 23 | fn transform(&self, input: &Update) -> Option; 24 | } 25 | 26 | /// Convert an update into another update based on a format string 27 | fn transform_format(format: &str, input: &Update) -> Option { 28 | if !format.contains("{}") { 29 | return None; 30 | } 31 | Some(Update { 32 | hosts: input 33 | .hosts 34 | .iter() 35 | .map(|input| String::from(format).replace("{}", input)) 36 | .collect(), 37 | }) 38 | } 39 | 40 | /// A transformer which repeats each element count times, e.g. 
a,b count =2 would produce a,a,b,b 41 | fn transform_repeat(count: u32, input: &Update) -> Option { 42 | match count { 43 | 0 => None, 44 | 1 => Some(input.clone()), 45 | n => Some(Update { 46 | hosts: input 47 | .hosts 48 | .iter() 49 | .flat_map(|input| std::iter::repeat(input.clone()).take(n as usize)) 50 | .collect(), 51 | }), 52 | } 53 | } 54 | 55 | impl Transformer for DiscoveryTransform { 56 | fn transform(&self, input: &Update) -> Option { 57 | match self { 58 | DiscoveryTransform::Format { pattern } => transform_format(pattern, input), 59 | DiscoveryTransform::Repeat { count } => transform_repeat(*count, input), 60 | } 61 | } 62 | } 63 | 64 | #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)] 65 | pub struct Update { 66 | hosts: Vec, 67 | } 68 | 69 | impl Update { 70 | pub fn sources(&self) -> &Vec { 71 | &self.hosts 72 | } 73 | } 74 | 75 | #[derive(Debug, thiserror::Error)] 76 | pub enum Error { 77 | #[error("reading a discovery source had no data")] 78 | EmptyObjectError, 79 | } 80 | 81 | async fn poll_s3_source(config: S3DiscoverySource) -> anyhow::Result { 82 | let region = rusoto_core::Region::default(); 83 | let http_client = rusoto_core::HttpClient::new()?; 84 | let provider = rusoto_credential::AutoRefreshingProvider::new( 85 | rusoto_sts::WebIdentityProvider::from_k8s_env())?; 86 | let region = rusoto_core::Region::default(); 87 | let s3 = rusoto_s3::S3Client::new_with(http_client, provider, region); 88 | let req = rusoto_s3::GetObjectRequest { 89 | bucket: config.bucket.clone(), 90 | key: config.key.clone(), 91 | ..Default::default() 92 | }; 93 | let resp = s3.get_object(req).await?; 94 | let mut buffer = Vec::with_capacity(resp.content_length.unwrap_or(0_i64) as usize); 95 | let mut update = match resp.body { 96 | Some(contents) => { 97 | contents.into_async_read().read_to_end(&mut buffer).await?; 98 | let update: Update = serde_json::from_slice(buffer.as_ref())?; 99 | update 100 | } 101 | None => { 102 | warn!("no cluster state located at {:?}", config.key); 103 | return Err(Error::EmptyObjectError.into()); 104 | } 105 | }; 106 | 107 | for trans in config.transforms.unwrap_or_default().iter() { 108 | if let Some(new_update) = trans.transform(&update) { 109 | update = new_update; 110 | } 111 | } 112 | Ok(update) 113 | } 114 | 115 | async fn poll_file_source(config: PathDiscoverySource, path: String) -> anyhow::Result { 116 | let result = tokio::task::spawn_blocking(move || { 117 | let file = File::open(path)?; 118 | let reader = BufReader::new(file); 119 | let mut update: Update = serde_json::from_reader(reader)?; 120 | 121 | for trans in config.transforms.unwrap_or_default().iter() { 122 | if let Some(new_update) = trans.transform(&update) { 123 | update = new_update; 124 | } 125 | } 126 | Ok(update) 127 | }) 128 | .await?; 129 | result 130 | } 131 | 132 | /// A generic stream which takes a callable async function taking an 133 | /// update (or lack thereof), polling at the defined interval, emitting the 134 | /// output when changed as a stream. 135 | fn polled_stream(config: T, interval: u64, callable: C) -> impl Stream 136 | where 137 | T: Clone + Send + Sync, 138 | C: Fn(T) -> Pin> + Send>>, 139 | { 140 | let mut last_update = Update::default(); 141 | let duration = Duration::from_secs(interval as u64); 142 | let start = Instant::now().add(duration); 143 | stream! 
{ 144 | 145 | let mut ticker = tokio::time::interval_at(start, duration); 146 | loop { 147 | let new_update = match callable(config.clone()).await { 148 | Err(e) => { 149 | warn!("unable to fetch discovery source due to error {:?}", e); 150 | ticker.tick().await; 151 | continue; 152 | }, 153 | Ok(update) => update, 154 | }; 155 | if new_update != last_update { 156 | yield new_update.clone(); 157 | } 158 | last_update = new_update; 159 | ticker.tick().await; 160 | } 161 | } 162 | } 163 | 164 | pub fn as_stream(config: &Discovery) -> impl Stream { 165 | let mut streams: StreamMap + Send>>> = 166 | StreamMap::new(); 167 | 168 | for (name, source) in config.sources.iter() { 169 | match source { 170 | DiscoverySource::S3(source) => { 171 | let ns = Box::pin(polled_stream( 172 | source.clone(), 173 | source.interval as u64, 174 | move |s| Box::pin(poll_s3_source(s)), 175 | )); 176 | //let ns = Box::pin(s3_stream(source.clone())); 177 | streams.insert(name.clone(), ns); 178 | } 179 | DiscoverySource::StaticFile(source) => { 180 | let cs = source.clone(); 181 | let ns = Box::pin(polled_stream( 182 | source.path.clone(), 183 | source.interval as u64, 184 | move |s| Box::pin(poll_file_source(cs.clone(), s)), 185 | )); 186 | //let ns = Box::pin(static_file_stream(source.clone())); 187 | streams.insert(name.clone(), ns); 188 | } 189 | } 190 | } 191 | streams 192 | } 193 | 194 | #[derive(Clone)] 195 | pub struct Cache { 196 | cache: Arc>, 197 | } 198 | 199 | impl Cache { 200 | pub fn new() -> Self { 201 | Cache { 202 | cache: Arc::new(DashMap::new()), 203 | } 204 | } 205 | 206 | pub fn store(&self, event: &(String, Update)) { 207 | self.cache.insert(event.0.clone(), event.1.clone()); 208 | } 209 | 210 | pub fn get(&self, key: &str) -> Option { 211 | self.cache.get(key).map(|s| s.clone()) 212 | } 213 | } 214 | 215 | impl Default for Cache { 216 | fn default() -> Self { 217 | Cache::new() 218 | } 219 | } 220 | 221 | pub fn reflector(cache: Cache, stream: S) -> impl Stream 222 | where 223 | S: Stream, 224 | { 225 | stream.inspect(move |event| cache.store(event)) 226 | } 227 | 228 | #[cfg(test)] 229 | pub mod tests { 230 | use crate::config::DiscoveryTransform; 231 | 232 | use super::{Transformer, Update}; 233 | 234 | #[test] 235 | fn format() { 236 | let o1 = Update { 237 | hosts: vec!["a", "b"].iter().map(|s| (*s).into()).collect(), 238 | }; 239 | let transformer = DiscoveryTransform::Format { 240 | pattern: "{}hello".into(), 241 | }; 242 | let f = transformer.transform(&o1).unwrap(); 243 | assert_eq!(f.hosts[0], "ahello"); 244 | assert_eq!(f.hosts[1], "bhello"); 245 | 246 | let bad_transformer = DiscoveryTransform::Format { 247 | pattern: "foo".into(), 248 | }; 249 | 250 | assert!(bad_transformer.transform(&o1).is_none()); 251 | } 252 | 253 | #[test] 254 | fn repeat() { 255 | let o1 = Update { 256 | hosts: vec!["a", "b"].iter().map(|s| (*s).into()).collect(), 257 | }; 258 | let transformer = DiscoveryTransform::Repeat { count: 4 }; 259 | let f = transformer.transform(&o1).unwrap(); 260 | assert_eq!(f.hosts, vec!["a", "a", "a", "a", "b", "b", "b", "b"]); 261 | 262 | let bad_transformer = DiscoveryTransform::Repeat { count: 0 }; 263 | 264 | assert!(bad_transformer.transform(&o1).is_none()); 265 | } 266 | } 267 | -------------------------------------------------------------------------------- /statsrelay/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod admin; 2 | pub mod backend; 3 | pub mod backend_client; 4 | pub mod backends; 5 | pub mod config; 6 
| pub mod cuckoofilter; 7 | pub mod discovery; 8 | pub mod processors; 9 | pub mod shard; 10 | pub mod stats; 11 | pub mod statsd_proto; 12 | pub mod statsd_server; 13 | pub mod built_info { 14 | // The file has been placed there by the build script. 15 | include!(concat!(env!("OUT_DIR"), "/built.rs")); 16 | } 17 | -------------------------------------------------------------------------------- /statsrelay/src/processors/cardinality.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryInto; 2 | use std::hash::{Hash, Hasher}; 3 | use std::time::{Duration, SystemTime}; 4 | 5 | use super::super::config; 6 | use super::super::statsd_proto::Event; 7 | use super::{Output, Processor}; 8 | use crate::stats::{Counter, Gauge, Scope}; 9 | use crate::{ 10 | backends::Backends, 11 | statsd_proto::{Owned, Parsed}, 12 | }; 13 | 14 | use crate::cuckoofilter::{self, CuckooFilter}; 15 | use ahash::AHasher; 16 | use parking_lot::Mutex; 17 | 18 | use log::warn; 19 | 20 | struct TimeBoundedCuckoo 21 | where 22 | H: Hasher + Default, 23 | { 24 | filter: CuckooFilter, 25 | valid_until: SystemTime, 26 | } 27 | 28 | impl TimeBoundedCuckoo 29 | where 30 | H: Hasher + Default, 31 | { 32 | fn new(valid_until: SystemTime) -> Self { 33 | TimeBoundedCuckoo { 34 | filter: CuckooFilter::with_capacity((1 << 22) - 1), 35 | valid_until, 36 | } 37 | } 38 | } 39 | 40 | struct MultiCuckoo 41 | where 42 | H: Hasher + Default, 43 | { 44 | buckets: usize, 45 | window: Duration, 46 | filters: Vec>, 47 | } 48 | 49 | impl MultiCuckoo 50 | where 51 | H: Hasher + Default, 52 | { 53 | fn new(buckets: usize, window: &Duration) -> Self { 54 | assert!(buckets > 0); 55 | let now = SystemTime::now(); 56 | let cuckoos: Vec<_> = (1..(buckets + 1)) 57 | .map(|bucket| TimeBoundedCuckoo::new(now + (*window * bucket as u32))) 58 | .collect(); 59 | MultiCuckoo { 60 | buckets, 61 | window: *window, 62 | filters: cuckoos, 63 | } 64 | } 65 | 66 | fn len(&self) -> usize { 67 | self.filters[0].filter.len() 68 | } 69 | 70 | fn contains(&self, data: &T) -> bool { 71 | self.filters[0].filter.contains(data) 72 | } 73 | 74 | fn add(&mut self, data: &T) -> Result<(), cuckoofilter::CuckooError> { 75 | let results: Result, _> = self 76 | .filters 77 | .iter_mut() 78 | .map(|filter| filter.filter.test_and_add(data)) 79 | .collect(); 80 | results.map(|_| ()) 81 | } 82 | 83 | fn rotate(&mut self, with_time: SystemTime) { 84 | if self.filters[0] 85 | .valid_until 86 | .duration_since(with_time) 87 | .is_err() 88 | { 89 | // duration_since returns err if the given is later then the valid_until time, aka expired 90 | self.filters.remove(0); 91 | self.filters.push(TimeBoundedCuckoo::new( 92 | with_time + (self.window * (self.buckets + 1) as u32), 93 | )); 94 | } 95 | } 96 | } 97 | 98 | pub struct Cardinality { 99 | route: Vec, 100 | filter: Mutex>, 101 | limit: usize, 102 | counter_flagged_metrics: Counter, 103 | gauge_metric_hwm: Gauge, 104 | } 105 | 106 | impl Cardinality { 107 | pub fn new(scope: Scope, from_config: &config::processor::Cardinality) -> Self { 108 | let window = Duration::from_secs(from_config.rotate_after_seconds); 109 | // Record a limit gauge for visibility 110 | let limit_gauge = scope.gauge("limit").unwrap(); 111 | limit_gauge.set(from_config.size_limit as f64); 112 | Cardinality { 113 | route: from_config.route.clone(), 114 | filter: Mutex::new(MultiCuckoo::new(from_config.buckets, &window)), 115 | limit: from_config.size_limit as usize, 116 | counter_flagged_metrics: 
scope.counter("flagged_metrics").unwrap(), 117 | gauge_metric_hwm: scope.gauge("count_hwm").unwrap(), 118 | } 119 | } 120 | 121 | fn rotate(&self) { 122 | self.filter.lock().rotate(SystemTime::now()) 123 | } 124 | } 125 | 126 | impl Processor for Cardinality { 127 | fn provide_statsd(&self, sample: &Event) -> Option { 128 | let mut filter = self.filter.lock(); 129 | let contains = filter.contains(sample); 130 | let len = filter.len(); 131 | self.gauge_metric_hwm.set(len as f64); 132 | 133 | if !contains && len > self.limit { 134 | if (self.counter_flagged_metrics.get() as u64) % 1000 == 0 { 135 | // Enforce parsing of the metric to give a clean debug log 136 | let owned: Owned = sample.try_into().ok()?; 137 | warn!("metric flagged for cardinality limits: {}", owned.id()); 138 | } 139 | self.counter_flagged_metrics.inc(); 140 | return None; 141 | } 142 | let _ = filter.add(sample); 143 | Some(Output { 144 | route: self.route.as_ref(), 145 | new_events: None, 146 | }) 147 | } 148 | 149 | fn tick(&self, _time: std::time::SystemTime, _backends: &Backends) { 150 | self.rotate(); 151 | } 152 | } 153 | 154 | #[cfg(test)] 155 | pub mod test { 156 | use std::vec; 157 | 158 | use crate::statsd_proto::{Id, Owned, Type}; 159 | 160 | use super::*; 161 | 162 | #[test] 163 | fn cuckoo_simple_contains() { 164 | let a = "a".to_string(); 165 | let b = "b".to_string(); 166 | 167 | let mut mc: MultiCuckoo = MultiCuckoo::new(2, &Duration::from_secs(60)); 168 | 169 | mc.add(&a).unwrap(); 170 | assert!(!mc.contains(&b)); 171 | assert!(mc.contains(&a)); 172 | mc.add(&b).unwrap(); 173 | assert!(mc.contains(&b)); 174 | } 175 | 176 | #[test] 177 | fn cuckoo_simple_rotate() { 178 | let a = "a".to_string(); 179 | let b = "b".to_string(); 180 | 181 | let now = SystemTime::now(); 182 | let mut mc: MultiCuckoo = MultiCuckoo::new(2, &Duration::from_secs(60)); 183 | 184 | mc.add(&a).unwrap(); 185 | assert!(!mc.contains(&b)); 186 | assert!(mc.contains(&a)); 187 | mc.add(&b).unwrap(); 188 | assert!(mc.contains(&b)); 189 | // Rotate once, add only a 190 | mc.rotate(now + Duration::from_secs(61)); 191 | assert!(mc.contains(&a)); 192 | assert!(mc.contains(&b)); 193 | assert!(mc.len() == 2); 194 | mc.add(&a).unwrap(); 195 | // Rotate again, b should drop out 196 | mc.rotate(now + Duration::from_secs(122)); 197 | assert!(mc.contains(&a)); 198 | assert!(!mc.contains(&b)); 199 | assert!(mc.len() == 1); 200 | } 201 | 202 | #[test] 203 | fn test_cardinality_limit() { 204 | let names: Vec = (0..400) 205 | .map(|val| { 206 | let id = Id { 207 | name: format!("metric.{}", val as u32).as_bytes().to_vec(), 208 | mtype: Type::Counter, 209 | tags: vec![], 210 | }; 211 | Event::Parsed(Owned::new(id, 1.0, None)) 212 | }) 213 | .collect(); 214 | 215 | let config = config::processor::Cardinality { 216 | size_limit: 100_usize, 217 | rotate_after_seconds: 10, 218 | buckets: 2, 219 | route: vec![], 220 | }; 221 | let scope = crate::stats::Collector::default().scope("test"); 222 | let filter = Cardinality::new(scope, &config); 223 | for name in &names[0..101] { 224 | assert!(filter.provide_statsd(name).is_some()); 225 | } 226 | let len = filter.filter.lock().len(); 227 | assert!(len == 101, "length isn't as expected {}", len); 228 | for name in &names[101..] 
{ 229 | assert!( 230 | filter.provide_statsd(name).is_none(), 231 | "sample {:?} was allowed", 232 | name 233 | ); 234 | } 235 | assert!( 236 | filter.gauge_metric_hwm.get() == 101_f64, 237 | "metric high water mark was set, hwm {}", 238 | filter.gauge_metric_hwm.get() 239 | ); 240 | assert!( 241 | filter.counter_flagged_metrics.get() > 298_f64, 242 | "flagged metric counter was increased, count {}", 243 | filter.counter_flagged_metrics.get() 244 | ); 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /statsrelay/src/processors/mod.rs: -------------------------------------------------------------------------------- 1 | use super::backends::Backends; 2 | use crate::config; 3 | use crate::statsd_proto::Event; 4 | use smallvec::SmallVec; 5 | 6 | pub mod cardinality; 7 | pub mod regex_filter; 8 | pub mod sampler; 9 | pub mod tag; 10 | 11 | pub struct Output<'a> { 12 | /// Lists of new events returned if the processor has modified the 13 | /// sample in any way. If this is none but a route is set, downstream 14 | /// processors will be called with the original reference to the Sample 15 | pub new_events: Option>, 16 | pub route: &'a [config::Route], 17 | } 18 | pub trait Processor { 19 | /// Tick is designed for processors to do any internal housekeeping. A copy 20 | /// of the called time is provided for mocking, and a reference to the 21 | /// Backends structure is provided to re-inject messages into processor 22 | /// framework if desired. 23 | fn tick(&self, _time: std::time::SystemTime, _backends: &Backends) {} 24 | 25 | /// Provides a signal that the processor should cease operating, 26 | /// specifically designed to allow processors which do buffering to flush 27 | /// internal state out to backends. The Drop trait should be used to handle 28 | /// actual cleanup of resources. 
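// Illustrative sketch of the trait contract (the `PassThrough` type is
// hypothetical, not part of this crate): a processor that forwards every event
// unchanged would leave `new_events` as `None`, so downstream processors keep
// working with the original sample reference, as described on `Output` above.
//
//     struct PassThrough {
//         route: Vec<config::Route>,
//     }
//
//     impl Processor for PassThrough {
//         fn provide_statsd(&self, _sample: &Event) -> Option<Output> {
//             Some(Output {
//                 new_events: None,
//                 route: self.route.as_ref(),
//             })
//         }
//     }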
29 | fn flush(&self, _backends: &Backends) {} 30 | 31 | fn provide_statsd(&self, sample: &Event) -> Option; 32 | } 33 | -------------------------------------------------------------------------------- /statsrelay/src/processors/regex_filter.rs: -------------------------------------------------------------------------------- 1 | use regex::RegexSet; 2 | 3 | use super::{Output, Processor}; 4 | use crate::stats; 5 | use crate::{config::processor, statsd_proto::Event}; 6 | use crate::{config::Route, statsd_proto::Parsed}; 7 | 8 | pub struct RegexFilter { 9 | allow: Option, 10 | remove: Option, 11 | route: Vec, 12 | 13 | counter_remove: stats::Counter, 14 | } 15 | 16 | impl RegexFilter { 17 | pub fn new( 18 | scope: stats::Scope, 19 | from_config: &processor::RegexFilter, 20 | ) -> Result { 21 | let allow = from_config.allow.as_ref().map(RegexSet::new).transpose()?; 22 | let remove = from_config.remove.as_ref().map(RegexSet::new).transpose()?; 23 | Ok(RegexFilter { 24 | allow, 25 | remove, 26 | route: from_config.route.clone(), 27 | counter_remove: scope.counter("removed").unwrap(), 28 | }) 29 | } 30 | } 31 | 32 | impl Processor for RegexFilter { 33 | fn provide_statsd(&self, event: &Event) -> Option { 34 | let name = std::str::from_utf8(match event { 35 | Event::Parsed(parsed) => parsed.id().name.as_ref(), 36 | Event::Pdu(pdu) => pdu.name(), 37 | }) 38 | .ok()?; 39 | if let Some(allow) = &self.allow { 40 | if !allow.is_match(name) { 41 | self.counter_remove.inc(); 42 | return None; 43 | } 44 | } 45 | if let Some(remove) = &self.remove { 46 | if remove.is_match(name) { 47 | self.counter_remove.inc(); 48 | return None; 49 | } 50 | } 51 | Some(Output { 52 | new_events: None, 53 | route: self.route.as_ref(), 54 | }) 55 | } 56 | } 57 | 58 | #[cfg(test)] 59 | pub mod test { 60 | 61 | use super::*; 62 | 63 | #[test] 64 | fn build_filter() { 65 | let c = processor::RegexFilter { 66 | route: vec![], 67 | remove: Some(vec![r"^hello.*".to_owned(), r"^goodbye.*".to_owned()]), 68 | allow: None, 69 | }; 70 | let sink = stats::Collector::default(); 71 | let scope = sink.scope("prefix"); 72 | let filter = RegexFilter::new(scope, &c).unwrap(); 73 | 74 | let event1 = Event::Pdu( 75 | crate::statsd_proto::Pdu::parse(bytes::Bytes::from_static(b"hello.world:c|1")).unwrap(), 76 | ); 77 | let event2 = Event::Pdu( 78 | crate::statsd_proto::Pdu::parse(bytes::Bytes::from_static(b"goodbye.world:c|1")) 79 | .unwrap(), 80 | ); 81 | let event3 = Event::Pdu( 82 | crate::statsd_proto::Pdu::parse(bytes::Bytes::from_static(b"pineapples:c|1")).unwrap(), 83 | ); 84 | 85 | assert!(filter.provide_statsd(&event1).is_none(), "should remove"); 86 | assert!(filter.provide_statsd(&event2).is_none(), "should remove"); 87 | assert!( 88 | filter.provide_statsd(&event3).is_some(), 89 | "should not remove" 90 | ); 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /statsrelay/src/processors/sampler.rs: -------------------------------------------------------------------------------- 1 | use super::Output; 2 | use crate::backends::Backends; 3 | use crate::processors; 4 | use crate::statsd_proto::Id; 5 | use crate::statsd_proto::{Event, Owned, Type}; 6 | use crate::{config, statsd_proto::Parsed}; 7 | 8 | use ahash::RandomState; 9 | use parking_lot::Mutex; 10 | use std::cell::RefCell; 11 | use std::time::SystemTime; 12 | use thiserror::Error; 13 | 14 | use std::collections::HashMap; 15 | use std::convert::TryInto; 16 | 17 | const DEFAULT_RESERVOIR: u32 = 100; 18 | 19 | fn scale(value: f64, 
sample_rate: Option) -> (f64, f64) { 20 | match sample_rate { 21 | None => (value, 1_f64), 22 | Some(rate) => { 23 | let scale = 1_f64 / rate; 24 | if scale > 0_f64 && scale <= 1_f64 { 25 | (value * scale, scale) 26 | } else { 27 | (value, 1_f64) 28 | } 29 | } 30 | } 31 | } 32 | 33 | #[derive(Error, Debug)] 34 | pub enum Error { 35 | #[error("invalid sampler configuration")] 36 | InvalidConfig, 37 | } 38 | 39 | #[derive(Debug, Default)] 40 | struct Counter { 41 | value: f64, 42 | samples: f64, 43 | } 44 | 45 | impl Counter { 46 | fn to_event(&self, id: &Id) -> Event { 47 | let value = self.value / self.samples; 48 | let sample_rate = 1_f64 / self.samples; 49 | Event::Parsed(Owned::new(id.clone(), value, Some(sample_rate))) 50 | } 51 | } 52 | 53 | #[derive(Debug)] 54 | struct Timer { 55 | values: Vec, 56 | filled_count: u32, 57 | reservoir_size: u32, 58 | count: f64, 59 | sum: f64, 60 | } 61 | 62 | impl Timer { 63 | fn new(reservoir_size: u32) -> Self { 64 | Timer { 65 | values: Vec::with_capacity(reservoir_size as usize), 66 | filled_count: 0, 67 | reservoir_size, 68 | count: 0_f64, 69 | sum: 0_f64, 70 | } 71 | } 72 | 73 | fn add(&mut self, value: f64, sample_rate: Option) { 74 | // Do an initial fill if we haven't filled the full reservoir 75 | if self.values.len() < self.reservoir_size as usize { 76 | self.values.push(value); 77 | } else { 78 | match fastrand::u32(..) % self.filled_count { 79 | idx if idx < self.reservoir_size => self.values[idx as usize] = value, 80 | _ => (), 81 | } 82 | } 83 | let (sum, count) = scale(value, sample_rate); 84 | // Keep track of a sample rate scaled count independently from the 85 | // reservoir sample fill 86 | self.count += count; 87 | self.sum += sum; 88 | self.filled_count += 1; 89 | } 90 | } 91 | 92 | #[derive(Debug, Default)] 93 | struct Gauge { 94 | value: f64, 95 | } 96 | 97 | impl Gauge { 98 | fn to_event(&self, id: &Id) -> Event { 99 | Event::Parsed(Owned::new(id.clone(), self.value, None)) 100 | } 101 | } 102 | 103 | #[derive(Debug)] 104 | pub struct Sampler { 105 | config: config::processor::Sampler, 106 | counters: Mutex>>, 107 | timers: Mutex>>, 108 | gauges: Mutex>>, 109 | 110 | last_flush: Mutex>, 111 | 112 | route_to: Vec, 113 | } 114 | 115 | impl Sampler { 116 | pub fn new(config: &config::processor::Sampler) -> Result { 117 | let counters: RefCell> = RefCell::new(HashMap::default()); 118 | let timers: RefCell> = RefCell::new(HashMap::default()); 119 | let gauges: RefCell> = RefCell::new(HashMap::default()); 120 | Ok(Sampler { 121 | config: config.clone(), 122 | counters: Mutex::new(counters), 123 | timers: Mutex::new(timers), 124 | gauges: Mutex::new(gauges), 125 | route_to: config.route.clone(), 126 | last_flush: Mutex::new(RefCell::new(std::time::SystemTime::now())), 127 | }) 128 | } 129 | 130 | fn record_timer(&self, owned: &Owned) { 131 | let lock = self.timers.lock(); 132 | let mut hm = lock.borrow_mut(); 133 | 134 | match hm.get_mut(owned.id()) { 135 | Some(v) => { 136 | v.add(owned.value(), owned.sample_rate()); 137 | } 138 | None => { 139 | let mut timer = Timer::new( 140 | self.config 141 | .timer_reservoir_size 142 | .unwrap_or(DEFAULT_RESERVOIR), 143 | ); 144 | timer.add(owned.value(), owned.sample_rate()); 145 | hm.insert(owned.id().clone(), timer); 146 | } 147 | } 148 | } 149 | 150 | fn record_gauge(&self, owned: &Owned) { 151 | let lock = self.gauges.lock(); 152 | let mut hm = lock.borrow_mut(); 153 | // Note: Using the entry API would make logical sense to avoid 154 | // re-hashing the same Id on insert, however it 
costs more to 155 | // clone the Id as the entry API does not allow for trait Clone 156 | // key references and supporting lazy-cloning. 157 | match hm.get_mut(owned.id()) { 158 | Some(v) => v.value = owned.value(), 159 | None => { 160 | hm.insert( 161 | owned.id().clone(), 162 | Gauge { 163 | value: owned.value(), 164 | }, 165 | ); 166 | } 167 | }; 168 | } 169 | 170 | fn record_counter(&self, owned: &Owned) { 171 | // Adjust values based on sample rate. In the end, emission will 172 | // re-scale everything back to the sample rate. 173 | let (scaled, counts) = scale(owned.value(), owned.sample_rate()); 174 | 175 | let lock = self.counters.lock(); 176 | let mut hm = lock.borrow_mut(); 177 | 178 | match hm.get_mut(owned.id()) { 179 | Some(v) => { 180 | v.value += scaled; 181 | v.samples += counts; 182 | } 183 | None => { 184 | hm.insert( 185 | owned.id().clone(), 186 | Counter { 187 | value: scaled, 188 | samples: counts, 189 | }, 190 | ); 191 | } 192 | } 193 | } 194 | 195 | fn handle_flush(&self, backends: &Backends) { 196 | let mut gauges = self.gauges.lock().replace(HashMap::default()); 197 | for (id, gauge) in gauges.drain() { 198 | let pdu = gauge.to_event(&id); 199 | backends.provide_statsd(&pdu, self.route_to.as_ref()) 200 | } 201 | 202 | let mut counters = self.counters.lock().replace(HashMap::default()); 203 | for (id, counter) in counters.drain() { 204 | let pdu = counter.to_event(&id); 205 | backends.provide_statsd(&pdu, self.route_to.as_ref()); 206 | } 207 | 208 | let mut timers = self.timers.lock().replace(HashMap::default()); 209 | for (id, timer) in timers.drain() { 210 | let sample_rate = timer.values.len() as f64 / timer.count; 211 | for value in timer.values { 212 | let pdu = Event::Parsed(Owned::new(id.clone(), value, Some(sample_rate))); 213 | backends.provide_statsd(&pdu, self.route_to.as_ref()); 214 | } 215 | } 216 | } 217 | 218 | fn check_tick_passed(&self, earlier: SystemTime, time: SystemTime) -> bool { 219 | match time.duration_since(earlier) { 220 | Err(_) => false, 221 | Ok(duration) if duration.as_secs() < self.config.window as u64 => false, 222 | Ok(_) => true, 223 | } 224 | } 225 | } 226 | 227 | impl processors::Processor for Sampler { 228 | fn provide_statsd(&self, sample: &Event) -> Option { 229 | let owned: Result = sample.try_into(); 230 | match owned { 231 | Err(_) => None, 232 | Ok(owned) if owned.metric_type() == &Type::Timer => { 233 | self.record_timer(&owned); 234 | None 235 | } 236 | Ok(owned) if owned.metric_type() == &Type::Counter => { 237 | self.record_counter(&owned); 238 | None 239 | } 240 | Ok(owned) if owned.metric_type() == &Type::Gauge => { 241 | self.record_gauge(&owned); 242 | None 243 | } 244 | Ok(_) => Some(Output { 245 | route: &self.route_to, 246 | new_events: None, 247 | }), 248 | } 249 | } 250 | 251 | fn tick(&self, time: std::time::SystemTime, backends: &Backends) { 252 | // Take a lock on the last flush, which guards all other flushes. 
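// tick() only reaches handle_flush() once the configured window has elapsed
// (see check_tick_passed above); flush() below holds the same lock but skips
// the window check, so buffered counters, gauges and timer reservoirs are
// drained immediately when the processor is asked to shut down.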
253 | let flush_lock = self.last_flush.lock(); 254 | let earlier = *flush_lock.borrow(); 255 | if !self.check_tick_passed(earlier, time) { 256 | return; 257 | }; 258 | self.handle_flush(backends); 259 | flush_lock.replace(time); 260 | } 261 | 262 | fn flush(&self, backends: &Backends) { 263 | let _flush_lock = self.last_flush.lock(); 264 | self.handle_flush(backends); 265 | } 266 | } 267 | 268 | #[cfg(test)] 269 | pub mod test { 270 | use super::*; 271 | 272 | #[test] 273 | fn fill_timer() { 274 | let mut timer = Timer::new(100); 275 | for x in 0..200 { 276 | timer.add(x as f64, None); 277 | } 278 | assert_eq!(timer.filled_count, 200); 279 | assert_eq!(timer.count, 200_f64); 280 | assert_eq!(timer.sum, 19900_f64); 281 | assert_eq!(timer.values.len(), 100); 282 | } 283 | } 284 | -------------------------------------------------------------------------------- /statsrelay/src/processors/tag.rs: -------------------------------------------------------------------------------- 1 | use crate::config; 2 | use crate::processors; 3 | use crate::statsd_proto; 4 | use crate::statsd_proto::Event; 5 | use std::convert::TryInto; 6 | 7 | use smallvec::smallvec; 8 | 9 | pub struct Normalizer { 10 | route: Vec, 11 | } 12 | 13 | impl Normalizer { 14 | pub fn new(route: &[config::Route]) -> Self { 15 | Normalizer { 16 | route: route.to_vec(), 17 | } 18 | } 19 | } 20 | 21 | impl processors::Processor for Normalizer { 22 | fn provide_statsd(&self, sample: &Event) -> Option { 23 | let owned: Result = sample.try_into(); 24 | owned 25 | .map(|inp| { 26 | let out = statsd_proto::convert::to_inline_tags(inp); 27 | processors::Output { 28 | new_events: Some(smallvec![Event::Parsed(out)]), 29 | route: self.route.as_ref(), 30 | } 31 | }) 32 | .ok() 33 | } 34 | } 35 | 36 | #[cfg(test)] 37 | pub mod test { 38 | use processors::Processor; 39 | use statsd_proto::Parsed; 40 | 41 | use super::*; 42 | 43 | #[test] 44 | fn make_normalizer() { 45 | let route = vec![config::Route { 46 | route_type: config::RouteType::Processor, 47 | route_to: "null".to_string(), 48 | }]; 49 | 50 | let tn = Normalizer::new(&route); 51 | let pdu = 52 | statsd_proto::Pdu::parse(bytes::Bytes::from_static(b"foo.bar:3|c|#tags:value|@1.0")) 53 | .unwrap(); 54 | let sample = Event::Pdu(pdu); 55 | let result = tn.provide_statsd(&sample).unwrap(); 56 | 57 | let first_sample = &result.new_events.as_ref().unwrap()[0]; 58 | let owned: statsd_proto::Owned = first_sample.try_into().unwrap(); 59 | assert_eq!(owned.name(), b"foo.bar.__tags=value"); 60 | assert_eq!(route, result.route); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /statsrelay/src/shard.rs: -------------------------------------------------------------------------------- 1 | use std::io::Cursor; 2 | 3 | use crate::statsd_proto::Pdu; 4 | 5 | // HASHLIB_SEED same as the legacy statsrelay code base 6 | const HASHLIB_SEED: u32 = 0xaccd3d34; 7 | 8 | pub fn statsrelay_compat_hash(pdu: &Pdu) -> u32 { 9 | murmur3::murmur3_32(&mut Cursor::new(pdu.name()), HASHLIB_SEED).unwrap_or(0) 10 | } 11 | 12 | pub struct Ring { 13 | members: Vec, 14 | } 15 | 16 | impl Ring { 17 | pub fn new() -> Self { 18 | Ring { 19 | members: Vec::new(), 20 | } 21 | } 22 | 23 | pub fn push(&mut self, c: C) { 24 | self.members.push(c); 25 | } 26 | 27 | pub fn len(&self) -> usize { 28 | self.members.len() 29 | } 30 | 31 | pub fn is_empty(&self) -> bool { 32 | self.len() == 0 33 | } 34 | 35 | pub fn pick_from(&self, code: u32) -> &C { 36 | let l = self.members.len(); 37 | 
self.members.get(code as usize % l).unwrap() 38 | } 39 | 40 | pub fn act_on(&mut self, code: u32, mut f: F) 41 | where 42 | F: FnMut(&mut C), 43 | { 44 | let len = self.members.len(); 45 | let c = &mut self.members[code as usize % len]; 46 | f(c); 47 | } 48 | 49 | pub fn swap(&mut self, other: Ring) { 50 | self.members = other.members; 51 | } 52 | } 53 | 54 | impl Default for Ring { 55 | fn default() -> Self { 56 | Ring::new() 57 | } 58 | } 59 | 60 | #[cfg(test)] 61 | pub mod test { 62 | use super::*; 63 | use bytes::Bytes; 64 | 65 | #[test] 66 | fn test_swap() { 67 | let mut ring = Ring::new(); 68 | ring.push(0); 69 | ring.push(1); 70 | assert_eq!(ring.len(), 2); 71 | let mut ring2 = Ring::new(); 72 | ring2.push(2); 73 | ring2.push(3); 74 | ring2.push(4); 75 | assert_eq!(ring2.len(), 3); 76 | ring.swap(ring2); 77 | assert_eq!(ring.len(), 3); 78 | } 79 | #[test] 80 | fn test_hash() { 81 | let mut ring = Ring::new(); 82 | ring.push(0); 83 | ring.push(1); 84 | ring.push(2); 85 | ring.push(3); 86 | 87 | assert_eq!( 88 | *ring.pick_from(statsrelay_compat_hash( 89 | &Pdu::parse(Bytes::copy_from_slice(b"apple:1|c")).unwrap() 90 | )), 91 | 2 92 | ); 93 | assert_eq!( 94 | *ring.pick_from(statsrelay_compat_hash( 95 | &Pdu::parse(Bytes::copy_from_slice(b"banana:1|c")).unwrap() 96 | )), 97 | 3 98 | ); 99 | assert_eq!( 100 | *ring.pick_from(statsrelay_compat_hash( 101 | &Pdu::parse(Bytes::copy_from_slice(b"orange:1|c")).unwrap() 102 | )), 103 | 0 104 | ); 105 | assert_eq!( 106 | *ring.pick_from(statsrelay_compat_hash( 107 | &Pdu::parse(Bytes::copy_from_slice(b"lemon:1|c")).unwrap() 108 | )), 109 | 1 110 | ); 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /statsrelay/src/stats.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use dashmap::DashMap; 4 | use prometheus::{Encoder, Registry, TextEncoder}; 5 | 6 | pub const SEP: &str = ":"; 7 | /// A wrapped stats implementation, to allow multiple backends to be used 8 | /// instead of just prometheus, when required. Right now this implementation is 9 | /// extremely simple and only works with prometheus exporting, and will require 10 | /// some revisions to improve. 11 | /// 12 | /// All types are clone-able - for the Collector and all built metric types, 13 | /// they will continue to refer to the same set of names and values and do not 14 | /// create new values. Scopes can be cloned, but do not share lineage and allow 15 | /// sub-scopes to be made independently. Building a reference to the same 16 | /// counter name will return the same underlying counter atomic. 17 | 18 | #[derive(Clone, Debug)] 19 | pub struct Collector { 20 | // Registry is an Arc<> locked type and therefor is freely cloneable 21 | registry: Registry, 22 | counters: Arc>, 23 | gauges: Arc>, 24 | } 25 | 26 | impl Default for Collector { 27 | fn default() -> Self { 28 | Collector { 29 | registry: Registry::new(), 30 | counters: Arc::new(DashMap::new()), 31 | gauges: Arc::new(DashMap::new()), 32 | } 33 | } 34 | } 35 | 36 | impl Collector { 37 | pub fn scope(&self, prefix: &str) -> Scope { 38 | Scope { 39 | collector: self.clone(), 40 | scope: String::from(prefix), 41 | } 42 | } 43 | 44 | /// Generate and return a byte buffer containing a Prometheus formatted text 45 | /// output of the current contents of this collector. 
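///
/// A minimal usage sketch (illustrative; it relies only on the API defined in
/// this file):
///
/// ```
/// use statsrelay::stats::Collector;
///
/// let collector = Collector::default();
/// let requests = collector.scope("server").counter("requests").unwrap();
/// requests.inc();
/// let buffer = collector.prometheus_output().unwrap();
/// assert!(!buffer.is_empty());
/// ```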
46 | pub fn prometheus_output(&self) -> anyhow::Result> { 47 | let output = self.registry.gather(); 48 | let encoder = TextEncoder::new(); 49 | let mut buffer = vec![]; 50 | 51 | encoder.encode(&output, &mut buffer)?; 52 | Ok(buffer) 53 | } 54 | 55 | /// Attempt to register a new counter. If the counter already exists, it 56 | /// will return the previously registered counter instead of the one passed 57 | /// in. 58 | fn register_counter(&self, c: Counter) -> anyhow::Result { 59 | let counter = match self.counters.get(&c.name) { 60 | Some(counter) => counter.clone(), 61 | None => { 62 | self.registry.register(Box::new(c.clone().counter))?; 63 | self.counters.insert(c.name.clone(), c.clone()); 64 | c 65 | } 66 | }; 67 | 68 | Ok(counter) 69 | } 70 | 71 | fn register_gauge(&self, g: Gauge) -> anyhow::Result { 72 | let gauge = match self.gauges.get(&g.name) { 73 | Some(gauge) => gauge.clone(), 74 | None => { 75 | self.registry.register(Box::new(g.clone().gauge))?; 76 | self.gauges.insert(g.name.clone(), g.clone()); 77 | g 78 | } 79 | }; 80 | Ok(gauge) 81 | } 82 | } 83 | 84 | #[derive(Clone, Debug)] 85 | pub struct Scope { 86 | collector: Collector, 87 | scope: String, 88 | } 89 | 90 | impl Scope { 91 | pub fn scope(&self, extend: &str) -> Scope { 92 | Scope { 93 | scope: format!("{}{}{}", self.scope, SEP, extend), 94 | collector: self.collector.clone(), 95 | } 96 | } 97 | 98 | /// Create a new counter with the given scope, or return an existing 99 | /// underlying counter 100 | pub fn counter(&self, name: &str) -> anyhow::Result { 101 | let name = format!("{}{}{}", self.scope, SEP, name); 102 | let counter = Counter::new(name)?; 103 | self.collector.register_counter(counter) 104 | } 105 | 106 | /// Create a new gauge with the given scope, or return the existing gauge 107 | /// with the same name 108 | pub fn gauge(&self, name: &str) -> anyhow::Result { 109 | let name = format!("{}{}{}", self.scope, SEP, name); 110 | let gauge = Gauge::new(name.as_str())?; 111 | self.collector.register_gauge(gauge) 112 | } 113 | } 114 | 115 | #[derive(Clone, Debug)] 116 | pub struct Gauge { 117 | name: String, 118 | gauge: prometheus::Gauge, 119 | } 120 | 121 | impl Gauge { 122 | fn new(name: &str) -> anyhow::Result { 123 | let pg = prometheus::Gauge::new(name.to_owned(), "a gauge")?; 124 | Ok(Self { 125 | name: name.to_owned(), 126 | gauge: pg, 127 | }) 128 | } 129 | 130 | pub fn set(&self, value: f64) { 131 | self.gauge.set(value) 132 | } 133 | 134 | pub fn get(&self) -> f64 { 135 | self.gauge.get() 136 | } 137 | } 138 | 139 | #[derive(Clone, Debug)] 140 | pub struct Counter { 141 | name: String, 142 | counter: prometheus::Counter, 143 | } 144 | 145 | impl Counter { 146 | fn new(name: String) -> anyhow::Result { 147 | let pcounter = prometheus::Counter::new(name.clone(), "a counter")?; 148 | Ok(Self { 149 | name, 150 | counter: pcounter, 151 | }) 152 | } 153 | 154 | /// Increment a counter 155 | pub fn inc(&self) { 156 | self.counter.inc(); 157 | } 158 | 159 | pub fn inc_by(&self, value: f64) { 160 | self.counter.inc_by(value); 161 | } 162 | 163 | /// Return the current counter value 164 | pub fn get(&self) -> f64 { 165 | self.counter.get() 166 | } 167 | } 168 | 169 | #[cfg(test)] 170 | pub mod test { 171 | use super::*; 172 | 173 | #[test] 174 | pub fn test_counter() { 175 | let collector = Collector::default(); 176 | let scope = collector.scope("prefix"); 177 | let ctr1 = scope.counter("counter").unwrap(); 178 | ctr1.inc(); 179 | let ctr2 = scope.counter("counter").unwrap(); 180 | // Ensure we have the 
same counter object 181 | assert_eq!(ctr2.get(), 1_f64); 182 | ctr2.inc(); 183 | assert_eq!(ctr1.get(), 2_f64); 184 | } 185 | 186 | #[test] 187 | pub fn test_gauge() { 188 | let collector = Collector::default(); 189 | let scope = collector.scope("prefix"); 190 | let ctr1 = scope.gauge("gauge").unwrap(); 191 | ctr1.set(12_f64); 192 | let ctr2 = scope.gauge("gauge").unwrap(); 193 | // Ensure we have the same gauge object 194 | assert_eq!(ctr2.get(), 12_f64); 195 | ctr2.set(13_f64); 196 | assert_eq!(ctr1.get(), 13_f64); 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /statsrelay/src/statsd_server.rs: -------------------------------------------------------------------------------- 1 | use bytes::{BufMut, BytesMut}; 2 | use memchr::memchr; 3 | use stream_cancel::Tripwire; 4 | use tokio::io::{AsyncRead, AsyncWrite}; 5 | use tokio::io::{AsyncReadExt, AsyncWriteExt}; 6 | use tokio::net::unix; 7 | use tokio::net::{TcpListener, UnixListener, UnixStream}; 8 | use tokio::select; 9 | use tokio::time::timeout; 10 | 11 | use std::io::ErrorKind; 12 | use std::net::UdpSocket; 13 | use std::sync::atomic::AtomicBool; 14 | use std::sync::atomic::Ordering::Relaxed; 15 | use std::sync::Arc; 16 | use std::time::Duration; 17 | 18 | use log::{debug, info, warn}; 19 | 20 | use crate::backends::Backends; 21 | use crate::config; 22 | use crate::config::StatsdServerConfig; 23 | use crate::stats; 24 | use crate::statsd_proto::{Event, Pdu}; 25 | 26 | const TCP_READ_TIMEOUT: Duration = Duration::from_secs(62); 27 | const READ_BUFFER_SIZE: usize = 8192; 28 | 29 | struct UdpServer { 30 | shutdown_gate: Arc, 31 | } 32 | 33 | impl Drop for UdpServer { 34 | fn drop(&mut self) { 35 | self.shutdown_gate.store(true, Relaxed); 36 | } 37 | } 38 | 39 | impl UdpServer { 40 | fn new() -> Self { 41 | UdpServer { 42 | shutdown_gate: Arc::new(AtomicBool::new(false)), 43 | } 44 | } 45 | 46 | fn udp_worker( 47 | &mut self, 48 | stats: stats::Scope, 49 | bind: String, 50 | backends: Backends, 51 | route: Vec, 52 | ) -> std::thread::JoinHandle<()> { 53 | let socket = UdpSocket::bind(bind.as_str()).unwrap(); 54 | 55 | let processed_lines = stats.counter("processed_lines").unwrap(); 56 | let incoming_bytes = stats.counter("incoming_bytes").unwrap(); 57 | // We set a small timeout to allow aborting the UDP server if there is no 58 | // incoming traffic. 
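// Because the shutdown gate is re-checked at the top of every loop iteration
// below, a shutdown signalled by UdpServer::drop is observed within roughly a
// second even when no packets arrive; the WouldBlock errors produced by the
// timed-out recv_from are deliberately ignored.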
59 | socket 60 | .set_read_timeout(Some(Duration::from_secs(1))) 61 | .unwrap(); 62 | info!("statsd udp server running on {}", bind); 63 | let gate = self.shutdown_gate.clone(); 64 | std::thread::spawn(move || { 65 | info!("started udp reader thread"); 66 | let mut buf = BytesMut::with_capacity(65535); 67 | loop { 68 | if gate.load(Relaxed) { 69 | break; 70 | } 71 | buf.resize(65535, 0_u8); 72 | match socket.recv_from(buf.as_mut()) { 73 | Ok((size, _remote)) => { 74 | buf.truncate(size); 75 | incoming_bytes.inc_by(size as f64); 76 | let r = process_buffer_newlines(&mut buf); 77 | processed_lines.inc_by(r.len() as f64); 78 | backends.provide_statsd_slice(&r, &route); 79 | 80 | if let Ok(p) = Pdu::parse(buf.clone().freeze()) { 81 | backends.provide_statsd(&Event::Pdu(p), &route); 82 | } 83 | } 84 | Err(ref e) if e.kind() == std::io::ErrorKind::WouldBlock => (), 85 | Err(e) => warn!("udp receiver error {:?}", e), 86 | } 87 | } 88 | info!("terminating statsd udp"); 89 | }) 90 | } 91 | } 92 | 93 | fn process_buffer_newlines(buf: &mut BytesMut) -> Vec { 94 | let mut ret: Vec = Vec::new(); 95 | loop { 96 | match memchr(b'\n', buf) { 97 | None => break, 98 | Some(newline) => { 99 | let mut incoming = buf.split_to(newline + 1); 100 | let length: usize = incoming.len(); 101 | if length < 3 { 102 | continue; // Not a real metric. Likely \n or \r\n. 103 | } else if incoming[length - 2] == b'\r' { 104 | incoming.truncate(length - 2); 105 | } else { 106 | incoming.truncate(length - 1); 107 | } 108 | let frozen = incoming.freeze(); 109 | if frozen == "status" { 110 | // Consume a line consisting of just the word status, and do not produce a PDU 111 | continue; 112 | } 113 | if let Ok(pdu) = Pdu::parse(frozen) { 114 | ret.push(Event::Pdu(pdu)); 115 | } 116 | } 117 | }; 118 | } 119 | ret 120 | } 121 | 122 | async fn client_handler( 123 | stats: stats::Scope, 124 | peer: String, 125 | mut tripwire: Tripwire, 126 | mut socket: T, 127 | backends: Backends, 128 | route: Vec, 129 | config: StatsdServerConfig, 130 | ) where 131 | T: AsyncRead + AsyncWrite + Unpin, 132 | { 133 | let incoming_bytes = stats.counter("incoming_bytes").unwrap(); 134 | let disconnects = stats.counter("disconnects").unwrap(); 135 | let processed_lines = stats.counter("lines").unwrap(); 136 | let read_timeouts = stats.counter("read_timeout").unwrap(); 137 | let socket_errors = stats.counter("socket_error").unwrap(); 138 | 139 | let read_buffer_size = config.read_buffer.unwrap_or(READ_BUFFER_SIZE); 140 | let mut buf = BytesMut::with_capacity(read_buffer_size); 141 | 142 | loop { 143 | if buf.remaining_mut() < read_buffer_size { 144 | buf.reserve(read_buffer_size); 145 | } 146 | let result = select! 
{ 147 | r = timeout( 148 | config.read_timeout_secs.map(|s| Duration::from_secs(s as u64)).unwrap_or(TCP_READ_TIMEOUT), 149 | //per socket.read_buf - If the timeout completes first it is guaranteed that no data was read 150 | socket.read_buf(&mut buf)) => 151 | { 152 | match r { 153 | Err(_e) => Err(std::io::Error::new(ErrorKind::TimedOut, "read timeout")), 154 | Ok(Err(e)) => Err(e), 155 | Ok(Ok(r)) => Ok(r), 156 | } 157 | }, 158 | _ = &mut tripwire => Err(std::io::Error::new(ErrorKind::Other, "shutting down")), 159 | }; 160 | 161 | match result { 162 | Ok(bytes) if buf.is_empty() && bytes == 0 => { 163 | debug!("closing reader (empty buffer, eof) {}", peer); 164 | break; 165 | } 166 | Ok(bytes) if bytes == 0 => { 167 | let r = process_buffer_newlines(&mut buf); 168 | processed_lines.inc_by(r.len() as f64); 169 | 170 | backends.provide_statsd_slice(&r, &route); 171 | let remaining = buf.clone().freeze(); 172 | if let Ok(p) = Pdu::parse(remaining) { 173 | backends.provide_statsd(&Event::Pdu(p), &route); 174 | }; 175 | debug!("remaining {:?}", buf); 176 | debug!("closing reader {}", peer); 177 | break; 178 | } 179 | Ok(bytes) => { 180 | incoming_bytes.inc_by(bytes as f64); 181 | 182 | let r = process_buffer_newlines(&mut buf); 183 | processed_lines.inc_by(r.len() as f64); 184 | backends.provide_statsd_slice(&r, &route); 185 | } 186 | Err(e) if e.kind() == ErrorKind::Other => { 187 | // Ignoring the results of the write call here 188 | let _ = timeout( 189 | Duration::from_secs(1), 190 | socket.write_all(b"server closing due to shutdown, goodbye\n"), 191 | ) 192 | .await; 193 | break; 194 | } 195 | Err(e) if e.kind() == ErrorKind::TimedOut => { 196 | read_timeouts.inc(); 197 | debug!("read timeout, closing {}", peer); 198 | break; 199 | } 200 | Err(e) => { 201 | socket_errors.inc(); 202 | debug!("socket error {:?} {}", e, peer); 203 | break; 204 | } 205 | } 206 | } 207 | disconnects.inc(); 208 | } 209 | 210 | /// Wrapper type to adapt an optional listener and either return an accept 211 | /// future, or a pending future which never returns. This wrapper is needed to 212 | /// work around that .accept() is an opaque impl Future type, so can't be 213 | /// readily mixed into a stream. 
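///
/// When no unix socket is configured, the returned future never resolves
/// (futures::future::pending()), so the corresponding select! arm in run()
/// below simply never fires and only the TCP accept and tripwire arms remain
/// live.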
214 | async fn optional_accept( 215 | listener: Option<&UnixListener>, 216 | ) -> std::io::Result<(UnixStream, unix::SocketAddr)> { 217 | if let Some(listener) = listener { 218 | listener.accept().await 219 | } else { 220 | futures::future::pending().await 221 | } 222 | } 223 | 224 | pub async fn run( 225 | stats: stats::Scope, 226 | tripwire: Tripwire, 227 | config: StatsdServerConfig, 228 | backends: Backends, 229 | ) { 230 | let tcp_listener = TcpListener::bind(config.bind.as_str()).await.unwrap(); 231 | info!("statsd tcp server running on {}", config.bind); 232 | 233 | let unix_listener = config.socket.as_ref().map(|socket| { 234 | let unix = UnixListener::bind(socket.as_str()).unwrap(); 235 | info!("statsd unix server running on {}", socket); 236 | unix 237 | }); 238 | 239 | // Spawn the threaded, non-async blocking UDP server 240 | let mut udp = UdpServer::new(); 241 | let udp_join = udp.udp_worker( 242 | stats.scope("udp"), 243 | config.bind.clone(), 244 | backends.clone(), 245 | config.route.clone(), 246 | ); 247 | 248 | let accept_connections = stats.counter("accepts").unwrap(); 249 | let accept_connections_unix = stats.counter("accepts_unix").unwrap(); 250 | let accept_failures = stats.counter("accept_failures").unwrap(); 251 | let accept_failures_unix = stats.counter("accept_failures_unix").unwrap(); 252 | 253 | let routes = config.route.clone(); 254 | let server_config = config.clone(); 255 | async move { 256 | loop { 257 | select! { 258 | _ = tripwire.clone() => { 259 | info!("stopped stream listener loop"); 260 | return 261 | } 262 | // Wrap the unix acceptor for different stats 263 | unix_res = optional_accept(unix_listener.as_ref()) => { 264 | match unix_res { 265 | Ok((socket,_)) => { 266 | let peer_addr = format!("{:?}", socket.peer_addr()); 267 | debug!("accepted unix connection from {:?}", socket.peer_addr()); 268 | accept_connections_unix.inc(); 269 | tokio::spawn(client_handler(stats.scope("connections_unix"), peer_addr, tripwire.clone(), socket, backends.clone(), routes.clone(), server_config.clone())); 270 | } 271 | Err(err) => { 272 | accept_failures_unix.inc(); 273 | info!("unix accept error = {:?}", err); 274 | } 275 | } 276 | } 277 | socket_res = tcp_listener.accept() => { 278 | 279 | match socket_res { 280 | Ok((socket,_)) => { 281 | let peer_addr = format!("{:?}", socket.peer_addr()); 282 | debug!("accepted connection from {:?}", socket.peer_addr()); 283 | accept_connections.inc(); 284 | tokio::spawn(client_handler(stats.scope("connections"), peer_addr, tripwire.clone(), socket, backends.clone(), routes.clone(), server_config.clone())); 285 | } 286 | Err(err) => { 287 | accept_failures.inc(); 288 | info!("accept error = {:?}", err); 289 | } 290 | } 291 | } 292 | } 293 | } 294 | } 295 | .await; 296 | drop(udp); 297 | // The socket file descriptor is not removed on teardown. Lets remove it if enabled. 
298 | if let Some(socket) = config.socket.as_ref() { 299 | let _ = std::fs::remove_file(socket); 300 | } 301 | tokio::task::spawn_blocking(move || { 302 | udp_join.join().unwrap(); 303 | }) 304 | .await 305 | .unwrap(); 306 | } 307 | 308 | #[cfg(test)] 309 | pub mod test { 310 | use super::*; 311 | #[test] 312 | fn test_process_buffer_no_newlines() { 313 | let mut b = BytesMut::new(); 314 | // Validate we don't consume non-newlines 315 | b.put_slice(b"hello"); 316 | let r = process_buffer_newlines(&mut b); 317 | assert!(r.is_empty()); 318 | assert!(b.split().as_ref() == b"hello"); 319 | } 320 | 321 | #[test] 322 | fn test_process_buffer_newlines() { 323 | let mut b = BytesMut::new(); 324 | // Validate we don't consume newlines, but not a remnant 325 | b.put_slice(b"hello:1|c\nhello:1|c\nhello2"); 326 | let r = process_buffer_newlines(&mut b); 327 | assert!(r.len() == 2); 328 | assert!(b.split().as_ref() == b"hello2"); 329 | } 330 | 331 | #[test] 332 | fn test_process_buffer_cr_newlines() { 333 | let mut found = 0; 334 | let mut b = BytesMut::new(); 335 | // Validate we don't consume newlines, but not a remnant 336 | b.put_slice(b"hello:1|c\r\nhello:1|c\nhello2"); 337 | let r = process_buffer_newlines(&mut b); 338 | for w in r { 339 | let pdu: Pdu = w.into(); 340 | assert!(pdu.pdu_type() == b"c"); 341 | assert!(pdu.name() == b"hello"); 342 | found += 1 343 | } 344 | assert_eq!(2, found); 345 | assert!(b.split().as_ref() == b"hello2"); 346 | } 347 | 348 | #[test] 349 | fn test_process_buffer_status() { 350 | let mut found = 0; 351 | let mut b = BytesMut::new(); 352 | // Validate we don't consume newlines, but not a remnant 353 | b.put_slice(b"status\r\nhello:1|c\nhello2"); 354 | let r = process_buffer_newlines(&mut b); 355 | for w in r { 356 | let pdu: Pdu = w.into(); 357 | assert!(pdu.pdu_type() == b"c"); 358 | assert!(pdu.name() == b"hello"); 359 | found += 1 360 | } 361 | assert_eq!(1, found); 362 | assert!(b.split().as_ref() == b"hello2"); 363 | } 364 | } 365 | --------------------------------------------------------------------------------