├── .circleci └── config.yml ├── .clang-format ├── .github ├── release-drafter-config.yml └── workflows │ └── release-drafter.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── LICENSE.md ├── Makefile ├── README.md ├── cmake └── UseCodeCoverage.cmake ├── examples ├── CMakeLists.txt └── quantile_example.c ├── src ├── .gitignore ├── CMakeLists.txt ├── td_malloc.h ├── tdigest.c └── tdigest.h └── tests ├── CMakeLists.txt ├── benchmark └── histogram_benchmark.cpp └── unit ├── minunit.h └── td_test.c /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Use the latest 2.1 version of CircleCI pipeline process engine. See: https://circleci.com/docs/2.0/configuration-reference 2 | version: 2.1 3 | 4 | jobs: 5 | lint: 6 | docker: 7 | - image: redislabsmodules/llvm-toolset:latest 8 | steps: 9 | - checkout 10 | - run: 11 | name: lint 12 | command: | 13 | make lint 14 | 15 | sanitize: 16 | docker: 17 | - image: redislabsmodules/llvm-toolset:latest 18 | steps: 19 | - checkout 20 | - run: 21 | name: Install CMAKE 22 | command: 'apt update -q && apt install -y cmake' 23 | - run: 24 | name: Pull Submodules 25 | command: git submodule update --init --recursive 26 | - run: 27 | name: Build & Test with sanitizers 28 | command: | 29 | make sanitize 30 | 31 | static-analysis-infer: 32 | docker: 33 | - image: redisbench/infer-linux64:1.0.0 34 | steps: 35 | - checkout 36 | - run: 37 | name: Submodule checkout 38 | command: git submodule update --init --recursive 39 | - run: 40 | name: run fbinfer 41 | command: | 42 | CC=clang CXX=clang++ INFER=infer make static-analysis 43 | build: 44 | docker: 45 | - image: "debian:bullseye" 46 | steps: 47 | - run: 48 | name: Installing SUDO 49 | command: 'apt update && apt install -y sudo && rm -rf /var/lib/apt/lists/*' 50 | - run: 51 | name: Installing GCC 52 | command: 'apt update && apt install -y gcc g++' 53 | - run: 54 | name: Install CMAKE 55 | command: 'apt install -y cmake' 56 | - run: 57 | name: Installing LCOV 58 | command: 'apt install -y lcov' 59 | - run: 60 | name: Installing CURL 61 | command: 'apt install -y curl' 62 | - run: 63 | name: Installing GIT 64 | command: 'apt install -y git' 65 | - checkout 66 | - run: 67 | name: Pull Submodules 68 | command: git submodule update --init --recursive 69 | - run: 70 | name: Build & Test 71 | command: | 72 | make clean 73 | make coverage 74 | cd build && bash <(curl -s https://codecov.io/bash) -f coverage.info -X gcov -x gcov-7 || echo "Codecov did not collect coverage reports" 75 | - run: 76 | name: Install benchmark dependencies 77 | command: | 78 | apt update 79 | apt install python3-pip -y 80 | pip3 install redisbench-admin 81 | - run: 82 | name: Benchmark 83 | command: | 84 | make bench 85 | redisbench-admin export \ 86 | --results-format google.benchmark \ 87 | --github_repo $CIRCLE_PROJECT_REPONAME \ 88 | --github_org $CIRCLE_PROJECT_USERNAME \ 89 | --github_branch $CIRCLE_BRANCH \ 90 | --benchmark-result-file results.json 91 | 92 | 93 | 94 | workflows: 95 | commit: 96 | jobs: 97 | - lint 98 | - build: 99 | context: common 100 | - sanitize 101 | nightly: 102 | triggers: 103 | - schedule: 104 | cron: "0 0 * * *" 105 | filters: 106 | branches: 107 | only: 108 | - master 109 | jobs: 110 | - build: 111 | context: common 112 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | IndentWidth: 4 2 | ColumnLimit: 100 3 | 
SortIncludes: false 4 | AlignEscapedNewlinesLeft: false 5 | SpacesBeforeTrailingComments: 1 -------------------------------------------------------------------------------- /.github/release-drafter-config.yml: -------------------------------------------------------------------------------- 1 | name-template: 'Version $NEXT_PATCH_VERSION' 2 | tag-template: 'v$NEXT_PATCH_VERSION' 3 | categories: 4 | - title: 'Features' 5 | labels: 6 | - 'feature' 7 | - 'enhancement' 8 | - title: 'Bug Fixes' 9 | labels: 10 | - 'fix' 11 | - 'bugfix' 12 | - 'bug' 13 | - title: 'Maintenance' 14 | label: 'chore' 15 | change-template: '- $TITLE (#$NUMBER)' 16 | exclude-labels: 17 | - 'skip-changelog' 18 | template: | 19 | ## Changes 20 | 21 | $CHANGES 22 | -------------------------------------------------------------------------------- /.github/workflows/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name: Release Drafter 2 | 3 | on: 4 | push: 5 | # branches to consider in the event; optional, defaults to all 6 | branches: 7 | - master 8 | 9 | jobs: 10 | update_release_draft: 11 | runs-on: ubuntu-latest 12 | steps: 13 | # Drafts your next Release notes as Pull Requests are merged into "master" 14 | - uses: release-drafter/release-drafter@v5 15 | with: 16 | # (Optional) specify config name to use, relative to .github/. Default: release-drafter.yml 17 | config-name: release-drafter-config.yml 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | README_cache 3 | src/*.o 4 | src/*.so 5 | src/*.dll 6 | CMakeLists.txt.user 7 | CMakeCache.txt 8 | CMakeFiles 9 | CMakeScripts 10 | Testing 11 | cmake-build-debug 12 | cmake_install.cmake 13 | install_manifest.txt 14 | compile_commands.json 15 | CTestTestfile.cmake 16 | _deps 17 | build/* 18 | .vscode 19 | tests/vendor/* 20 | 21 | # perf related 22 | perf.data* 23 | 24 | 25 | # fb infer static analysis 26 | infer-out/* 27 | 28 | # Prerequisites 29 | *.d 30 | 31 | # Object files 32 | *.o 33 | *.ko 34 | *.obj 35 | *.elf 36 | 37 | # Linker output 38 | *.ilk 39 | *.map 40 | *.exp 41 | 42 | # Precompiled Headers 43 | *.gch 44 | *.pch 45 | 46 | # Libraries 47 | *.lib 48 | *.a 49 | *.la 50 | *.lo 51 | 52 | # Shared objects (inc. 
Windows DLLs) 53 | *.dll 54 | *.so 55 | *.so.* 56 | *.dylib 57 | 58 | # Executables 59 | *.exe 60 | *.out 61 | *.app 62 | *.i*86 63 | *.x86_64 64 | *.hex 65 | 66 | # Debug files 67 | *.dSYM/ 68 | *.su 69 | *.idb 70 | *.pdb 71 | 72 | # Kernel Module Compile Results 73 | *.mod* 74 | *.cmd 75 | .tmp_versions/ 76 | modules.order 77 | Module.symvers 78 | Mkfile.old 79 | dkms.conf 80 | 81 | # IDE 82 | .idea -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tests/vendor/google/benchmark"] 2 | path = tests/vendor/google/benchmark 3 | url = https://github.com/google/benchmark.git 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ## cmake flags 2 | cmake_minimum_required (VERSION 3.0) 3 | project(tdigest) 4 | 5 | # CMake modules should be included in ${CMAKE_SOURCE_DIR}/cmake 6 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake) 7 | 8 | # --- Build options --- 9 | option(BUILD_SHARED "Build shared library" ON) 10 | option(BUILD_STATIC "Build static library" ON) 11 | option(BUILD_BENCHMARK "Build benchmark" ON) 12 | option(BUILD_TESTS "Build tests" ON) 13 | OPTION(ENABLE_CODECOVERAGE "Enable code coverage testing support" OFF) 14 | OPTION(ENABLE_PROFILE "Enable code profiling support" OFF) 15 | option(BUILD_EXAMPLES "Build examples" ON) 16 | 17 | # --- Build properties --- 18 | 19 | # Set a default build type if none was specified 20 | set(default_build_type "Release") 21 | 22 | IF(NOT CMAKE_BUILD_TYPE) 23 | message(STATUS "Setting build type to '${default_build_type}' as none was specified.") 24 | set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE 25 | STRING "Choose the type of build." FORCE) 26 | # Set the possible values of build type for cmake-gui 27 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS 28 | "Debug" "Release" "MinSizeRel" "RelWithDebInfo") 29 | ENDIF() 30 | 31 | 32 | if(ENABLE_SANITIZERS) 33 | message(STATUS "Forcing build type to Debug to run coverage.") 34 | set(CMAKE_BUILD_TYPE "Debug" CACHE 35 | STRING "Choose the type of build." FORCE) 36 | set (CMAKE_C_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wshadow -Wpointer-arith -Wcast-qual -Wunused -Wstrict-prototypes -Wmissing-prototypes -Wwrite-strings -Werror -fno-omit-frame-pointer -fsanitize=address") 37 | set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wshadow -Wpointer-arith -Wcast-qual -Wunused -Wstrict-prototypes -Wmissing-prototypes -Wwrite-strings -Werror -fno-omit-frame-pointer -fsanitize=address") 38 | set (CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") 39 | ENDIF() 40 | 41 | if(ENABLE_CODECOVERAGE) 42 | message(STATUS "Forcing build type to Debug to run coverage.") 43 | set(CMAKE_BUILD_TYPE "Debug" CACHE 44 | STRING "Choose the type of build." 
FORCE) 45 | # --- System Libraries --- 46 | include(GNUInstallDirs) 47 | include(UseCodeCoverage) 48 | ENDIF() 49 | 50 | # Generate position-independent code (-fPIC on UNIX) 51 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 52 | 53 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 -std=c99") 54 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") 55 | 56 | if(ENABLE_PROFILE) 57 | message(STATUS "Enabling profile flags.") 58 | string (REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) 59 | string (REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) 60 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -g -ggdb -fno-omit-frame-pointer") 61 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -g -ggdb -fno-omit-frame-pointer") 62 | # enable vectorization report flags 63 | # using Clang 64 | if (CMAKE_C_COMPILER_ID MATCHES "Clang") 65 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Rpass-analysis=loop-vectorize -Rpass=loop-vectorize -Rpass-missed=loop-vectorize") 66 | 67 | # using GCC 68 | elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU") 69 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -ftree-vectorize -fopt-info-vec-all") 70 | 71 | # using Intel C++ 72 | elseif (CMAKE_C_COMPILER_ID STREQUAL "Intel") 73 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -qopt-report=5 -qopt-report-phase=vec") 74 | 75 | # using Visual Studio C++ 76 | elseif (CMAKE_C_COMPILER_ID STREQUAL "MSVC") 77 | # TBD 78 | endif() 79 | endif(ENABLE_PROFILE) 80 | 81 | # --- Build directories --- 82 | add_subdirectory("src") 83 | 84 | # --- Documentation --- 85 | # TODO 86 | 87 | # --- Unit Tests --- 88 | ENABLE_TESTING() 89 | 90 | if(BUILD_TESTS OR BUILD_BENCHMARK) 91 | add_subdirectory("tests") 92 | endif() 93 | 94 | # --- Examples --- 95 | if(BUILD_EXAMPLES) 96 | add_subdirectory("examples") 97 | endif() 98 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2019 2 | COPYRIGHT HOLDER: Bob Rudis 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2019 Bob Rudis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #---------------------------------------------------------------------------------------------------- 2 | # simple Makefile for T-Digest, relies on cmake to do the actual build. 3 | # Use CMAKE_LIBRARY_OPTIONS,CMAKE_LIBRARY_SHARED_OPTIONS,CMAKE_LIBRARY_STATIC_OPTIONS or CMAKE_FULL_OPTIONS argument to this Makefile to pass options to cmake. 4 | #---------------------------------------------------------------------------------------------------- 5 | 6 | CC?=gcc 7 | INFER?=./deps/infer 8 | INFER_DOCKER?=redisbench/infer-linux64:1.0.0 9 | ROOT=$(shell pwd) 10 | SRCDIR := $(ROOT)/src 11 | TESTDIR := $(ROOT)/tests/unit 12 | BENCHDIR := $(ROOT)/tests/benchmark 13 | 14 | 15 | ifndef CMAKE_LIBRARY_SHARED_OPTIONS 16 | CMAKE_LIBRARY_SHARED_OPTIONS=\ 17 | -DBUILD_SHARED=ON \ 18 | -DBUILD_STATIC=OFF \ 19 | -DENABLE_CODECOVERAGE=OFF \ 20 | -DBUILD_TESTS=OFF \ 21 | -DBUILD_BENCHMARK=OFF \ 22 | -DBUILD_EXAMPLES=OFF 23 | endif 24 | 25 | ifndef CMAKE_LIBRARY_STATIC_OPTIONS 26 | CMAKE_LIBRARY_STATIC_OPTIONS=\ 27 | -DBUILD_SHARED=OFF \ 28 | -DBUILD_STATIC=ON \ 29 | -DENABLE_CODECOVERAGE=OFF \ 30 | -DBUILD_TESTS=OFF \ 31 | -DBUILD_BENCHMARK=OFF \ 32 | -DBUILD_EXAMPLES=OFF 33 | endif 34 | 35 | ifndef CMAKE_LIBRARY_OPTIONS 36 | CMAKE_LIBRARY_OPTIONS=\ 37 | -DBUILD_SHARED=ON \ 38 | -DBUILD_STATIC=ON \ 39 | -DENABLE_CODECOVERAGE=OFF \ 40 | -DBUILD_TESTS=OFF \ 41 | -DBUILD_EXAMPLES=OFF 42 | endif 43 | 44 | ifndef CMAKE_FULL_OPTIONS 45 | CMAKE_FULL_OPTIONS=\ 46 | -DBUILD_SHARED=ON \ 47 | -DBUILD_STATIC=ON \ 48 | -DBUILD_TESTS=ON \ 49 | -DBUILD_BENCHMARK=ON \ 50 | -DBUILD_EXAMPLES=ON 51 | endif 52 | 53 | 54 | ifndef CMAKE_PROFILE_OPTIONS 55 | CMAKE_PROFILE_OPTIONS=\ 56 | -DBUILD_SHARED=ON \ 57 | -DBUILD_STATIC=OFF \ 58 | -DENABLE_CODECOVERAGE=OFF \ 59 | -DBUILD_TESTS=OFF \ 60 | -DBUILD_BENCHMARK=ON \ 61 | -DBUILD_EXAMPLES=OFF \ 62 | -DENABLE_PROFILE=ON 63 | endif 64 | 65 | 66 | ifndef CMAKE_SANITIZE_OPTIONS 67 | CMAKE_SANITIZE_OPTIONS=\ 68 | -DBUILD_SHARED=ON \ 69 | -DBUILD_STATIC=OFF \ 70 | -DENABLE_CODECOVERAGE=OFF \ 71 | -DBUILD_TESTS=ON \ 72 | -DBUILD_BENCHMARK=OFF \ 73 | -DBUILD_EXAMPLES=OFF \ 74 | -DENABLE_PROFILE=OFF \ 75 | -DENABLE_SANITIZERS=ON 76 | endif 77 | 78 | ifndef CMAKE_TEST_OPTIONS 79 | CMAKE_TEST_OPTIONS=\ 80 | -DBUILD_SHARED=ON \ 81 | -DBUILD_STATIC=ON \ 82 | -DBUILD_TESTS=ON \ 83 | -DENABLE_CODECOVERAGE=ON \ 84 | -DBUILD_BENCHMARK=OFF \ 85 | -DBUILD_EXAMPLES=OFF 86 | endif 87 | 88 | ifndef CMAKE_BENCHMARK_OPTIONS 89 | CMAKE_BENCHMARK_OPTIONS=\ 90 | -DBUILD_SHARED=ON \ 91 | -DBUILD_STATIC=OFF \ 92 | -DENABLE_CODECOVERAGE=OFF \ 93 | -DBUILD_TESTS=OFF \ 94 | -DBUILD_BENCHMARK=ON \ 95 | -DBUILD_EXAMPLES=OFF \ 96 | -DENABLE_PROFILE=OFF 97 | endif 98 | 99 | default: full 100 | 101 | # just build the static library. Do not build tests or benchmarks 102 | library_static: 103 | ( mkdir -p build; cd build ; cmake $(CMAKE_LIBRARY_STATIC_OPTIONS) .. ; $(MAKE) ) 104 | 105 | # just build the shared library. Do not build tests or benchmarks 106 | library_shared: 107 | ( mkdir -p build; cd build ; cmake $(CMAKE_LIBRARY_SHARED_OPTIONS) .. ; $(MAKE) ) 108 | 109 | # just build the static and shared libraries. Do not build tests or benchmarks 110 | library_all: 111 | ( mkdir -p build; cd build ; cmake $(CMAKE_LIBRARY_OPTIONS) .. 
; $(MAKE) ) 112 | 113 | # just build the static and shared libraries and produce measurements 114 | # of accuracy versus compression factor for fixed data size 115 | # TODO: 116 | 117 | # just build the static and shared libraries and tests 118 | unit_tests: 119 | ( mkdir -p build; cd build ; cmake $(CMAKE_TEST_OPTIONS) .. ; $(MAKE) ; $(MAKE) test) 120 | 121 | test: 122 | $(MAKE) unit_tests 123 | 124 | coverage: 125 | ( mkdir -p build; cd build ; cmake $(CMAKE_TEST_OPTIONS) .. ; $(MAKE) ; $(MAKE) test; make coverage; ) 126 | 127 | format: 128 | clang-format -style=file -i $(SRCDIR)/*.c 129 | clang-format -style=file -i $(SRCDIR)/*.h 130 | clang-format -style=file -i $(TESTDIR)/*.c 131 | clang-format -style=file -i $(TESTDIR)/*.h 132 | clang-format -style=file -i $(BENCHDIR)/*.cpp 133 | 134 | lint: 135 | clang-format -style=file -Werror -n $(SRCDIR)/*.c 136 | clang-format -style=file -Werror -n $(SRCDIR)/*.h 137 | clang-format -style=file -Werror -n $(TESTDIR)/*.c 138 | clang-format -style=file -Werror -n $(TESTDIR)/*.h 139 | clang-format -style=file -Werror -n $(BENCHDIR)/*.cpp 140 | 141 | # build all 142 | full: 143 | ( mkdir -p build; cd build ; cmake $(CMAKE_FULL_OPTIONS) .. ; $(MAKE) ) 144 | 145 | # static-analysis-docker: 146 | # $(MAKE) clean 147 | # docker run -v $(ROOT)/:/t-digest-c/ --user "$(username):$(usergroup)" $(INFER_DOCKER) bash -c "cd t-digest-c && CC=clang infer run --keep-going --fail-on-issue --biabduction -- make test" 148 | 149 | clean: distclean 150 | 151 | distclean: 152 | rm -rf build/* 153 | 154 | sanitize: clean 155 | ( mkdir -p build; cd build ; cmake $(CMAKE_SANITIZE_OPTIONS) .. ; $(MAKE) VERBOSE=1 ) 156 | $(SHOW) build/tests/td_test 157 | 158 | profile: clean 159 | ( mkdir -p build; cd build ; cmake $(CMAKE_PROFILE_OPTIONS) .. ; $(MAKE) VERBOSE=1 2> $(basename $@).compiler_stedrr_output.txt ) 160 | 161 | bench: clean 162 | ( mkdir -p build; cd build ; cmake $(CMAKE_BENCHMARK_OPTIONS) .. ; $(MAKE) VERBOSE=1 ) 163 | $(SHOW) build/tests/histogram_benchmark --benchmark_min_time=5 --benchmark_out=results.json --benchmark_out_format=json 164 | 165 | bench-quantile: clean 166 | ( mkdir -p build; cd build ; cmake $(CMAKE_BENCHMARK_OPTIONS) .. ; $(MAKE) VERBOSE=1 ) 167 | $(SHOW) build/tests/histogram_benchmark --benchmark_min_time=5 --benchmark_filter="BM_td_quantile_lognormal_dist_given_array*|BM_td_quantiles_*" 168 | 169 | perf-stat-bench: 170 | ( mkdir -p build; cd build ; cmake $(CMAKE_PROFILE_OPTIONS) .. ; $(MAKE) VERBOSE=1 ) 171 | $(SHOW) perf stat build/tests/histogram_benchmark --benchmark_min_time=10 172 | 173 | perf-record-bench: clean 174 | ( mkdir -p build; cd build ; cmake $(CMAKE_PROFILE_OPTIONS) .. 
; $(MAKE) VERBOSE=1 ) 175 | $(SHOW) perf record -g -o perf.data.td_add \ 176 | build/tests/histogram_benchmark 177 | 178 | perf-report-bench: 179 | $(SHOW) perf report -g "graph,0.5,caller" -i perf.data.td_add 180 | 181 | perf-report-bench-pprof: 182 | go tool pprof -web perf.data.td_add 183 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ![License](https://img.shields.io/badge/License-MIT-blue.svg) 3 | [![CircleCI](https://circleci.com/gh/RedisBloom/t-digest-c.svg?style=svg)](https://circleci.com/gh/RedisBloom/t-digest-c) 4 | [![codecov](https://codecov.io/gh/RedisBloom/t-digest-c/branch/master/graph/badge.svg)](https://codecov.io/gh/RedisBloom/t-digest-c) 5 | 6 | # T-Digest 7 | 8 | Adaptive histogram based on something like streaming k-means crossed with Q-digest. 9 | 10 | 11 | This implementation is a descendant of Ted Dunning's MergingDigest, available at: 12 | [https://github.com/tdunning/t-digest/](https://github.com/tdunning/t-digest/) 13 | 14 | 15 | And contains the work of Andrew Werner, originally available at: 16 | [https://github.com/ajwerner/tdigestc](https://github.com/ajwerner/tdigestc) 17 | 18 | ## Description 19 | 20 | The t-Digest construction algorithm uses a variant of 1-dimensional 21 | k-means clustering to produce a very compact data structure that allows 22 | accurate estimation of quantiles. This t-Digest data structure can be 23 | used to estimate quantiles, compute other rank statistics or even to 24 | estimate related measures like trimmed means. The advantage of the 25 | t-Digest over previous digests for this purpose is that the t-Digest 26 | handles data with full floating point resolution. The accuracy of 27 | quantile estimates produced by t-Digests can be orders of magnitude more 28 | accurate than those produced by previous digest algorithms. Methods are 29 | provided to create and update t-Digests and retrieve quantiles from the 30 | accumulated distributions. 31 | 32 | See [the original paper by Ted Dunning & Otmar 33 | Ertl](https://arxiv.org/abs/1902.04023) for more details on t-Digests. 34 | 35 | ## What’s Inside 36 | 37 | The following functions are implemented (a minimal usage sketch follows the list): 38 | 39 | - `td_add`: Add a value to the t-Digest with the specified count 40 | - `td_new`: Allocate a new histogram 41 | - `td_reset`: Empty out a histogram and re-initialize it 42 | - `td_free`: Frees the memory associated with the t-Digest 43 | - `td_compress`: Re-examines the t-Digest to determine whether some centroids are redundant 44 | - `td_merge`: Merge one t-Digest into another 45 | - `td_cdf`: Returns the fraction of all points added which are ≤ x. 46 | - `td_quantile`: Returns an estimate of the cutoff such that a specified fraction of the data added to the t-Digest would be less than or equal to the cutoff. 47 | - `td_quantiles`: Returns estimates of the cutoffs such that the specified fractions of the data added to the t-Digest would be less than or equal to the given cutoffs. 48 | - `td_size`: Return the number of points that have been added to the t-Digest 49 | - `td_centroid_count`: Return the number of centroids being used by the t-Digest 50 | - `td_min`: Get the minimum value from the histogram. Will return __DBL_MAX__ if the histogram is empty 51 | - `td_max`: Get the maximum value from the histogram. Will return -__DBL_MAX__ if the histogram is empty 52 | - `td_trimmed_mean`: Returns the trimmed mean ignoring values outside given cutoff upper and lower limits 53 | - `td_trimmed_mean_symmetric`: Returns the trimmed mean ignoring values outside a given symmetric cutoff limit 54 |
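A minimal usage sketch of the API above (not part of the repository sources; error handling is omitted and the compression value of 100 is an arbitrary choice):

```c
#include <stdio.h>
#include "tdigest.h"

int main(void) {
    td_histogram_t *t = td_new(100);   /* compression factor of 100 */
    for (int i = 1; i <= 1000; i++) {
        td_add(t, (double)i, 1);       /* add value i with weight 1 */
    }
    td_compress(t);                    /* fold any buffered points into the centroids */
    printf("median estimate: %f\n", td_quantile(t, 0.5));
    printf("cdf(250) estimate: %f\n", td_cdf(t, 250.0));
    td_free(t);
    return 0;
}
```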
55 | ## Build notes 56 | 57 | ``` 58 | # Build 59 | git clone https://github.com/RedisBloom/t-digest-c.git 60 | cd t-digest-c/ 61 | git submodule update --init --recursive 62 | make 63 | ``` 64 | 65 | ## Testing 66 | Assuming you've followed the previous build steps, it should be as easy as: 67 | ``` 68 | # Run the unit tests 69 | make test 70 | ``` 71 | 72 | ## Benchmarking 73 | 74 | Assuming you've followed the previous build steps, it should be as easy as: 75 | ``` 76 | # Run the benchmark 77 | make bench 78 | ``` 79 | 80 | ## Code of Conduct 81 | 82 | Please note that this project is released with a Contributor Code of 83 | Conduct. By participating in this project you agree to abide by its 84 | terms. 85 | -------------------------------------------------------------------------------- /cmake/UseCodeCoverage.cmake: -------------------------------------------------------------------------------- 1 | # - Enable Code Coverage 2 | # 3 | # Variables you may define are: 4 | # CODECOV_HTMLOUTPUTDIR - the name of the directory where HTML results are placed. Defaults to "coverage_results" 5 | # CODECOV_XMLOUTPUTFILE - the name of the file where XML results are placed. Defaults to "coverage.xml" 6 | # CODECOV_GCOVR_OPTIONS - additional options given to gcovr commands. 7 | # 8 | 9 | if(ENABLE_CODECOVERAGE) 10 | 11 | if ( NOT CMAKE_BUILD_TYPE STREQUAL "Debug" ) 12 | message( WARNING "Code coverage results with an optimised (non-Debug) build may be misleading" ) 13 | endif ( NOT CMAKE_BUILD_TYPE STREQUAL "Debug" ) 14 | 15 | if ( NOT DEFINED CODECOV_OUTPUTFILE ) 16 | set( CODECOV_OUTPUTFILE coverage.info ) 17 | endif ( NOT DEFINED CODECOV_OUTPUTFILE ) 18 | 19 | if ( NOT DEFINED CODECOV_HTMLOUTPUTDIR ) 20 | set( CODECOV_HTMLOUTPUTDIR coverage_results ) 21 | endif ( NOT DEFINED CODECOV_HTMLOUTPUTDIR ) 22 | 23 | if ( CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX ) 24 | find_program( CODECOV_GCOV gcov ) 25 | find_program( CODECOV_LCOV lcov ) 26 | find_program( CODECOV_GENHTML genhtml ) 27 | add_definitions( -fprofile-arcs -ftest-coverage ) 28 | link_libraries( gcov ) 29 | set( CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS} --coverage ) 30 | add_custom_target( coverage_init ALL ${CODECOV_LCOV} --base-directory . --directory ${CMAKE_BINARY_DIR}/src --output-file ${CODECOV_OUTPUTFILE} --capture --initial ) 31 | add_custom_target( coverage ${CODECOV_LCOV} --base-directory .
--directory ${CMAKE_BINARY_DIR}/src --output-file ${CODECOV_OUTPUTFILE} --capture COMMAND genhtml -o ${CODECOV_HTMLOUTPUTDIR} ${CODECOV_OUTPUTFILE} ) 32 | endif ( CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX ) 33 | 34 | endif(ENABLE_CODECOVERAGE) 35 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(quantile_example quantile_example.c) 2 | target_link_libraries(quantile_example tdigest) -------------------------------------------------------------------------------- /examples/quantile_example.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include "tdigest.h" 3 | 4 | #define STREAM_SIZE 1000000 5 | 6 | static inline double randMToN(double M, double N) 7 | { 8 | return M + (rand() / (RAND_MAX / (N - M))); 9 | } 10 | 11 | 12 | int main() 13 | { 14 | 15 | td_histogram_t *mdigest = td_new(500); 16 | printf("compression is %f capacity is %d\n", mdigest->compression, mdigest->cap); 17 | double seeds[STREAM_SIZE]; 18 | for (int i = 0; i < STREAM_SIZE; ++i) 19 | { 20 | seeds[i] = randMToN(0, 10); 21 | } 22 | 23 | for (int i = 0; i < STREAM_SIZE; ++i) 24 | { 25 | td_add(mdigest, seeds[i], 1); 26 | } 27 | td_compress(mdigest); 28 | for (int i = 0; i < 10; ++i) 29 | { 30 | const double v = seeds[i]; 31 | printf("value %f is at percentile %f\n", v, td_cdf(mdigest, v)); 32 | } 33 | printf("\n"); 34 | for (int i = 0; i <= 100; i += 10) 35 | { 36 | printf("%d percentile has value %f\n", i, td_quantile(mdigest, i / 100.0)); 37 | } 38 | } -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | FILE(GLOB c_files "*.c") 2 | FILE(GLOB header_files "*.h") 3 | 4 | if (BUILD_SHARED) 5 | add_library(tdigest SHARED ${c_files} ${header_files}) 6 | target_link_libraries(tdigest m) 7 | target_include_directories(tdigest SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 8 | set_target_properties(tdigest PROPERTIES PUBLIC_HEADER "${header_files}") 9 | install(TARGETS tdigest DESTINATION lib${LIB_SUFFIX} PUBLIC_HEADER DESTINATION include) 10 | endif(BUILD_SHARED) 11 | 12 | if (BUILD_STATIC) 13 | add_library(tdigest_static STATIC ${c_files} ${header_files}) 14 | target_link_libraries(tdigest_static m) 15 | target_include_directories(tdigest_static SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 16 | set_target_properties(tdigest_static PROPERTIES PUBLIC_HEADER "${header_files}") 17 | install(TARGETS tdigest_static DESTINATION lib${LIB_SUFFIX} PUBLIC_HEADER DESTINATION include) 18 | endif(BUILD_STATIC) 19 | -------------------------------------------------------------------------------- /src/td_malloc.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Adaptive histogram based on something like streaming k-means crossed with Q-digest. 3 | * The implementation is a direct descendant of MergingDigest 4 | * https://github.com/tdunning/t-digest/ 5 | * 6 | * Copyright (c) 2021 Redis, All rights reserved. 7 | * 8 | * Allocator selection. 9 | * 10 | * This file is used in order to change the t-digest allocator at compile time.
11 | * Just define the following defines to what you want to use. Also add 12 | * the include of your alternate allocator if needed (not needed in order 13 | * to use the default libc allocator). */ 14 | 15 | #ifndef TD_ALLOC_H 16 | #define TD_ALLOC_H 17 | #define __td_malloc malloc 18 | #define __td_calloc calloc 19 | #define __td_realloc realloc 20 | #define __td_free free 21 | #endif 22 | -------------------------------------------------------------------------------- /src/tdigest.c: -------------------------------------------------------------------------------- 1 | #include <stdbool.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <math.h> 5 | #include "tdigest.h" 6 | #include <errno.h> 7 | #include <stdint.h> 8 | 9 | #ifndef TD_MALLOC_INCLUDE 10 | #define TD_MALLOC_INCLUDE "td_malloc.h" 11 | #endif 12 | 13 | #include TD_MALLOC_INCLUDE 14 | 15 | #define __td_max(x, y) (((x) > (y)) ? (x) : (y)) 16 | #define __td_min(x, y) (((x) < (y)) ? (x) : (y)) 17 | 18 | static inline double weighted_average_sorted(double x1, double w1, double x2, double w2) { 19 | const double x = (x1 * w1 + x2 * w2) / (w1 + w2); 20 | return __td_max(x1, __td_min(x, x2)); 21 | } 22 | 23 | static inline bool _tdigest_long_long_add_safe(long long a, long long b) { 24 | if (b < 0) { 25 | return (a >= __LONG_LONG_MAX__ - b); 26 | } else { 27 | return (a <= __LONG_LONG_MAX__ - b); 28 | } 29 | } 30 | 31 | static inline double weighted_average(double x1, double w1, double x2, double w2) { 32 | if (x1 <= x2) { 33 | return weighted_average_sorted(x1, w1, x2, w2); 34 | } else { 35 | return weighted_average_sorted(x2, w2, x1, w1); 36 | } 37 | } 38 | 39 | static void inline swap(double *arr, int i, int j) { 40 | const double temp = arr[i]; 41 | arr[i] = arr[j]; 42 | arr[j] = temp; 43 | } 44 | 45 | static void inline swap_l(long long *arr, int i, int j) { 46 | const long long temp = arr[i]; 47 | arr[i] = arr[j]; 48 | arr[j] = temp; 49 | } 50 | 51 | static unsigned int partition(double *means, long long *weights, unsigned int start, 52 | unsigned int end, unsigned int pivot_idx) { 53 | const double pivotMean = means[pivot_idx]; 54 | swap(means, pivot_idx, end); 55 | swap_l(weights, pivot_idx, end); 56 | 57 | int i = start - 1; 58 | 59 | for (unsigned int j = start; j < end; j++) { 60 | // If current element is smaller than the pivot 61 | if (means[j] < pivotMean) { 62 | // increment index of smaller element 63 | i++; 64 | swap(means, i, j); 65 | swap_l(weights, i, j); 66 | } 67 | } 68 | swap(means, i + 1, end); 69 | swap_l(weights, i + 1, end); 70 | return i + 1; 71 | } 72 | 73 | /** 74 | * Standard quick sort except that sorting rearranges parallel arrays 75 | * 76 | * @param means Values to sort on 77 | * @param weights The auxiliary values to sort.
78 | * @param start The beginning of the values to sort 79 | * @param end The value after the last value to sort 80 | */ 81 | static void td_qsort(double *means, long long *weights, unsigned int start, unsigned int end) { 82 | if (start < end) { 83 | // two elements can be directly compared 84 | if ((end - start) == 1) { 85 | if (means[start] > means[end]) { 86 | swap(means, start, end); 87 | swap_l(weights, start, end); 88 | } 89 | return; 90 | } 91 | // generating a random number as a pivot was very expensive vs the array size 92 | // const unsigned int pivot_idx = start + rand()%(end - start + 1); 93 | const unsigned int pivot_idx = (end + start) / 2; // central pivot 94 | const unsigned int new_pivot_idx = partition(means, weights, start, end, pivot_idx); 95 | if (new_pivot_idx > start) { 96 | td_qsort(means, weights, start, new_pivot_idx - 1); 97 | } 98 | td_qsort(means, weights, new_pivot_idx + 1, end); 99 | } 100 | } 101 | 102 | static inline size_t cap_from_compression(double compression) { 103 | if ((size_t)compression > ((SIZE_MAX / sizeof(double) / 6) - 10)) { 104 | return 0; 105 | } 106 | 107 | return (6 * (size_t)(compression)) + 10; 108 | } 109 | 110 | static inline bool should_td_compress(td_histogram_t *h) { 111 | return ((h->merged_nodes + h->unmerged_nodes) >= (h->cap - 1)); 112 | } 113 | 114 | static inline int next_node(td_histogram_t *h) { return h->merged_nodes + h->unmerged_nodes; } 115 | 116 | int td_compress(td_histogram_t *h); 117 | 118 | static inline int _check_overflow(const double v) { 119 | // double-precision overflow detected on h->unmerged_weight 120 | if (v == INFINITY) { 121 | return EDOM; 122 | } 123 | return 0; 124 | } 125 | 126 | static inline int _check_td_overflow(const double new_unmerged_weight, 127 | const double new_total_weight) { 128 | // double-precision overflow detected on h->unmerged_weight 129 | if (new_unmerged_weight == INFINITY) { 130 | return EDOM; 131 | } 132 | if (new_total_weight == INFINITY) { 133 | return EDOM; 134 | } 135 | const double denom = 2 * MM_PI * new_total_weight * log(new_total_weight); 136 | if (denom == INFINITY) { 137 | return EDOM; 138 | } 139 | 140 | return 0; 141 | } 142 | 143 | int td_centroid_count(td_histogram_t *h) { return next_node(h); } 144 | 145 | void td_reset(td_histogram_t *h) { 146 | if (!h) { 147 | return; 148 | } 149 | h->min = __DBL_MAX__; 150 | h->max = -h->min; 151 | h->merged_nodes = 0; 152 | h->merged_weight = 0; 153 | h->unmerged_nodes = 0; 154 | h->unmerged_weight = 0; 155 | h->total_compressions = 0; 156 | } 157 | 158 | int td_init(double compression, td_histogram_t **result) { 159 | 160 | const size_t capacity = cap_from_compression(compression); 161 | if (capacity < 1) { 162 | return 1; 163 | } 164 | td_histogram_t *histogram; 165 | histogram = (td_histogram_t *)__td_malloc(sizeof(td_histogram_t)); 166 | if (!histogram) { 167 | return 1; 168 | } 169 | histogram->cap = capacity; 170 | histogram->compression = (double)compression; 171 | td_reset(histogram); 172 | histogram->nodes_mean = (double *)__td_calloc(capacity, sizeof(double)); 173 | if (!histogram->nodes_mean) { 174 | td_free(histogram); 175 | return 1; 176 | } 177 | histogram->nodes_weight = (long long *)__td_calloc(capacity, sizeof(long long)); 178 | if (!histogram->nodes_weight) { 179 | td_free(histogram); 180 | return 1; 181 | } 182 | *result = histogram; 183 | 184 | return 0; 185 | } 186 | 187 | td_histogram_t *td_new(double compression) { 188 | td_histogram_t *mdigest = NULL; 189 | td_init(compression, &mdigest); 190 | return 
mdigest; 191 | } 192 | 193 | void td_free(td_histogram_t *histogram) { 194 | if (histogram->nodes_mean) { 195 | __td_free((void *)(histogram->nodes_mean)); 196 | } 197 | if (histogram->nodes_weight) { 198 | __td_free((void *)(histogram->nodes_weight)); 199 | } 200 | __td_free((void *)(histogram)); 201 | } 202 | 203 | int td_merge(td_histogram_t *into, td_histogram_t *from) { 204 | if (td_compress(into) != 0) 205 | return EDOM; 206 | if (td_compress(from) != 0) 207 | return EDOM; 208 | const int pos = from->merged_nodes + from->unmerged_nodes; 209 | for (int i = 0; i < pos; i++) { 210 | const double mean = from->nodes_mean[i]; 211 | const long long weight = from->nodes_weight[i]; 212 | if (td_add(into, mean, weight) != 0) { 213 | return EDOM; 214 | } 215 | } 216 | return 0; 217 | } 218 | 219 | long long td_size(td_histogram_t *h) { return h->merged_weight + h->unmerged_weight; } 220 | 221 | double td_cdf(td_histogram_t *h, double val) { 222 | td_compress(h); 223 | // no data to examine 224 | if (h->merged_nodes == 0) { 225 | return NAN; 226 | } 227 | // bellow lower bound 228 | if (val < h->min) { 229 | return 0; 230 | } 231 | // above upper bound 232 | if (val > h->max) { 233 | return 1; 234 | } 235 | if (h->merged_nodes == 1) { 236 | // exactly one centroid, should have max==min 237 | const double width = h->max - h->min; 238 | if (val - h->min <= width) { 239 | // min and max are too close together to do any viable interpolation 240 | return 0.5; 241 | } else { 242 | // interpolate if somehow we have weight > 0 and max != min 243 | return (val - h->min) / width; 244 | } 245 | } 246 | const int n = h->merged_nodes; 247 | // check for the left tail 248 | const double left_centroid_mean = h->nodes_mean[0]; 249 | const double left_centroid_weight = (double)h->nodes_weight[0]; 250 | const double merged_weight_d = (double)h->merged_weight; 251 | if (val < left_centroid_mean) { 252 | // note that this is different than h->nodes_mean[0] > min 253 | // ... 
this guarantees we divide by non-zero number and interpolation works 254 | const double width = left_centroid_mean - h->min; 255 | if (width > 0) { 256 | // must be a sample exactly at min 257 | if (val == h->min) { 258 | return 0.5 / merged_weight_d; 259 | } else { 260 | return (1 + (val - h->min) / width * (left_centroid_weight / 2 - 1)) / 261 | merged_weight_d; 262 | } 263 | } else { 264 | // this should be redundant with the check val < h->min 265 | return 0; 266 | } 267 | } 268 | // and the right tail 269 | const double right_centroid_mean = h->nodes_mean[n - 1]; 270 | const double right_centroid_weight = (double)h->nodes_weight[n - 1]; 271 | if (val > right_centroid_mean) { 272 | const double width = h->max - right_centroid_mean; 273 | if (width > 0) { 274 | if (val == h->max) { 275 | return 1 - 0.5 / merged_weight_d; 276 | } else { 277 | // there has to be a single sample exactly at max 278 | const double dq = (1 + (h->max - val) / width * (right_centroid_weight / 2 - 1)) / 279 | merged_weight_d; 280 | return 1 - dq; 281 | } 282 | } else { 283 | return 1; 284 | } 285 | } 286 | // we know that there are at least two centroids and mean[0] < x < mean[n-1] 287 | // that means that there are either one or more consecutive centroids all at exactly x 288 | // or there are consecutive centroids, c0 < x < c1 289 | double weightSoFar = 0; 290 | for (int it = 0; it < n - 1; it++) { 291 | // weightSoFar does not include weight[it] yet 292 | if (h->nodes_mean[it] == val) { 293 | // we have one or more centroids == x, treat them as one 294 | // dw will accumulate the weight of all of the centroids at x 295 | double dw = 0; 296 | while (it < n && h->nodes_mean[it] == val) { 297 | dw += (double)h->nodes_weight[it]; 298 | it++; 299 | } 300 | return (weightSoFar + dw / 2) / (double)h->merged_weight; 301 | } else if (h->nodes_mean[it] <= val && val < h->nodes_mean[it + 1]) { 302 | const double node_weight = (double)h->nodes_weight[it]; 303 | const double node_weight_next = (double)h->nodes_weight[it + 1]; 304 | const double node_mean = h->nodes_mean[it]; 305 | const double node_mean_next = h->nodes_mean[it + 1]; 306 | // landed between centroids ... 
check for floating point madness 307 | if (node_mean_next - node_mean > 0) { 308 | // note how we handle singleton centroids here 309 | // the point is that for singleton centroids, we know that their entire 310 | // weight is exactly at the centroid and thus shouldn't be involved in 311 | // interpolation 312 | double leftExcludedW = 0; 313 | double rightExcludedW = 0; 314 | if (node_weight == 1) { 315 | if (node_weight_next == 1) { 316 | // two singletons means no interpolation 317 | // left singleton is in, right is out 318 | return (weightSoFar + 1) / merged_weight_d; 319 | } else { 320 | leftExcludedW = 0.5; 321 | } 322 | } else if (node_weight_next == 1) { 323 | rightExcludedW = 0.5; 324 | } 325 | double dw = (node_weight + node_weight_next) / 2; 326 | 327 | // adjust endpoints for any singleton 328 | double dwNoSingleton = dw - leftExcludedW - rightExcludedW; 329 | 330 | double base = weightSoFar + node_weight / 2 + leftExcludedW; 331 | return (base + dwNoSingleton * (val - node_mean) / (node_mean_next - node_mean)) / 332 | merged_weight_d; 333 | } else { 334 | // this is simply caution against floating point madness 335 | // it is conceivable that the centroids will be different 336 | // but too near to allow safe interpolation 337 | double dw = (node_weight + node_weight_next) / 2; 338 | return (weightSoFar + dw) / merged_weight_d; 339 | } 340 | } else { 341 | weightSoFar += (double)h->nodes_weight[it]; 342 | } 343 | } 344 | return 1 - 0.5 / merged_weight_d; 345 | } 346 | 347 | static double td_internal_iterate_centroids_to_index(const td_histogram_t *h, const double index, 348 | const double left_centroid_weight, 349 | const int total_centroids, double *weightSoFar, 350 | int *node_pos) { 351 | if (left_centroid_weight > 1 && index < left_centroid_weight / 2) { 352 | // there is a single sample at min so we interpolate with less weight 353 | return h->min + (index - 1) / (left_centroid_weight / 2 - 1) * (h->nodes_mean[0] - h->min); 354 | } 355 | 356 | // usually the last centroid will have unit weight so this test will make it moot 357 | if (index > h->merged_weight - 1) { 358 | return h->max; 359 | } 360 | 361 | // if the right-most centroid has more than one sample, we still know 362 | // that one sample occurred at max so we can do some interpolation 363 | const double right_centroid_weight = (double)h->nodes_weight[total_centroids - 1]; 364 | const double right_centroid_mean = h->nodes_mean[total_centroids - 1]; 365 | if (right_centroid_weight > 1 && 366 | (double)h->merged_weight - index <= right_centroid_weight / 2) { 367 | return h->max - ((double)h->merged_weight - index - 1) / (right_centroid_weight / 2 - 1) * 368 | (h->max - right_centroid_mean); 369 | } 370 | 371 | for (; *node_pos < total_centroids - 1; (*node_pos)++) { 372 | const int i = *node_pos; 373 | const double node_weight = (double)h->nodes_weight[i]; 374 | const double node_weight_next = (double)h->nodes_weight[i + 1]; 375 | const double node_mean = h->nodes_mean[i]; 376 | const double node_mean_next = h->nodes_mean[i + 1]; 377 | const double dw = (node_weight + node_weight_next) / 2; 378 | if (*weightSoFar + dw > index) { 379 | // centroids i and i+1 bracket our current point 380 | // check for unit weight 381 | double leftUnit = 0; 382 | if (node_weight == 1) { 383 | if (index - *weightSoFar < 0.5) { 384 | // within the singleton's sphere 385 | return node_mean; 386 | } else { 387 | leftUnit = 0.5; 388 | } 389 | } 390 | double rightUnit = 0; 391 | if (node_weight_next == 1) { 392 | if (*weightSoFar + dw - 
index <= 0.5) { 393 | // no interpolation needed near singleton 394 | return node_mean_next; 395 | } 396 | rightUnit = 0.5; 397 | } 398 | const double z1 = index - *weightSoFar - leftUnit; 399 | const double z2 = *weightSoFar + dw - index - rightUnit; 400 | return weighted_average(node_mean, z2, node_mean_next, z1); 401 | } 402 | *weightSoFar += dw; 403 | } 404 | 405 | // weightSoFar = totalWeight - weight[total_centroids-1]/2 (very nearly) 406 | // so we interpolate out to max value ever seen 407 | const double z1 = index - h->merged_weight - right_centroid_weight / 2.0; 408 | const double z2 = right_centroid_weight / 2 - z1; 409 | return weighted_average(right_centroid_mean, z1, h->max, z2); 410 | } 411 | 412 | double td_quantile(td_histogram_t *h, double q) { 413 | td_compress(h); 414 | // q should be in [0,1] 415 | if (q < 0.0 || q > 1.0 || h->merged_nodes == 0) { 416 | return NAN; 417 | } 418 | // with one data point, all quantiles lead to Rome 419 | if (h->merged_nodes == 1) { 420 | return h->nodes_mean[0]; 421 | } 422 | 423 | // if values were stored in a sorted array, index would be the offset we are interested in 424 | const double index = q * (double)h->merged_weight; 425 | 426 | // beyond the boundaries, we return min or max 427 | // usually, the first centroid will have unit weight so this will make it moot 428 | if (index < 1) { 429 | return h->min; 430 | } 431 | 432 | // we know that there are at least two centroids now 433 | const int n = h->merged_nodes; 434 | 435 | // if the left centroid has more than one sample, we still know 436 | // that one sample occurred at min so we can do some interpolation 437 | const double left_centroid_weight = (double)h->nodes_weight[0]; 438 | 439 | // in between extremes we interpolate between centroids 440 | double weightSoFar = left_centroid_weight / 2; 441 | int i = 0; 442 | return td_internal_iterate_centroids_to_index(h, index, left_centroid_weight, n, &weightSoFar, 443 | &i); 444 | } 445 | 446 | int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length) { 447 | td_compress(h); 448 | 449 | if (NULL == quantiles || NULL == values) { 450 | return EINVAL; 451 | } 452 | 453 | const int n = h->merged_nodes; 454 | if (n == 0) { 455 | for (size_t i = 0; i < length; i++) { 456 | values[i] = NAN; 457 | } 458 | return 0; 459 | } 460 | if (n == 1) { 461 | for (size_t i = 0; i < length; i++) { 462 | const double requested_quantile = quantiles[i]; 463 | 464 | // q should be in [0,1] 465 | if (requested_quantile < 0.0 || requested_quantile > 1.0) { 466 | values[i] = NAN; 467 | } else { 468 | // with one data point, all quantiles lead to Rome 469 | values[i] = h->nodes_mean[0]; 470 | } 471 | } 472 | return 0; 473 | } 474 | 475 | // we know that there are at least two centroids now 476 | // if the left centroid has more than one sample, we still know 477 | // that one sample occurred at min so we can do some interpolation 478 | const double left_centroid_weight = (double)h->nodes_weight[0]; 479 | 480 | // in between extremes we interpolate between centroids 481 | double weightSoFar = left_centroid_weight / 2; 482 | int node_pos = 0; 483 | 484 | // to avoid allocations we use the values array for intermediate computation 485 | // i.e. 
to store the expected cumulative count at each percentile 486 | for (size_t qpos = 0; qpos < length; qpos++) { 487 | const double index = quantiles[qpos] * (double)h->merged_weight; 488 | values[qpos] = td_internal_iterate_centroids_to_index(h, index, left_centroid_weight, n, 489 | &weightSoFar, &node_pos); 490 | } 491 | return 0; 492 | } 493 | 494 | static double td_internal_trimmed_mean(const td_histogram_t *h, const double leftmost_weight, 495 | const double rightmost_weight) { 496 | double count_done = 0; 497 | double trimmed_sum = 0; 498 | double trimmed_count = 0; 499 | for (int i = 0; i < h->merged_nodes; i++) { 500 | 501 | const double n_weight = (double)h->nodes_weight[i]; 502 | // Assume the whole centroid falls into the range 503 | double count_add = n_weight; 504 | 505 | // If we haven't reached the low threshold yet, skip appropriate part of the centroid. 506 | count_add -= __td_min(__td_max(0, leftmost_weight - count_done), count_add); 507 | 508 | // If we have reached the upper threshold, ignore the overflowing part of the centroid. 509 | 510 | count_add = __td_min(__td_max(0, rightmost_weight - count_done), count_add); 511 | 512 | // consider the whole centroid processed 513 | count_done += n_weight; 514 | 515 | // increment the sum / count 516 | trimmed_sum += h->nodes_mean[i] * count_add; 517 | trimmed_count += count_add; 518 | 519 | // break once we cross the high threshold 520 | if (count_done >= rightmost_weight) 521 | break; 522 | } 523 | 524 | return trimmed_sum / trimmed_count; 525 | } 526 | 527 | double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut) { 528 | td_compress(h); 529 | // proportion_to_cut should be in [0,1] 530 | if (h->merged_nodes == 0 || proportion_to_cut < 0.0 || proportion_to_cut > 1.0) { 531 | return NAN; 532 | } 533 | // with one data point, all values lead to Rome 534 | if (h->merged_nodes == 1) { 535 | return h->nodes_mean[0]; 536 | } 537 | 538 | /* translate the percentiles to counts */ 539 | const double leftmost_weight = floor((double)h->merged_weight * proportion_to_cut); 540 | const double rightmost_weight = ceil((double)h->merged_weight * (1.0 - proportion_to_cut)); 541 | 542 | return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight); 543 | } 544 | 545 | double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut) { 546 | td_compress(h); 547 | // leftmost_cut and rightmost_cut should be in [0,1] 548 | if (h->merged_nodes == 0 || leftmost_cut < 0.0 || leftmost_cut > 1.0 || rightmost_cut < 0.0 || 549 | rightmost_cut > 1.0) { 550 | return NAN; 551 | } 552 | // with one data point, all values lead to Rome 553 | if (h->merged_nodes == 1) { 554 | return h->nodes_mean[0]; 555 | } 556 | 557 | /* translate the percentiles to counts */ 558 | const double leftmost_weight = floor((double)h->merged_weight * leftmost_cut); 559 | const double rightmost_weight = ceil((double)h->merged_weight * rightmost_cut); 560 | 561 | return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight); 562 | } 563 | 564 | int td_add(td_histogram_t *h, double mean, long long weight) { 565 | if (should_td_compress(h)) { 566 | const int overflow_res = td_compress(h); 567 | if (overflow_res != 0) 568 | return overflow_res; 569 | } 570 | const int pos = next_node(h); 571 | if (pos >= h->cap) 572 | return EDOM; 573 | if (_tdigest_long_long_add_safe(h->unmerged_weight, weight) == false) 574 | return EDOM; 575 | const long long new_unmerged_weight = h->unmerged_weight + weight; 576 | if 
(_tdigest_long_long_add_safe(new_unmerged_weight, h->merged_weight) == false) 577 | return EDOM; 578 | const long long new_total_weight = new_unmerged_weight + h->merged_weight; 579 | // double-precision overflow detected 580 | const int overflow_res = 581 | _check_td_overflow((double)new_unmerged_weight, (double)new_total_weight); 582 | if (overflow_res != 0) 583 | return overflow_res; 584 | 585 | if (mean < h->min) { 586 | h->min = mean; 587 | } 588 | if (mean > h->max) { 589 | h->max = mean; 590 | } 591 | h->nodes_mean[pos] = mean; 592 | h->nodes_weight[pos] = weight; 593 | h->unmerged_nodes++; 594 | h->unmerged_weight = new_unmerged_weight; 595 | return 0; 596 | } 597 | 598 | int td_compress(td_histogram_t *h) { 599 | if (h->unmerged_nodes == 0) { 600 | return 0; 601 | } 602 | int N = h->merged_nodes + h->unmerged_nodes; 603 | td_qsort(h->nodes_mean, h->nodes_weight, 0, N - 1); 604 | const double total_weight = (double)h->merged_weight + (double)h->unmerged_weight; 605 | // double-precision overflow detected 606 | const int overflow_res = _check_td_overflow((double)h->unmerged_weight, (double)total_weight); 607 | if (overflow_res != 0) 608 | return overflow_res; 609 | if (total_weight <= 1) 610 | return 0; 611 | const double denom = 2 * MM_PI * total_weight * log(total_weight); 612 | if (_check_overflow(denom) != 0) 613 | return EDOM; 614 | 615 | // Compute the normalizer given compression and number of points. 616 | const double normalizer = h->compression / denom; 617 | if (_check_overflow(normalizer) != 0) 618 | return EDOM; 619 | int cur = 0; 620 | double weight_so_far = 0; 621 | 622 | for (int i = 1; i < N; i++) { 623 | const double proposed_weight = (double)h->nodes_weight[cur] + (double)h->nodes_weight[i]; 624 | const double z = proposed_weight * normalizer; 625 | // quantile up to cur 626 | const double q0 = weight_so_far / total_weight; 627 | // quantile up to cur + i 628 | const double q2 = (weight_so_far + proposed_weight) / total_weight; 629 | // Convert a quantile to the k-scale 630 | const bool should_add = (z <= (q0 * (1 - q0))) && (z <= (q2 * (1 - q2))); 631 | // next point will fit 632 | // so merge into existing centroid 633 | if (should_add) { 634 | h->nodes_weight[cur] += h->nodes_weight[i]; 635 | const double delta = h->nodes_mean[i] - h->nodes_mean[cur]; 636 | const double weighted_delta = (delta * h->nodes_weight[i]) / h->nodes_weight[cur]; 637 | h->nodes_mean[cur] += weighted_delta; 638 | } else { 639 | weight_so_far += h->nodes_weight[cur]; 640 | cur++; 641 | h->nodes_weight[cur] = h->nodes_weight[i]; 642 | h->nodes_mean[cur] = h->nodes_mean[i]; 643 | } 644 | if (cur != i) { 645 | h->nodes_weight[i] = 0; 646 | h->nodes_mean[i] = 0.0; 647 | } 648 | } 649 | h->merged_nodes = cur + 1; 650 | h->merged_weight = total_weight; 651 | h->unmerged_nodes = 0; 652 | h->unmerged_weight = 0; 653 | h->total_compressions++; 654 | return 0; 655 | } 656 | 657 | double td_min(td_histogram_t *h) { return h->min; } 658 | 659 | double td_max(td_histogram_t *h) { return h->max; } 660 | 661 | int td_compression(td_histogram_t *h) { return h->compression; } 662 | 663 | const long long *td_centroids_weight(td_histogram_t *h) { return h->nodes_weight; } 664 | 665 | const double *td_centroids_mean(td_histogram_t *h) { return h->nodes_mean; } 666 | 667 | long long td_centroids_weight_at(td_histogram_t *h, int pos) { return h->nodes_weight[pos]; } 668 | 669 | double td_centroids_mean_at(td_histogram_t *h, int pos) { 670 | if (pos < 0 || pos > h->merged_nodes) { 671 | return NAN; 672 | } 673 
| return h->nodes_mean[pos]; 674 | } 675 | -------------------------------------------------------------------------------- /src/tdigest.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <stdlib.h> 3 | 4 | /** 5 | * Adaptive histogram based on something like streaming k-means crossed with Q-digest. 6 | * The implementation is a direct descendant of MergingDigest 7 | * https://github.com/tdunning/t-digest/ 8 | * 9 | * Copyright (c) 2021 Redis, All rights reserved. 10 | * Copyright (c) 2018 Andrew Werner, All rights reserved. 11 | * 12 | * The special characteristics of this algorithm are: 13 | * 14 | * - smaller summaries than Q-digest 15 | * 16 | * - provides part per million accuracy for extreme quantiles and typically <1000 ppm accuracy 17 | * for middle quantiles 18 | * 19 | * - fast 20 | * 21 | * - simple 22 | * 23 | * - easy to adapt for use with map-reduce 24 | */ 25 | 26 | #define MM_PI 3.14159265358979323846 27 | 28 | struct td_histogram { 29 | // compression is a setting used to configure the size of centroids when merged. 30 | double compression; 31 | 32 | double min; 33 | double max; 34 | 35 | // cap is the total size of nodes 36 | int cap; 37 | // merged_nodes is the number of merged nodes at the front of nodes. 38 | int merged_nodes; 39 | // unmerged_nodes is the number of buffered nodes. 40 | int unmerged_nodes; 41 | 42 | // we run the merge in reverse every other merge to avoid left-to-right bias in merging 43 | long long total_compressions; 44 | 45 | long long merged_weight; 46 | long long unmerged_weight; 47 | 48 | double *nodes_mean; 49 | long long *nodes_weight; 50 | }; 51 | 52 | typedef struct td_histogram td_histogram_t; 53 | 54 | #ifdef __cplusplus 55 | extern "C" { 56 | #endif 57 | 58 | /** 59 | * Allocate the memory, initialise the t-digest, and return the histogram. 60 | * @param compression The compression parameter. 61 | * 100 is a common value for normal uses. 62 | * 1000 is extremely large. 63 | * The number of centroids retained will be a smallish (usually less than 10) multiple of this 64 | * number. 65 | * @return the histogram on success, NULL if allocation failed. 66 | */ 67 | td_histogram_t *td_new(double compression); 68 | 69 | /** 70 | * Allocate the memory and initialise the t-digest. 71 | * 72 | * @param compression The compression parameter. 73 | * 100 is a common value for normal uses. 74 | * 1000 is extremely large. 75 | * The number of centroids retained will be a smallish (usually less than 10) multiple of this 76 | * number. 77 | * @param result Output parameter to capture allocated histogram. 78 | * @return 0 on success, 1 if allocation failed. 79 | */ 80 | int td_init(double compression, td_histogram_t **result); 81 | 82 | /** 83 | * Frees the memory associated with the t-digest. 84 | * 85 | * @param h The histogram you want to free. 86 | */ 87 | void td_free(td_histogram_t *h); 88 | 89 | /** 90 | * Reset a histogram to zero - empty out a histogram and re-initialise it 91 | * 92 | * If you want to re-use an existing histogram, but reset everything back to zero, this 93 | * is the routine to use. 94 | * 95 | * @param h The histogram you want to reset to empty. 96 | * 97 | */ 98 | void td_reset(td_histogram_t *h); 99 | 100 | /** 101 | * Adds a sample to a histogram. 102 | * 103 | * @param val The value to add. 104 | * @param weight The weight of this point. 105 | * @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided 106 | * weight.
107 | * 108 | */ 109 | int td_add(td_histogram_t *h, double val, long long weight); 110 | 111 | /** 112 | * Re-examines a t-digest to determine whether some centroids are redundant. If your data are 113 | * perversely ordered, this may be a good idea. Even if not, this may save 20% or so in space. 114 | * 115 | * The cost is roughly the same as adding as many data points as there are centroids. This 116 | * is typically < 10 * compression, but could be as high as 100 * compression. 117 | * This is a destructive operation that is not thread-safe. 118 | * 119 | * @param h The histogram you want to compress. 120 | * @return 0 on success, EDOM if overflow was detected during the compression. 121 | * If overflow is detected the histogram is not changed. 122 | * 123 | */ 124 | int td_compress(td_histogram_t *h); 125 | 126 | /** 127 | * Merges all of the values from 'from' into 'this' histogram. 128 | * 129 | * @param h "This" pointer 130 | * @param from Histogram to copy values from. 131 | * @return 0 on success, EDOM if overflow was detected as a consequence of merging the 132 | * provided histogram. If overflow is detected the original histogram is not modified. 133 | */ 134 | int td_merge(td_histogram_t *h, td_histogram_t *from); 135 | 136 | /** 137 | * Returns the fraction of all points added which are ≤ x. 138 | * 139 | * @param x The cutoff for the cdf. 140 | * @return The fraction of all data which is less than or equal to x. 141 | */ 142 | double td_cdf(td_histogram_t *h, double x); 143 | 144 | /** 145 | * Returns an estimate of the cutoff such that a specified fraction of the data 146 | * added to this TDigest would be less than or equal to the cutoff. 147 | * 148 | * @param q The desired fraction 149 | * @return The value x such that cdf(x) == q. 150 | */ 151 | double td_quantile(td_histogram_t *h, double q); 152 | 153 | /** 154 | * Returns estimates of the cutoffs such that the specified fractions of the data 155 | * added to this TDigest would be less than or equal to the cutoffs. 156 | * 157 | * @param quantiles The ordered percentiles array to get the values for. 158 | * @param values Destination array containing the values at the given quantiles. 159 | * The values array should be allocated by the caller. 160 | * @return 0 on success, EINVAL if the provided quantiles or destination array is null. 161 | */ 162 | int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length); 163 | 164 | /** 165 | * Returns the trimmed mean ignoring values outside given cutoff upper and lower limits. 166 | * 167 | * @param leftmost_cut Fraction to cut off of the left tail of the distribution. 168 | * @param rightmost_cut Fraction to cut off of the right tail of the distribution. 169 | * @return The trimmed mean ignoring values outside given cutoff upper and lower limits. 170 | */ 171 | double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut); 172 | 173 | /** 174 | * Returns the trimmed mean ignoring values outside a given symmetric cutoff limit. 175 | * 176 | * @param proportion_to_cut Fraction to cut off of the left and right tails of the distribution. 177 | * @return The trimmed mean ignoring values outside the given cutoff upper and lower limits. 178 | */ 179 | double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut); 180 | 181 | /** 182 | * Returns the current compression factor. 183 | * 184 | * @return The compression factor originally used to set up the TDigest.
185 | */ 186 | int td_compression(td_histogram_t *h); 187 | 188 | /** 189 | * Returns the number of points that have been added to this TDigest. 190 | * 191 | * @return The sum of the weights on all centroids. 192 | */ 193 | long long td_size(td_histogram_t *h); 194 | 195 | /** 196 | * Returns the number of centroids being used by this TDigest. 197 | * 198 | * @return The number of centroids being used. 199 | */ 200 | int td_centroid_count(td_histogram_t *h); 201 | 202 | /** 203 | * Get minimum value from the histogram. Will return __DBL_MAX__ if the histogram 204 | * is empty. 205 | * 206 | * @param h "This" pointer 207 | */ 208 | double td_min(td_histogram_t *h); 209 | 210 | /** 211 | * Get maximum value from the histogram. Will return - __DBL_MAX__ if the histogram 212 | * is empty. 213 | * 214 | * @param h "This" pointer 215 | */ 216 | double td_max(td_histogram_t *h); 217 | 218 | /** 219 | * Get the full centroids weight array for 'this' histogram. 220 | * 221 | * @param h "This" pointer 222 | * 223 | * @return The full centroids weight array. 224 | */ 225 | const long long *td_centroids_weight(td_histogram_t *h); 226 | 227 | /** 228 | * Get the full centroids mean array for 'this' histogram. 229 | * 230 | * @param h "This" pointer 231 | * 232 | * @return The full centroids mean array. 233 | */ 234 | const double *td_centroids_mean(td_histogram_t *h); 235 | 236 | /** 237 | * Get the centroid weight for 'this' histogram and 'pos'. 238 | * 239 | * @param h "This" pointer 240 | * @param pos centroid position. 241 | * 242 | * @return The centroid weight. 243 | */ 244 | long long td_centroids_weight_at(td_histogram_t *h, int pos); 245 | 246 | /** 247 | * Get the centroid mean for 'this' histogram and 'pos'. 248 | * 249 | * @param h "This" pointer 250 | * @param pos centroid position. 251 | * 252 | * @return The centroid mean. 
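 *
 * A minimal iteration sketch over the centroid accessors above (illustrative
 * only; assumes td_compress(h) was called first so every node is merged):
 *
 *     for (int i = 0; i < td_centroid_count(h); i++) {
 *         double    mean   = td_centroids_mean_at(h, i);
 *         long long weight = td_centroids_weight_at(h, i);
 *         // consume the (mean, weight) pair, e.g. for serialisation
 *     }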
253 | */ 254 | double td_centroids_mean_at(td_histogram_t *h, int pos); 255 | 256 | #ifdef __cplusplus 257 | } 258 | #endif 259 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | if (BUILD_BENCHMARK) 3 | if (UNIX) 4 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O2 -g -ggdb -fno-omit-frame-pointer") 5 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -g -ggdb -fno-omit-frame-pointer") 6 | set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Suppressing benchmark's tests" FORCE) 7 | add_subdirectory(vendor/google/benchmark) 8 | include_directories(vendor/google/benchmark/include) 9 | add_executable(histogram_benchmark benchmark/histogram_benchmark.cpp) 10 | target_link_libraries(histogram_benchmark tdigest benchmark::benchmark) 11 | else() 12 | message(WARNING 13 | "google.benchmark - microbenchmarks disabled on WIN32 platforms") 14 | endif() 15 | endif() 16 | if (BUILD_TESTS) 17 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c99") 18 | add_executable(td_test unit/td_test.c unit/minunit.h) 19 | target_link_libraries(td_test tdigest m) 20 | enable_testing() 21 | add_test(td_test td_test) 22 | endif() 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /tests/benchmark/histogram_benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "tdigest.h" 3 | #include 4 | #include 5 | 6 | #ifdef _WIN32 7 | #pragma comment(lib, "Shlwapi.lib") 8 | #ifdef _DEBUG 9 | #pragma comment(lib, "benchmarkd.lib") 10 | #else 11 | #pragma comment(lib, "benchmark.lib") 12 | #endif 13 | #endif 14 | 15 | int64_t min_value = 1; 16 | int64_t min_compression = 100; 17 | int64_t max_compression = 500; 18 | int64_t step_compression_unit = 100; 19 | 20 | static void generate_arguments_pairs(benchmark::internal::Benchmark *b) { 21 | for (int64_t compression = min_compression; compression <= max_compression; 22 | compression += step_compression_unit) { 23 | b = b->ArgPair((double)compression, INT64_C(10000000)); 24 | } 25 | } 26 | 27 | static void BM_td_add_uniform_dist(benchmark::State &state) { 28 | const double compression = state.range(0); 29 | const int64_t stream_size = state.range(1); 30 | td_histogram_t *mdigest = td_new(compression); 31 | std::vector input; 32 | input.resize(stream_size, 0); 33 | std::mt19937_64 rng; 34 | rng.seed(std::random_device()()); 35 | std::uniform_real_distribution dist(0, 1); 36 | 37 | for (double &i : input) { 38 | i = dist(rng); 39 | } 40 | 41 | while (state.KeepRunning()) { 42 | for (int i = 0; i < stream_size; ++i) { 43 | td_add(mdigest, input[i], 1); 44 | } 45 | td_compress(mdigest); 46 | // read/write barrier 47 | benchmark::ClobberMemory(); 48 | state.SetItemsProcessed(stream_size); 49 | // Set the counter as a thread-average quantity. It will 50 | // be presented divided by the number of threads ( in our case just one thread ). 
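        // The counters below are overwritten on every benchmark iteration, so
        // the reported Centroid_Count and Total_Compressions describe the
        // digest state after the most recent add + compress pass.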
51 | state.counters["Centroid_Count"] = 52 | benchmark::Counter(td_centroid_count(mdigest), benchmark::Counter::kAvgThreads); 53 | state.counters["Total_Compressions"] = 54 | benchmark::Counter(mdigest->total_compressions, benchmark::Counter::kAvgThreads); 55 | } 56 | } 57 | 58 | static void BM_td_add_lognormal_dist(benchmark::State &state) { 59 | const double compression = state.range(0); 60 | const int64_t stream_size = state.range(1); 61 | td_histogram_t *mdigest = td_new(compression); 62 | std::vector input; 63 | input.resize(stream_size, 0); 64 | std::mt19937_64 rng; 65 | rng.seed(std::random_device()()); 66 | std::lognormal_distribution dist(1, 0.5); 67 | 68 | for (double &i : input) { 69 | i = dist(rng); 70 | } 71 | 72 | while (state.KeepRunning()) { 73 | for (int i = 0; i < stream_size; ++i) { 74 | td_add(mdigest, input[i], 1); 75 | } 76 | td_compress(mdigest); 77 | // read/write barrier 78 | benchmark::ClobberMemory(); 79 | state.SetItemsProcessed(stream_size); 80 | // Set the counter as a thread-average quantity. It will 81 | // be presented divided by the number of threads ( in our case just one thread ). 82 | state.counters["Centroid_Count"] = 83 | benchmark::Counter(td_centroid_count(mdigest), benchmark::Counter::kAvgThreads); 84 | state.counters["Total_Compressions"] = 85 | benchmark::Counter(mdigest->total_compressions, benchmark::Counter::kAvgThreads); 86 | } 87 | } 88 | 89 | static void BM_td_quantile_lognormal_dist(benchmark::State &state) { 90 | const double compression = state.range(0); 91 | const int64_t stream_size = state.range(1); 92 | td_histogram_t *mdigest = td_new(compression); 93 | std::vector input; 94 | input.resize(stream_size, 0); 95 | std::mt19937_64 rng; 96 | rng.seed(std::random_device()()); 97 | std::uniform_real_distribution dist(0, 1); 98 | std::lognormal_distribution distSamples(1, 0.5); 99 | 100 | for (double &i : input) { 101 | i = dist(rng); 102 | td_add(mdigest, distSamples(rng), 1); 103 | } 104 | td_compress(mdigest); 105 | 106 | while (state.KeepRunning()) { 107 | for (int i = 0; i < stream_size; ++i) { 108 | td_quantile(mdigest, input[i]); 109 | } 110 | // read/write barrier 111 | benchmark::ClobberMemory(); 112 | state.SetItemsProcessed(stream_size); 113 | // Set the counter as a thread-average quantity. It will 114 | // be presented divided by the number of threads ( in our case just one thread ). 
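        // input[] holds uniform(0,1) draws that are passed directly as the
        // quantile argument q, so each iteration sweeps queries across the
        // full quantile range of the pre-built lognormal digest.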
115 | state.counters["Centroid_Count"] = 116 | benchmark::Counter(td_centroid_count(mdigest), benchmark::Counter::kAvgThreads); 117 | state.counters["Total_Compressions"] = 118 | benchmark::Counter(mdigest->total_compressions, benchmark::Counter::kAvgThreads); 119 | } 120 | } 121 | 122 | static void BM_td_merge_lognormal_dist(benchmark::State &state) { 123 | const double compression = state.range(0); 124 | const int64_t stream_size = 100000; 125 | td_histogram_t *mdigest = td_new(compression); 126 | td_histogram_t *mdigest2 = td_new(compression); 127 | std::vector input; 128 | input.resize(stream_size, 0); 129 | std::mt19937_64 rng; 130 | rng.seed(std::random_device()()); 131 | std::uniform_real_distribution dist(0, 1); 132 | std::lognormal_distribution distSamples(1, 0.5); 133 | 134 | for (double &i : input) { 135 | i = dist(rng); 136 | td_add(mdigest, distSamples(rng), 1); 137 | td_add(mdigest2, distSamples(rng), 1); 138 | } 139 | td_compress(mdigest); 140 | 141 | while (state.KeepRunning()) { 142 | for (int i = 0; i < stream_size; ++i) { 143 | td_merge(mdigest, mdigest2); 144 | } 145 | // read/write barrier 146 | benchmark::ClobberMemory(); 147 | state.SetItemsProcessed(stream_size); 148 | // Set the counter as a thread-average quantity. It will 149 | // be presented divided by the number of threads ( in our case just one thread ). 150 | state.counters["Centroid_Count"] = 151 | benchmark::Counter(td_centroid_count(mdigest), benchmark::Counter::kAvgThreads); 152 | state.counters["Total_Compressions"] = 153 | benchmark::Counter(mdigest->total_compressions, benchmark::Counter::kAvgThreads); 154 | } 155 | } 156 | 157 | static void BM_td_trimmed_mean_symmetric_lognormal_dist(benchmark::State &state) { 158 | const double compression = state.range(0); 159 | const int64_t stream_size = state.range(1); 160 | td_histogram_t *mdigest = td_new(compression); 161 | std::vector input; 162 | input.resize(stream_size, 0); 163 | std::mt19937_64 rng; 164 | rng.seed(std::random_device()()); 165 | std::uniform_real_distribution dist(0, 1); 166 | std::lognormal_distribution distSamples(1, 0.5); 167 | 168 | for (double &i : input) { 169 | i = dist(rng); 170 | td_add(mdigest, distSamples(rng), 1); 171 | } 172 | td_compress(mdigest); 173 | 174 | while (state.KeepRunning()) { 175 | for (int i = 0; i < stream_size; ++i) { 176 | td_trimmed_mean_symmetric(mdigest, input[i]); 177 | } 178 | // read/write barrier 179 | benchmark::ClobberMemory(); 180 | state.SetItemsProcessed(stream_size); 181 | // Set the counter as a thread-average quantity. It will 182 | // be presented divided by the number of threads ( in our case just one thread ). 
183 | state.counters["Centroid_Count"] = 184 | benchmark::Counter(td_centroid_count(mdigest), benchmark::Counter::kAvgThreads); 185 | state.counters["Total_Compressions"] = 186 | benchmark::Counter(mdigest->total_compressions, benchmark::Counter::kAvgThreads); 187 | } 188 | } 189 | 190 | static void BM_td_quantile_lognormal_dist_given_array(benchmark::State &state) { 191 | const double compression = state.range(0); 192 | const int64_t stream_size = state.range(1); 193 | td_histogram_t *mdigest = td_new(compression); 194 | std::vector input; 195 | input.resize(stream_size, 0); 196 | std::mt19937_64 rng; 197 | rng.seed(12345); 198 | std::lognormal_distribution distSamples(1, 0.5); 199 | const double percentile_list[4] = {50.0, 95.0, 99.0, 99.9}; 200 | 201 | for (double &i : input) { 202 | td_add(mdigest, distSamples(rng), 1); 203 | } 204 | td_compress(mdigest); 205 | int64_t items_processed = 0; 206 | for (auto _ : state) { 207 | for (auto percentile : percentile_list) { 208 | benchmark::DoNotOptimize(td_quantile(mdigest, percentile)); 209 | // read/write barrier 210 | benchmark::ClobberMemory(); 211 | } 212 | items_processed += 4; 213 | // read/write barrier 214 | benchmark::ClobberMemory(); 215 | state.SetItemsProcessed(stream_size); 216 | // Set the counter as a thread-average quantity. It will 217 | // be presented divided by the number of threads ( in our case just one thread ). 218 | state.counters["Centroid_Count"] = 219 | benchmark::Counter(td_centroid_count(mdigest), benchmark::Counter::kAvgThreads); 220 | state.counters["Total_Compressions"] = 221 | benchmark::Counter(mdigest->total_compressions, benchmark::Counter::kAvgThreads); 222 | } 223 | } 224 | 225 | static void BM_td_quantiles_lognormal_dist_given_array(benchmark::State &state) { 226 | const double compression = state.range(0); 227 | const int64_t stream_size = state.range(1); 228 | td_histogram_t *mdigest = td_new(compression); 229 | std::vector input; 230 | input.resize(stream_size, 0); 231 | std::mt19937_64 rng; 232 | rng.seed(12345); 233 | std::lognormal_distribution distSamples(1, 0.5); 234 | const double percentile_list[4] = {50.0, 95.0, 99.0, 99.9}; 235 | double values[4] = {.0}; 236 | 237 | for (double &i : input) { 238 | td_add(mdigest, distSamples(rng), 1); 239 | } 240 | td_compress(mdigest); 241 | int64_t items_processed = 0; 242 | for (auto _ : state) { 243 | benchmark::DoNotOptimize(td_quantiles(mdigest, percentile_list, values, 4)); 244 | items_processed += 4; 245 | // read/write barrier 246 | benchmark::ClobberMemory(); 247 | state.SetItemsProcessed(stream_size); 248 | // Set the counter as a thread-average quantity. It will 249 | // be presented divided by the number of threads ( in our case just one thread ). 
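        // Note: percentile_list holds 50.0, 95.0, 99.0 and 99.9, while the
        // header documents the quantile argument as a fraction in [0, 1] and
        // the unit tests report NaN for out-of-range values, so the two
        // *_given_array benchmarks appear to exercise the out-of-range path.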
250 | state.counters["Centroid_Count"] = 251 | benchmark::Counter(td_centroid_count(mdigest), benchmark::Counter::kAvgThreads); 252 | state.counters["Total_Compressions"] = 253 | benchmark::Counter(mdigest->total_compressions, benchmark::Counter::kAvgThreads); 254 | } 255 | } 256 | 257 | // Register the functions as a benchmark 258 | BENCHMARK(BM_td_add_uniform_dist)->Apply(generate_arguments_pairs); 259 | BENCHMARK(BM_td_add_lognormal_dist)->Apply(generate_arguments_pairs); 260 | BENCHMARK(BM_td_quantile_lognormal_dist)->Apply(generate_arguments_pairs); 261 | BENCHMARK(BM_td_quantile_lognormal_dist_given_array)->Apply(generate_arguments_pairs); 262 | BENCHMARK(BM_td_quantiles_lognormal_dist_given_array)->Apply(generate_arguments_pairs); 263 | BENCHMARK(BM_td_merge_lognormal_dist)->Apply(generate_arguments_pairs); 264 | BENCHMARK(BM_td_trimmed_mean_symmetric_lognormal_dist)->Apply(generate_arguments_pairs); 265 | 266 | BENCHMARK_MAIN(); -------------------------------------------------------------------------------- /tests/unit/minunit.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 David Siñuela Pastor, siu.4coders@gmail.com 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining 5 | * a copy of this software and associated documentation files (the 6 | * "Software"), to deal in the Software without restriction, including 7 | * without limitation the rights to use, copy, modify, merge, publish, 8 | * distribute, sublicense, and/or sell copies of the Software, and to 9 | * permit persons to whom the Software is furnished to do so, subject to 10 | * the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be 13 | * included in all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 19 | * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 20 | * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 21 | * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | */ 23 | // clang-format off 24 | #ifndef MINUNIT_MINUNIT_H 25 | #define MINUNIT_MINUNIT_H 26 | 27 | #ifdef __cplusplus 28 | extern "C" { 29 | #endif 30 | 31 | #if defined(_WIN32) 32 | #include 33 | #if defined(_MSC_VER) && _MSC_VER < 1900 34 | #define snprintf _snprintf 35 | #define __func__ __FUNCTION__ 36 | #endif 37 | 38 | #elif defined(__unix__) || defined(__unix) || defined(unix) || \ 39 | (defined(__APPLE__) && defined(__MACH__)) 40 | 41 | /* Change POSIX C SOURCE version for pure c99 compilers */ 42 | #if !defined(_POSIX_C_SOURCE) || _POSIX_C_SOURCE < 200112L 43 | #undef _POSIX_C_SOURCE 44 | #define _POSIX_C_SOURCE 200112L 45 | #endif 46 | 47 | #include /* POSIX flags */ 48 | #include /* clock_gettime(), time() */ 49 | #include /* gethrtime(), gettimeofday() */ 50 | #include 51 | #include 52 | #include 53 | 54 | #if defined(__MACH__) && defined(__APPLE__) 55 | #include 56 | #include 57 | #endif 58 | 59 | #if __GNUC__ >= 5 && !defined(__STDC_VERSION__) 60 | #define __func__ __extension__ __FUNCTION__ 61 | #endif 62 | 63 | #else 64 | #error "Unable to define timers for an unknown OS." 
65 | #endif 66 | 67 | #include 68 | #include 69 | 70 | /* Maximum length of last message */ 71 | #define MINUNIT_MESSAGE_LEN 1024 72 | /* Accuracy with which floats are compared */ 73 | #define MINUNIT_EPSILON 1E-12 74 | 75 | /* Misc. counters */ 76 | static int minunit_run = 0; 77 | static int minunit_assert = 0; 78 | static int minunit_fail = 0; 79 | static int minunit_status = 0; 80 | 81 | /* Timers */ 82 | static double minunit_real_timer = 0; 83 | static double minunit_proc_timer = 0; 84 | 85 | /* Last message */ 86 | static char minunit_last_message[MINUNIT_MESSAGE_LEN]; 87 | 88 | /* Test setup and teardown function pointers */ 89 | static void (*minunit_setup)(void) = NULL; 90 | static void (*minunit_teardown)(void) = NULL; 91 | 92 | /* Definitions */ 93 | #define MU_TEST(method_name) static void method_name(void) 94 | #define MU_TEST_SUITE(suite_name) static void suite_name(void) 95 | 96 | #define MU__SAFE_BLOCK(block) \ 97 | do { \ 98 | block \ 99 | } while (0) 100 | 101 | /* Run test suite and unset setup and teardown functions */ 102 | #define MU_RUN_SUITE(suite_name) \ 103 | MU__SAFE_BLOCK(suite_name(); minunit_setup = NULL; minunit_teardown = NULL;) 104 | 105 | /* Configure setup and teardown functions */ 106 | #define MU_SUITE_CONFIGURE(setup_fun, teardown_fun) \ 107 | MU__SAFE_BLOCK(minunit_setup = setup_fun; minunit_teardown = teardown_fun;) 108 | 109 | /* Test runner */ 110 | #define MU_RUN_TEST(test) \ 111 | MU__SAFE_BLOCK( \ 112 | if (minunit_real_timer == 0 && minunit_proc_timer == 0) { \ 113 | minunit_real_timer = mu_timer_real(); \ 114 | minunit_proc_timer = mu_timer_cpu(); \ 115 | } if (minunit_setup) (*minunit_setup)(); \ 116 | minunit_status = 0; test(); minunit_run++; if (minunit_status) { \ 117 | minunit_fail++; \ 118 | printf("F"); \ 119 | printf("\n%s\n", minunit_last_message); \ 120 | } fflush(stdout); \ 121 | if (minunit_teardown)(*minunit_teardown)();) 122 | 123 | /* Report */ 124 | #define MU_REPORT() \ 125 | MU__SAFE_BLOCK(double minunit_end_real_timer; double minunit_end_proc_timer; \ 126 | printf("\n\n%d tests, %d assertions, %d failures\n", minunit_run, \ 127 | minunit_assert, minunit_fail); \ 128 | minunit_end_real_timer = mu_timer_real(); \ 129 | minunit_end_proc_timer = mu_timer_cpu(); \ 130 | printf("\nFinished in %.8f seconds (real) %.8f seconds (proc)\n\n", \ 131 | minunit_end_real_timer - minunit_real_timer, \ 132 | minunit_end_proc_timer - minunit_proc_timer);) 133 | #define MU_EXIT_CODE minunit_fail 134 | 135 | /* Assertions */ 136 | #define mu_check(test) \ 137 | MU__SAFE_BLOCK( \ 138 | minunit_assert++; if (!(test)) { \ 139 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, "%s failed:\n\t%s:%d: %s", \ 140 | __func__, __FILE__, __LINE__, #test); \ 141 | minunit_status = 1; \ 142 | return; \ 143 | } else { printf("."); }) 144 | 145 | #define mu_fail(message) \ 146 | MU__SAFE_BLOCK(minunit_assert++; \ 147 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, "%s failed:\n\t%s:%d: %s", \ 148 | __func__, __FILE__, __LINE__, message); \ 149 | minunit_status = 1; return;) 150 | 151 | #define mu_assert(test, message) \ 152 | MU__SAFE_BLOCK( \ 153 | minunit_assert++; if (!(test)) { \ 154 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, "%s failed:\n\t%s:%d: %s", \ 155 | __func__, __FILE__, __LINE__, message); \ 156 | minunit_status = 1; \ 157 | return; \ 158 | } else { printf("."); }) 159 | 160 | #define mu_assert_long_eq(expected, result) \ 161 | MU__SAFE_BLOCK( \ 162 | long long minunit_tmp_e; long long minunit_tmp_r; minunit_assert++; \ 
163 | minunit_tmp_e = (expected); minunit_tmp_r = (result); \ 164 | if (minunit_tmp_e != minunit_tmp_r) { \ 165 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, \ 166 | "%s failed:\n\t%s:%d: %lld expected but was %lld", __func__, __FILE__, \ 167 | __LINE__, minunit_tmp_e, minunit_tmp_r); \ 168 | minunit_status = 1; \ 169 | return; \ 170 | } else { printf("."); }) 171 | 172 | #define mu_assert_int_eq(expected, result) \ 173 | MU__SAFE_BLOCK( \ 174 | int minunit_tmp_e; int minunit_tmp_r; minunit_assert++; minunit_tmp_e = (expected); \ 175 | minunit_tmp_r = (result); if (minunit_tmp_e != minunit_tmp_r) { \ 176 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, \ 177 | "%s failed:\n\t%s:%d: %d expected but was %d", __func__, __FILE__, __LINE__, \ 178 | minunit_tmp_e, minunit_tmp_r); \ 179 | minunit_status = 1; \ 180 | return; \ 181 | } else { printf("."); }) 182 | 183 | #define mu_assert_double_eq(expected, result) \ 184 | MU__SAFE_BLOCK( \ 185 | double minunit_tmp_e; double minunit_tmp_r; minunit_assert++; minunit_tmp_e = (expected); \ 186 | minunit_tmp_r = (result); if (fabs(minunit_tmp_e - minunit_tmp_r) > MINUNIT_EPSILON) { \ 187 | int minunit_significant_figures = 1 - log10(MINUNIT_EPSILON); \ 188 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, \ 189 | "%s failed:\n\t%s:%d: %.*g expected but was %.*g", __func__, __FILE__, \ 190 | __LINE__, minunit_significant_figures, minunit_tmp_e, \ 191 | minunit_significant_figures, minunit_tmp_r); \ 192 | minunit_status = 1; \ 193 | return; \ 194 | } else { printf("."); }) 195 | 196 | #define mu_assert_double_eq_epsilon(expected, result, epsilon) \ 197 | MU__SAFE_BLOCK( \ 198 | double minunit_tmp_e; double minunit_tmp_r; minunit_assert++; minunit_tmp_e = (expected); \ 199 | minunit_tmp_r = (result); if (fabs(minunit_tmp_e - minunit_tmp_r) > epsilon) { \ 200 | int minunit_significant_figures = 1 - log10(epsilon); \ 201 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, \ 202 | "%s failed:\n\t%s:%d: %.*g expected but was %.*g ( using epsilon %.*g )", \ 203 | __func__, __FILE__, __LINE__, minunit_significant_figures, minunit_tmp_e, \ 204 | minunit_significant_figures, minunit_tmp_r, minunit_significant_figures, \ 205 | epsilon); \ 206 | minunit_status = 1; \ 207 | return; \ 208 | } else { printf("."); }) 209 | 210 | #define mu_assert_string_eq(expected, result) \ 211 | MU__SAFE_BLOCK( \ 212 | const char *minunit_tmp_e = expected; const char *minunit_tmp_r = result; \ 213 | minunit_assert++; \ 214 | if (!minunit_tmp_e) { minunit_tmp_e = ""; } if (!minunit_tmp_r) { \ 215 | minunit_tmp_r = ""; \ 216 | } if (strcmp(minunit_tmp_e, minunit_tmp_r)) { \ 217 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, \ 218 | "%s failed:\n\t%s:%d: '%s' expected but was '%s'", __func__, __FILE__, \ 219 | __LINE__, minunit_tmp_e, minunit_tmp_r); \ 220 | minunit_status = 1; \ 221 | return; \ 222 | } else { printf("."); }) 223 | 224 | /* 225 | * The following two functions were written by David Robert Nadeau 226 | * from http://NadeauSoftware.com/ and distributed under the 227 | * Creative Commons Attribution 3.0 Unported License 228 | */ 229 | 230 | /** 231 | * Returns the real time, in seconds, or -1.0 if an error occurred. 232 | * 233 | * Time is measured since an arbitrary and OS-dependent start time. 234 | * The returned real time is only useful for computing an elapsed time 235 | * between two calls to this function. 236 | */ 237 | static double mu_timer_real(void) { 238 | #if defined(_WIN32) 239 | /* Windows 2000 and later. 
---------------------------------- */ 240 | LARGE_INTEGER Time; 241 | LARGE_INTEGER Frequency; 242 | 243 | QueryPerformanceFrequency(&Frequency); 244 | QueryPerformanceCounter(&Time); 245 | 246 | Time.QuadPart *= 1000000; 247 | Time.QuadPart /= Frequency.QuadPart; 248 | 249 | return (double)Time.QuadPart / 1000000.0; 250 | 251 | #elif (defined(__hpux) || defined(hpux)) || \ 252 | ((defined(__sun__) || defined(__sun) || defined(sun)) && \ 253 | (defined(__SVR4) || defined(__svr4__))) 254 | /* HP-UX, Solaris. ------------------------------------------ */ 255 | return (double)gethrtime() / 1000000000.0; 256 | 257 | #elif defined(__MACH__) && defined(__APPLE__) 258 | /* OSX. ----------------------------------------------------- */ 259 | static double timeConvert = 0.0; 260 | if (timeConvert == 0.0) { 261 | mach_timebase_info_data_t timeBase; 262 | (void)mach_timebase_info(&timeBase); 263 | timeConvert = (double)timeBase.numer / (double)timeBase.denom / 1000000000.0; 264 | } 265 | return (double)mach_absolute_time() * timeConvert; 266 | 267 | #elif defined(_POSIX_VERSION) 268 | /* POSIX. --------------------------------------------------- */ 269 | struct timeval tm; 270 | #if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) 271 | { 272 | struct timespec ts; 273 | #if defined(CLOCK_MONOTONIC_PRECISE) 274 | /* BSD. --------------------------------------------- */ 275 | const clockid_t id = CLOCK_MONOTONIC_PRECISE; 276 | #elif defined(CLOCK_MONOTONIC_RAW) 277 | /* Linux. ------------------------------------------- */ 278 | const clockid_t id = CLOCK_MONOTONIC_RAW; 279 | #elif defined(CLOCK_HIGHRES) 280 | /* Solaris. ----------------------------------------- */ 281 | const clockid_t id = CLOCK_HIGHRES; 282 | #elif defined(CLOCK_MONOTONIC) 283 | /* AIX, BSD, Linux, POSIX, Solaris. ----------------- */ 284 | const clockid_t id = CLOCK_MONOTONIC; 285 | #elif defined(CLOCK_REALTIME) 286 | /* AIX, BSD, HP-UX, Linux, POSIX. ------------------- */ 287 | const clockid_t id = CLOCK_REALTIME; 288 | #else 289 | const clockid_t id = (clockid_t)-1; /* Unknown. */ 290 | #endif /* CLOCK_* */ 291 | if (id != (clockid_t)-1 && clock_gettime(id, &ts) != -1) 292 | return (double)ts.tv_sec + (double)ts.tv_nsec / 1000000000.0; 293 | /* Fall thru. */ 294 | } 295 | #endif /* _POSIX_TIMERS */ 296 | 297 | /* AIX, BSD, Cygwin, HP-UX, Linux, OSX, POSIX, Solaris. ----- */ 298 | gettimeofday(&tm, NULL); 299 | return (double)tm.tv_sec + (double)tm.tv_usec / 1000000.0; 300 | #else 301 | return -1.0; /* Failed. */ 302 | #endif 303 | } 304 | 305 | /** 306 | * Returns the amount of CPU time used by the current process, 307 | * in seconds, or -1.0 if an error occurred. 308 | */ 309 | static double mu_timer_cpu(void) { 310 | #if defined(_WIN32) 311 | /* Windows -------------------------------------------------- */ 312 | FILETIME createTime; 313 | FILETIME exitTime; 314 | FILETIME kernelTime; 315 | FILETIME userTime; 316 | 317 | /* This approach has a resolution of 1/64 second. 
Unfortunately, Windows' API does not offer 318 | * better */ 319 | if (GetProcessTimes(GetCurrentProcess(), &createTime, &exitTime, &kernelTime, &userTime) != 0) { 320 | ULARGE_INTEGER userSystemTime; 321 | memcpy(&userSystemTime, &userTime, sizeof(ULARGE_INTEGER)); 322 | return (double)userSystemTime.QuadPart / 10000000.0; 323 | } 324 | 325 | #elif defined(__unix__) || defined(__unix) || defined(unix) || \ 326 | (defined(__APPLE__) && defined(__MACH__)) 327 | /* AIX, BSD, Cygwin, HP-UX, Linux, OSX, and Solaris --------- */ 328 | 329 | #if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) 330 | /* Prefer high-res POSIX timers, when available. */ 331 | { 332 | clockid_t id; 333 | struct timespec ts; 334 | #if _POSIX_CPUTIME > 0 335 | /* Clock ids vary by OS. Query the id, if possible. */ 336 | if (clock_getcpuclockid(0, &id) == -1) 337 | #endif 338 | #if defined(CLOCK_PROCESS_CPUTIME_ID) 339 | /* Use known clock id for AIX, Linux, or Solaris. */ 340 | id = CLOCK_PROCESS_CPUTIME_ID; 341 | #elif defined(CLOCK_VIRTUAL) 342 | /* Use known clock id for BSD or HP-UX. */ 343 | id = CLOCK_VIRTUAL; 344 | #else 345 | id = (clockid_t)-1; 346 | #endif 347 | if (id != (clockid_t)-1 && clock_gettime(id, &ts) != -1) 348 | return (double)ts.tv_sec + (double)ts.tv_nsec / 1000000000.0; 349 | } 350 | #endif 351 | 352 | #if defined(RUSAGE_SELF) 353 | { 354 | struct rusage rusage; 355 | if (getrusage(RUSAGE_SELF, &rusage) != -1) 356 | return (double)rusage.ru_utime.tv_sec + (double)rusage.ru_utime.tv_usec / 1000000.0; 357 | } 358 | #endif 359 | 360 | #if defined(_SC_CLK_TCK) 361 | { 362 | const double ticks = (double)sysconf(_SC_CLK_TCK); 363 | struct tms tms; 364 | if (times(&tms) != (clock_t)-1) 365 | return (double)tms.tms_utime / ticks; 366 | } 367 | #endif 368 | 369 | #if defined(CLOCKS_PER_SEC) 370 | { 371 | clock_t cl = clock(); 372 | if (cl != (clock_t)-1) 373 | return (double)cl / (double)CLOCKS_PER_SEC; 374 | } 375 | #endif 376 | 377 | #endif 378 | 379 | return -1; /* Failed. */ 380 | } 381 | 382 | #ifdef __cplusplus 383 | } 384 | #endif 385 | 386 | #endif /* MINUNIT_MINUNIT_H */ 387 | // clang-format on 388 | -------------------------------------------------------------------------------- /tests/unit/td_test.c: -------------------------------------------------------------------------------- 1 | /** 2 | * td_test.c 3 | * Written by Filipe Oliveira and released to the public domain, 4 | * as explained at http://creativecommons.org/publicdomain/zero/1.0/ 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include "tdigest.h" 14 | 15 | #include "minunit.h" 16 | 17 | #define STREAM_SIZE 1000000 18 | 19 | #define MAX(x, y) (((x) > (y)) ? (x) : (y)) 20 | #define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) 21 | 22 | static double randfrom(double M, double N) { return M + (rand() / (RAND_MAX / (N - M))); } 23 | 24 | int tests_run = 0; 25 | 26 | td_histogram_t *histogram = NULL; 27 | 28 | static void load_histograms(void) { 29 | const int compression = 500; 30 | 31 | int i; 32 | if (histogram) { 33 | td_free(histogram); 34 | } 35 | histogram = td_new(compression); 36 | 37 | for (i = 0; i < STREAM_SIZE; i++) { 38 | mu_assert(td_add(histogram, randfrom(0, 10), 1) == 0, "Insertion"); 39 | } 40 | } 41 | 42 | MU_TEST(test_basic) { 43 | td_histogram_t *t = td_new(10); 44 | mu_assert(t != NULL, "created_histogram"); 45 | mu_assert_long_eq(0, t->unmerged_weight); 46 | mu_assert_long_eq(0, t->merged_weight); 47 | mu_assert(td_add(t, 0.0, 1) == 0, "Insertion"); 48 | // with one data point, all quantiles lead to Rome 49 | mu_assert_double_eq(0.0, td_quantile(t, .0)); 50 | mu_assert_double_eq(0.0, td_quantile(t, 0.5)); 51 | mu_assert_double_eq(0.0, td_quantile(t, 1)); 52 | mu_assert(td_add(t, 10.0, 1) == 0, "Insertion"); 53 | mu_assert_double_eq(0.0, td_min(t)); 54 | mu_assert_double_eq(10.0, td_max(t)); 55 | mu_assert_double_eq(2.0, td_size(t)); 56 | mu_assert(t != NULL, "Failed to allocate hdr_histogram"); 57 | mu_assert_double_eq(10.0, t->compression); 58 | mu_assert(td_compression(t) < t->cap, "False: buffer size < compression"); 59 | mu_assert_double_eq(0.0, td_quantile(t, .0)); 60 | mu_assert_double_eq(0.0, td_quantile(t, .1)); 61 | mu_assert_double_eq(10.0, td_quantile(t, .99)); 62 | td_reset(t); 63 | td_reset(NULL); 64 | td_free(t); 65 | } 66 | 67 | MU_TEST(test_overflow) { 68 | td_histogram_t *t = td_new(10); 69 | td_histogram_t *t2 = td_new(10); 70 | mu_assert(t != NULL, "created_histogram"); 71 | mu_assert(t2 != NULL, "created_histogram"); 72 | mu_assert_long_eq(0, t->unmerged_weight); 73 | mu_assert_long_eq(0, t->merged_weight); 74 | mu_assert_long_eq(0, t2->unmerged_weight); 75 | mu_assert_long_eq(0, t2->merged_weight); 76 | mu_assert(td_add(t, 5.0, __LONG_LONG_MAX__ - 1) == 0, "Insertion of __LONG_LONG_MAX__"); 77 | mu_assert(td_add(t, 5.0, __LONG_LONG_MAX__ - 1) == EDOM, 78 | "second insertion of __LONG_LONG_MAX__ should overflow"); 79 | mu_assert_long_eq(__LONG_LONG_MAX__ - 1, t->merged_weight + t->unmerged_weight); 80 | // overflow on merge 81 | mu_assert(td_add(t2, 5.0, __LONG_LONG_MAX__ - 1) == 0, "First insertion of __LONG_LONG_MAX__"); 82 | mu_assert_long_eq(__LONG_LONG_MAX__ - 1, t2->merged_weight + t2->unmerged_weight); 83 | mu_assert(td_add(t2, 1.0, 1) == 0, "Insertion of 1"); 84 | mu_assert(td_add(t2, 5.0, __LONG_LONG_MAX__ - 1) == EDOM, 85 | "Second insertion of __LONG_LONG_MAX__"); 86 | td_free(t); 87 | td_free(t2); 88 | } 89 | 90 | MU_TEST(test_overflow_merge) { 91 | td_histogram_t *x = td_new(1000); 92 | td_histogram_t *y = td_new(1000); 93 | td_histogram_t *z = td_new(10); 94 | mu_assert(x != NULL, "created_histogram"); 95 | mu_assert(y != NULL, "created_histogram"); 96 | mu_assert(z != NULL, "created_histogram"); 97 | mu_assert_long_eq(0, x->unmerged_weight); 98 | mu_assert_long_eq(0, x->merged_weight); 99 | mu_assert_long_eq(0, y->unmerged_weight); 100 | mu_assert_long_eq(0, y->merged_weight); 101 | mu_assert(td_add(x, 1, 1) == 0, "Insertion of 1"); 102 | mu_assert(td_add(x, 2, 1) == 0, "Insertion of 2"); 103 | mu_assert(td_add(x, 3, 1) == 0, "Insertion of 3"); 104 | mu_assert(td_add(x, 4, 1) == 0, "Insertion of 4"); 105 | mu_assert(td_add(x, 5, 1) == 0, "Insertion of 5"); 106 | mu_assert(td_add(x, 6, 1) == 0, "Insertion of 6"); 107 | mu_assert(td_add(x, 7, 1) == 
0, "Insertion of 7"); 108 | mu_assert(td_add(x, 8, 1) == 0, "Insertion of 8"); 109 | mu_assert(td_add(x, 9, 1) == 0, "Insertion of 9"); 110 | mu_assert(td_add(x, 10, 1) == 0, "Insertion of 10"); 111 | mu_assert(td_add(x, 11, 1) == 0, "Insertion of 11"); 112 | mu_assert(td_add(x, 12, 1) == 0, "Insertion of 12"); 113 | mu_assert(td_add(x, 13, 1) == 0, "Insertion of 13"); 114 | mu_assert(td_add(x, 14, 1) == 0, "Insertion of 14"); 115 | mu_assert(td_add(x, 15, 1) == 0, "Insertion of 15"); 116 | mu_assert(td_add(x, 16, 1) == 0, "Insertion of 16"); 117 | mu_assert(td_add(x, 17, 1) == 0, "Insertion of 17"); 118 | mu_assert(td_add(x, 18, 1) == 0, "Insertion of 18"); 119 | mu_assert(td_add(x, 19, 1) == 0, "Insertion of 19"); 120 | mu_assert(td_add(x, 20, 1) == 0, "Insertion of 20"); 121 | mu_assert(td_add(y, 101, 1) == 0, "Insertion of 101"); 122 | mu_assert(td_add(y, 102, 1) == 0, "Insertion of 102"); 123 | mu_assert(td_add(y, 103, 1) == 0, "Insertion of 103"); 124 | mu_assert(td_add(y, 104, 1) == 0, "Insertion of 104"); 125 | mu_assert(td_add(y, 105, 1) == 0, "Insertion of 105"); 126 | mu_assert(td_add(y, 106, 1) == 0, "Insertion of 106"); 127 | mu_assert(td_add(y, 107, 1) == 0, "Insertion of 107"); 128 | mu_assert(td_add(y, 108, 1) == 0, "Insertion of 108"); 129 | mu_assert(td_add(y, 109, 1) == 0, "Insertion of 109"); 130 | mu_assert(td_add(y, 110, 1) == 0, "Insertion of 110"); 131 | mu_assert(td_add(y, 111, 1) == 0, "Insertion of 111"); 132 | mu_assert(td_add(y, 112, 1) == 0, "Insertion of 112"); 133 | mu_assert(td_add(y, 113, 1) == 0, "Insertion of 113"); 134 | mu_assert(td_add(y, 114, 1) == 0, "Insertion of 114"); 135 | mu_assert(td_add(y, 115, 1) == 0, "Insertion of 115"); 136 | mu_assert(td_add(y, 116, 1) == 0, "Insertion of 116"); 137 | mu_assert(td_add(y, 117, 1) == 0, "Insertion of 117"); 138 | mu_assert(td_add(y, 118, 1) == 0, "Insertion of 118"); 139 | mu_assert(td_add(y, 119, 1) == 0, "Insertion of 119"); 140 | mu_assert(td_add(y, 120, 1) == 0, "Insertion of 120"); 141 | 142 | for (size_t i = 0; i < 10; i++) { 143 | td_histogram_t *zz = td_new(10); 144 | int self_merge_res = 0; 145 | mu_assert(td_merge(zz, x) == 0, "1st merge x into z"); 146 | mu_assert(td_merge(zz, y) == 0, "1st merge y into z"); 147 | mu_assert(td_merge(zz, x) == 0, "2nd merge x into z"); 148 | mu_assert(td_merge(zz, y) == 0, "2nd merge y into z"); 149 | mu_assert(td_merge(zz, x) == 0, "3rd merge x into z"); 150 | for (size_t j = 0; j < 5; j++) { 151 | self_merge_res = td_merge(zz, z); 152 | } 153 | td_free(z); 154 | z = zz; 155 | mu_assert((z->merged_weight + z->unmerged_weight) > 0, "assert z contains weight"); 156 | if (self_merge_res == EDOM) 157 | break; 158 | } 159 | 160 | td_free(x); 161 | td_free(y); 162 | td_free(z); 163 | } 164 | 165 | MU_TEST(test_quantile_interpolations) { 166 | td_histogram_t *t = td_new(10); 167 | mu_assert(t != NULL, "created_histogram"); 168 | mu_assert_long_eq(0, t->unmerged_weight); 169 | mu_assert_long_eq(0, t->merged_weight); 170 | mu_assert(td_add(t, 5.0, 2) == 0, "add"); 171 | mu_assert_long_eq(2, t->unmerged_weight); 172 | // with one data point, all quantiles lead to Rome 173 | mu_assert_double_eq(5.0, td_quantile(t, .0)); 174 | mu_assert_double_eq(5.0, td_quantile(t, 0.5)); 175 | mu_assert_double_eq(5.0, td_quantile(t, 1.0)); 176 | mu_assert(td_compress(t) == 0, "compress"); 177 | mu_assert_long_eq(0, t->unmerged_weight); 178 | mu_assert_long_eq(2, t->merged_weight); 179 | mu_assert(td_add(t, 100.0, 1) == 0, "Insertion"); 180 | // we know that there are at least two 
centroids now 181 | td_free(t); 182 | } 183 | 184 | MU_TEST(test_trimmed_mean_simple) { 185 | /* Used numpy to check results validity 186 | import numpy as np 187 | from scipy import stats 188 | x = [5,5,5,10,15,15,15] 189 | np.mean(x) 190 | 10.0 191 | stats.trim_mean(x, 0.0) 192 | 10.0 193 | */ 194 | td_histogram_t *t = td_new(100); 195 | mu_assert(t != NULL, "created_histogram"); 196 | mu_assert_long_eq(0, t->unmerged_weight); 197 | mu_assert_long_eq(0, t->merged_weight); 198 | // stats.trim_mean([], 0.49) 199 | // nan 200 | mu_assert_double_eq(NAN, td_trimmed_mean_symmetric(t, .49)); 201 | mu_assert_double_eq(NAN, td_trimmed_mean(t, 0.49, 0.51)); 202 | mu_assert(td_add(t, 5.0, 1) == 0, "Insertion"); 203 | // with one data point, all quantiles lead to Rome 204 | // stats.trim_mean(x, 0.49) 205 | mu_assert_double_eq(5, td_trimmed_mean_symmetric(t, .49)); 206 | mu_assert_double_eq(5, td_trimmed_mean(t, 0.49, 0.51)); 207 | // stats.trim_mean(x, 0.1) 208 | // 5.0 209 | mu_assert_double_eq(5, td_trimmed_mean_symmetric(t, .1)); 210 | mu_assert_double_eq(5, td_trimmed_mean(t, 0.1, 0.9)); 211 | // 5.0 212 | // stats.trim_mean(x, 0.0) 213 | mu_assert_double_eq(5, td_trimmed_mean_symmetric(t, .0)); 214 | mu_assert_double_eq(5, td_trimmed_mean(t, 0.0, 1)); 215 | // 5.0 216 | mu_assert(td_add(t, 5.0, 2) == 0, "Insertion"); 217 | mu_assert_double_eq(5, td_trimmed_mean_symmetric(t, .0)); 218 | mu_assert_double_eq(5, td_trimmed_mean(t, 0.0, 1)); 219 | mu_assert(td_add(t, 10.0, 1) == 0, "Insertion"); 220 | mu_assert(td_add(t, 15.0, 3) == 0, "Insertion"); 221 | // stats.trim_mean(x, 0.0) 222 | // 10.0 223 | mu_assert_double_eq(10, td_trimmed_mean_symmetric(t, .0)); 224 | mu_assert_double_eq(10, td_trimmed_mean(t, 0.0, 1)); 225 | // trimmed mean and mean should lead to 10 in here 226 | // stats.trim_mean(x, 0.1) 227 | // 10.0 228 | mu_assert_double_eq(10, td_trimmed_mean_symmetric(t, .1)); 229 | mu_assert_double_eq(10, td_trimmed_mean(t, .1, .9)); 230 | // trimmed mean and mean should lead to 10 in here 231 | // stats.trim_mean(x, 0.25) 232 | // 10.0 233 | mu_assert_double_eq(10, td_trimmed_mean_symmetric(t, .25)); 234 | mu_assert_double_eq(10, td_trimmed_mean(t, .25, .75)); 235 | td_free(t); 236 | } 237 | 238 | MU_TEST(test_trimmed_mean_complex) { 239 | /* Used numpy to check results validity 240 | import numpy as np 241 | from scipy import stats 242 | x = np.arange(20) 243 | stats.trim_mean(x, 0.1) 244 | 9.5 245 | */ 246 | td_histogram_t *t = td_new(100); 247 | mu_assert(t != NULL, "created_histogram"); 248 | mu_assert_long_eq(0, t->unmerged_weight); 249 | mu_assert_long_eq(0, t->merged_weight); 250 | for (int i = 0; i < 20; ++i) { 251 | mu_assert(td_add(t, (double)i, 1) == 0, "Insertion"); 252 | } 253 | // trimmed mean and mean should lead to 9.5 in here 254 | // stats.trim_mean(x, 0.25) 255 | // 9.5 256 | mu_assert_double_eq(9.5, td_trimmed_mean_symmetric(t, .25)); 257 | mu_assert_double_eq(9.5, td_trimmed_mean(t, .25, .75)); 258 | td_free(t); 259 | t = td_new(100); 260 | mu_assert(t != NULL, "created_histogram"); 261 | mu_assert_long_eq(0, t->unmerged_weight); 262 | mu_assert_long_eq(0, t->merged_weight); 263 | for (int i = 0; i < 200; ++i) { 264 | mu_assert(td_add(t, (double)i, 1) == 0, "Insertion"); 265 | } 266 | // trimmed mean and mean should lead to 99.5 in here 267 | // x = np.arange(200) 268 | // stats.trim_mean(x, 0.25) 269 | // 99.5 270 | mu_assert_double_eq_epsilon(99.5, td_trimmed_mean_symmetric(t, .25), 0.1); 271 | mu_assert_double_eq_epsilon(99.5, td_trimmed_mean(t, .25, .75), 0.1); 272 
| 273 | // Non symmetric trimmed means 274 | // trim_mean(x, 0.1, 0.75) 275 | // 84.5 276 | mu_assert_double_eq_epsilon(84.5, td_trimmed_mean(t, .1, 0.75), 0.1); 277 | // trim_mean(x, 0.0, 0.75) 278 | // 74.5 279 | mu_assert_double_eq_epsilon(74.5, td_trimmed_mean(t, .0, 0.75), 0.1); 280 | 281 | td_free(t); 282 | // x = [1,2,3,4,5,6,7,8,9,10,100,100,100] 283 | t = td_new(100); 284 | for (int i = 1; i < 11; ++i) { 285 | mu_assert(td_add(t, (double)i, 1) == 0, "Insertion"); 286 | } 287 | mu_assert(td_add(t, 100, 3) == 0, "Insertion"); 288 | // stats.trim_mean(x, 0.1) 289 | // 23.09090909090909 290 | mu_assert_double_eq_epsilon(23.09090909090909, td_trimmed_mean_symmetric(t, .1), 0.01); 291 | mu_assert_double_eq_epsilon(23.09090909090909, td_trimmed_mean(t, .1, .9), 0.01); 292 | // stats.trim_mean(x, 0.25) 293 | // 7.0 294 | mu_assert_double_eq_epsilon(7.0, td_trimmed_mean_symmetric(t, .25), 0.01); 295 | mu_assert_double_eq_epsilon(7.0, td_trimmed_mean(t, .25, .75), 0.01); 296 | td_free(t); 297 | } 298 | 299 | MU_TEST(test_compress_small) { 300 | td_histogram_t *t = td_new(100); 301 | mu_assert(t != NULL, "created_histogram"); 302 | mu_assert(td_add(t, 1.0, 1) == 0, "Insertion"); 303 | mu_assert_double_eq(1.0, td_min(t)); 304 | mu_assert_double_eq(1.0, td_max(t)); 305 | mu_assert_double_eq(1.0, td_size(t)); 306 | mu_assert_int_eq(1, td_centroid_count(t)); 307 | mu_assert_long_eq(0, t->total_compressions); 308 | mu_assert_double_eq(1.0, td_centroids_mean_at(t, 0)); 309 | mu_assert_long_eq(1, td_centroids_weight_at(t, 0)); 310 | mu_assert_int_eq(1, t->unmerged_nodes); 311 | mu_assert_int_eq(0, t->merged_nodes); 312 | mu_assert(td_compress(t) == 0, "compress"); 313 | mu_assert_long_eq(1, t->unmerged_nodes + t->merged_nodes); 314 | mu_assert_double_eq(1.0, td_centroids_mean_at(t, 0)); 315 | mu_assert_long_eq(1, td_centroids_weight_at(t, 0)); 316 | mu_assert_double_eq(1.0, td_quantile(t, 0.001)); 317 | mu_assert_double_eq(1.0, td_quantile(t, 0.01)); 318 | mu_assert_double_eq(1.0, td_quantile(t, 0.5)); 319 | mu_assert_double_eq(1.0, td_quantile(t, 0.99)); 320 | mu_assert_double_eq(1.0, td_quantile(t, 0.999)); 321 | td_free(t); 322 | } 323 | 324 | MU_TEST(test_compress_large) { 325 | td_histogram_t *t = td_new(100); 326 | mu_assert(t != NULL, "created_histogram"); 327 | for (int i = 1; i <= 1000; ++i) { 328 | mu_assert(td_add(t, (double)i, 1) == 0, "Insertion"); 329 | } 330 | 331 | mu_assert_double_eq(1.0, td_min(t)); 332 | mu_assert_double_eq(1000.0, td_max(t)); 333 | mu_assert_double_eq(1000.0, td_size(t)); 334 | // TODO: add this test cases 335 | // EXPECT_EQ(500500, digest.sum()); 336 | // EXPECT_EQ(500.5, digest.mean()); 337 | // mu_assert_double_eq(1.5, td_quantile(t, 0.001)); 338 | mu_assert_double_eq(10.5, td_quantile(t, 0.01)); 339 | // mu_assert_double_eq_epsilon(500.25, td_quantile(t, 0.5), 0.5); 340 | // TODO: swap this one by the bellow 341 | // mu_assert_double_eq(990.25, td_quantile(t, 0.99)); 342 | mu_assert_double_eq_epsilon(990.25, td_quantile(t, 0.99), 0.5); 343 | // mu_assert_double_eq(999.5, td_quantile(t, 0.999)); 344 | td_free(t); 345 | } 346 | 347 | MU_TEST(test_negative_values) { 348 | td_histogram_t *t = td_new(1000); 349 | mu_assert(t != NULL, "created_histogram"); 350 | for (int i = 1; i <= 100; ++i) { 351 | mu_assert(td_add(t, (double)i, 1) == 0, "Insertion"); 352 | mu_assert(td_add(t, -(double)i, 1) == 0, "Insertion"); 353 | } 354 | mu_assert_double_eq(-100.0, td_min(t)); 355 | mu_assert_double_eq(100.0, td_max(t)); 356 | mu_assert_double_eq(200.0, td_size(t)); 357 | 
mu_assert_double_eq(-100, td_quantile(t, 0.0)); 358 | mu_assert_double_eq(-100, td_quantile(t, 0.001)); 359 | // TODO: fix my epsilon 360 | mu_assert_double_eq_epsilon(-98.5, td_quantile(t, 0.01), 0.75); 361 | mu_assert_double_eq_epsilon(98.5, td_quantile(t, 0.99), 0.75); 362 | mu_assert_double_eq(100, td_quantile(t, 0.999)); 363 | mu_assert_double_eq(100, td_quantile(t, 1)); 364 | td_free(t); 365 | } 366 | 367 | MU_TEST(test_negative_values_merge) { 368 | td_histogram_t *d1 = td_new(100); 369 | td_histogram_t *d2 = td_new(100); 370 | mu_assert(d1 != NULL, "created_histogram"); 371 | mu_assert(d2 != NULL, "created_histogram"); 372 | for (int i = 1; i <= 100; ++i) { 373 | mu_assert(td_add(d1, (double)i, 1) == 0, "Insertion"); 374 | mu_assert(td_add(d2, -(double)i, 1) == 0, "Insertion"); 375 | } 376 | td_merge(d1, d2); 377 | mu_assert_double_eq(-100.0, td_min(d1)); 378 | mu_assert_double_eq(100.0, td_max(d1)); 379 | mu_assert_double_eq(200.0, td_size(d1)); 380 | mu_assert_double_eq(-100, td_quantile(d1, 0.0)); 381 | mu_assert_double_eq(-100, td_quantile(d1, 0.001)); 382 | // TODO: fix my epsilon 383 | mu_assert_double_eq_epsilon(-98.5, td_quantile(d1, 0.01), 0.75); 384 | mu_assert_double_eq_epsilon(98.5, td_quantile(d1, 0.99), 0.75); 385 | mu_assert_double_eq(100, td_quantile(d1, 0.999)); 386 | mu_assert_double_eq(100, td_quantile(d1, 1)); 387 | td_free(d1); 388 | td_free(d2); 389 | } 390 | 391 | MU_TEST(test_large_outlier_test) { 392 | td_histogram_t *t = td_new(100); 393 | mu_assert(t != NULL, "created_histogram"); 394 | for (int i = 1; i <= 19; ++i) { 395 | mu_assert(td_add(t, (double)i, 1) == 0, "Insertion"); 396 | } 397 | mu_assert(td_add(t, 1000000, 1) == 0, "Insertion"); 398 | mu_assert(td_quantile(t, 0.5) < td_quantile(t, 0.9), 399 | "False: td_quantile(t, 0.5) < td_quantile(t, 0.9)"); 400 | td_free(t); 401 | } 402 | 403 | MU_TEST(test_nans) { 404 | td_histogram_t *t = td_new(1000); 405 | mu_assert(isnan(td_quantile(t, 0)), "empty value at 0"); 406 | mu_assert(isnan(td_quantile(t, 0.5)), "empty value at .5"); 407 | mu_assert(isnan(td_quantile(t, 1)), "empty value at 1"); 408 | mu_assert(isnan(td_centroids_mean_at(t, 1)), "td_centroids_mean_at on pos > h->merged_nodes"); 409 | mu_assert(isnan(td_centroids_mean_at(t, -1)), "td_centroids_mean_at on pos < 0"); 410 | mu_assert(td_add(t, 1, 1) == 0, "Insertion"); 411 | mu_assert(isnan(td_quantile(t, -.1)), "value at -0.1"); 412 | mu_assert(isnan(td_quantile(t, 1.1)), "value at 1.1"); 413 | td_free(t); 414 | } 415 | 416 | MU_TEST(test_two_interp) { 417 | td_histogram_t *t = td_new(1000); 418 | mu_assert(td_add(t, 1, 1) == 0, "Insertion"); 419 | mu_assert(td_add(t, 10, 1) == 0, "Insertion"); 420 | mu_assert(isfinite(td_quantile(t, .9)), "test_two_interp: value at .9"); 421 | td_reset(t); 422 | // if the left centroid has more than one sample, we still know 423 | // that one sample occurred at min so we can do some interpolation 424 | mu_assert(td_add(t, 1, 10) == 0, "Insertion"); 425 | mu_assert(td_add(t, 10, 1) == 0, "Insertion"); 426 | mu_assert_double_eq(1.0, td_quantile(t, .1)); 427 | td_reset(t); 428 | // if the right-most centroid has more than one sample, we still know 429 | // that one sample occurred at max so we can do some interpolation 430 | mu_assert(td_add(t, 1, 1) == 0, "Insertion"); 431 | mu_assert(td_add(t, 10, 10) == 0, "Insertion"); 432 | mu_assert_double_eq(10.0, td_quantile(t, .9)); 433 | td_reset(t); 434 | // in between extremes we interpolate between centroids 435 | mu_assert(td_add(t, 1, 1) == 0, "Insertion"); 436 | 
mu_assert(td_add(t, 5, 1) == 0, "Insertion"); 437 | mu_assert(td_add(t, 10, 1) == 0, "Insertion"); 438 | // centroids i and i+1 bracket our current point 439 | // check for unit weight 440 | // within the singleton's sphere 441 | // left 442 | mu_assert_double_eq(5.0, td_quantile(t, .5)); 443 | td_reset(t); 444 | // in between extremes we interpolate between centroids 445 | mu_assert(td_add(t, 1, 1) == 0, "Insertion"); // q0 446 | mu_assert(td_add(t, 4, 1) == 0, "Insertion"); // q20 447 | mu_assert(td_add(t, 8, 1) == 0, "Insertion"); // q40 448 | mu_assert(td_add(t, 12, 1) == 0, "Insertion"); // q60 449 | mu_assert(td_add(t, 16, 1) == 0, "Insertion"); // q80 450 | mu_assert(td_add(t, 20, 1) == 0, "Insertion"); // q100 451 | // centroids i and i+1 bracket our current point 452 | // check for unit weight 453 | // within the singleton's sphere 454 | // TODO: check for right 455 | // mu_assert_double_eq(4.0, td_quantile(t, .20) ); 456 | // mu_assert_double_eq(8.0, td_quantile(t, .40) ); 457 | // mu_assert_double_eq(12.0, td_quantile(t, .60) ); 458 | // mu_assert_double_eq(7.0, td_quantile(t, .70) ); 459 | // mu_assert_double_eq(8.0, td_quantile(t, .75) ); 460 | td_free(t); 461 | } 462 | 463 | MU_TEST(test_cdf) { 464 | td_histogram_t *t = td_new(100); 465 | mu_assert(isnan(td_cdf(t, 1.1)), "no data to examine"); 466 | // interpolate if somehow we have weight > 0 and max != min 467 | mu_assert(td_add(t, 1, 1) == 0, "Insertion"); 468 | // bellow lower bound 469 | mu_assert_double_eq(0, td_cdf(t, 0)); 470 | // exactly one centroid, should have max==min 471 | // min and max are too close together to do any viable interpolation 472 | mu_assert_double_eq(0.5, td_cdf(t, 1)); 473 | // above upper bound 474 | mu_assert_double_eq(1.0, td_cdf(t, 2)); 475 | mu_assert(td_add(t, 10, 1) == 0, "Insertion"); 476 | mu_assert_double_eq(.25, td_cdf(t, 1)); 477 | mu_assert_double_eq(.5, td_cdf(t, 5.5)); 478 | // // TODO: fix this 479 | // mu_assert_double_eq(1,td_cdf(t, 10)); 480 | td_free(t); 481 | } 482 | 483 | MU_TEST(test_td_size) { 484 | load_histograms(); 485 | mu_assert(td_size(histogram) == STREAM_SIZE, "td_size(histogram) != STREAM_SIZE"); 486 | } 487 | 488 | MU_TEST(test_td_max) { 489 | load_histograms(); 490 | mu_assert_double_eq_epsilon(10.0, td_max(histogram), 0.001); 491 | } 492 | 493 | MU_TEST(test_td_min) { 494 | load_histograms(); 495 | mu_assert_double_eq_epsilon(0.0, td_min(histogram), 0.001); 496 | } 497 | 498 | MU_TEST(test_td_init) { 499 | td_histogram_t *t; 500 | // overflow detected 501 | // mu_assert_long_eq(1, td_init(10000000000000000, &t)); 502 | t = NULL; 503 | // bellow overflow 504 | mu_assert_long_eq(0, td_init(1000, &t)); 505 | td_free(t); 506 | 507 | mu_assert_long_eq(0, td_init(1000000, &t)); 508 | td_free(t); 509 | 510 | mu_assert_long_eq(0, td_init(100000000, &t)); 511 | td_free(t); 512 | } 513 | 514 | MU_TEST(test_quantiles) { 515 | load_histograms(); 516 | mu_assert_double_eq_epsilon(0.0, td_quantile(histogram, 0.0), 0.001); 517 | mu_assert_double_eq_epsilon(1.0, td_quantile(histogram, 0.1), 0.02); 518 | mu_assert_double_eq_epsilon(2.0, td_quantile(histogram, 0.2), 0.02); 519 | mu_assert_double_eq_epsilon(3.0, td_quantile(histogram, 0.3), 0.03); 520 | mu_assert_double_eq_epsilon(4.0, td_quantile(histogram, 0.4), 0.04); 521 | mu_assert_double_eq_epsilon(5.0, td_quantile(histogram, 0.5), 0.05); 522 | mu_assert_double_eq_epsilon(6.0, td_quantile(histogram, 0.6), 0.04); 523 | mu_assert_double_eq_epsilon(7.0, td_quantile(histogram, 0.7), 0.03); 524 | 
mu_assert_double_eq_epsilon(8.0, td_quantile(histogram, 0.8), 0.02); 525 | mu_assert_double_eq_epsilon(9.0, td_quantile(histogram, 0.9), 0.02); 526 | mu_assert_double_eq_epsilon(9.99, td_quantile(histogram, 0.999), 0.01); 527 | mu_assert_double_eq_epsilon(9.999, td_quantile(histogram, 0.9999), 0.01); 528 | mu_assert_double_eq_epsilon(9.9999, td_quantile(histogram, 0.99999), 0.01); 529 | mu_assert_double_eq_epsilon(10.0, td_quantile(histogram, 1), 0.001); 530 | } 531 | 532 | MU_TEST(test_quantiles_multiple) { 533 | load_histograms(); 534 | const size_t quantiles_arr_size = 14; 535 | double values[14] = {0.0}; 536 | double percentiles[14] = {0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 537 | 0.7, 0.8, 0.9, 0.999, 0.9999, 0.99999, 1.0}; 538 | mu_assert(td_quantiles(histogram, NULL, values, quantiles_arr_size) == EINVAL, 539 | "td_quantiles on NULL percentiles should return EINVAL"); 540 | mu_assert(td_quantiles(histogram, percentiles, NULL, quantiles_arr_size) == EINVAL, 541 | "td_quantiles on NULL values should return EINVAL"); 542 | mu_assert(td_quantiles(histogram, percentiles, values, quantiles_arr_size) == 0, 543 | "td_quantiles return should be 0"); 544 | mu_assert_double_eq_epsilon(0.0, values[0], 0.001); 545 | mu_assert_double_eq_epsilon(1.0, values[1], 0.02); 546 | mu_assert_double_eq_epsilon(2.0, values[2], 0.02); 547 | mu_assert_double_eq_epsilon(3.0, values[3], 0.03); 548 | mu_assert_double_eq_epsilon(4.0, values[4], 0.04); 549 | mu_assert_double_eq_epsilon(5.0, values[5], 0.05); 550 | mu_assert_double_eq_epsilon(6.0, values[6], 0.04); 551 | mu_assert_double_eq_epsilon(7.0, values[7], 0.03); 552 | mu_assert_double_eq_epsilon(8.0, values[8], 0.02); 553 | mu_assert_double_eq_epsilon(9.0, values[9], 0.02); 554 | mu_assert_double_eq_epsilon(9.99, values[10], 0.01); 555 | mu_assert_double_eq_epsilon(9.999, values[11], 0.01); 556 | mu_assert_double_eq_epsilon(9.9999, values[12], 0.01); 557 | mu_assert_double_eq_epsilon(10.0, values[13], 0.001); 558 | td_free(histogram); 559 | td_histogram_t *t = td_new(100); 560 | mu_assert(td_quantiles(t, percentiles, values, quantiles_arr_size) == 0, 561 | "td_quantiles return should be 0"); 562 | for (int i = 0; i < quantiles_arr_size; ++i) { 563 | mu_assert(isnan(values[i]), "no data to examine"); 564 | } 565 | mu_assert(td_add(t, 1, 1) == 0, "Insertion"); 566 | // with one data point, all quantiles lead to Rome 567 | mu_assert(td_quantiles(t, percentiles, values, quantiles_arr_size) == 0, 568 | "td_quantiles return should be 0"); 569 | for (int i = 0; i < quantiles_arr_size; ++i) { 570 | mu_assert_double_eq_epsilon(1.0, values[i], 0.02); 571 | } 572 | // q should be in [0,1] 573 | double percentiles_nans[14] = {-10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 574 | 10.7, 10.8, 10.9, -0.999, -0.9999, -0.99999, -1.0}; 575 | mu_assert(td_quantiles(t, percentiles_nans, values, quantiles_arr_size) == 0, 576 | "td_quantiles return should be 0"); 577 | for (int i = 0; i < quantiles_arr_size; ++i) { 578 | mu_assert(isnan(values[i]), " q should be in [0,1]"); 579 | } 580 | td_free(t); 581 | } 582 | 583 | MU_TEST_SUITE(test_suite) { 584 | MU_RUN_TEST(test_basic); 585 | MU_RUN_TEST(test_td_init); 586 | MU_RUN_TEST(test_compress_small); 587 | MU_RUN_TEST(test_compress_large); 588 | MU_RUN_TEST(test_nans); 589 | MU_RUN_TEST(test_negative_values); 590 | MU_RUN_TEST(test_negative_values_merge); 591 | MU_RUN_TEST(test_large_outlier_test); 592 | MU_RUN_TEST(test_two_interp); 593 | MU_RUN_TEST(test_cdf); 594 | MU_RUN_TEST(test_td_size); 595 | MU_RUN_TEST(test_td_max); 596 | 
MU_RUN_TEST(test_td_min); 597 | MU_RUN_TEST(test_quantiles); 598 | MU_RUN_TEST(test_quantiles_multiple); 599 | MU_RUN_TEST(test_quantile_interpolations); 600 | MU_RUN_TEST(test_trimmed_mean_simple); 601 | MU_RUN_TEST(test_trimmed_mean_complex); 602 | MU_RUN_TEST(test_overflow); 603 | MU_RUN_TEST(test_overflow_merge); 604 | } 605 | 606 | int main(int argc, char *argv[]) { 607 | MU_RUN_SUITE(test_suite); 608 | MU_REPORT(); 609 | return MU_EXIT_CODE; 610 | } 611 | --------------------------------------------------------------------------------
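A minimal end-to-end sketch of the public API shown in src/tdigest.h (illustrative only, not a file in the repository; link against the tdigest library and libm):

#include <stdio.h>
#include "tdigest.h"

int main(void) {
    /* 100 is the commonly used compression value per the header docs. */
    td_histogram_t *h = td_new(100);
    if (h == NULL)
        return 1;

    /* Feed synthetic samples, each with unit weight. */
    for (int i = 0; i < 1000000; i++) {
        if (td_add(h, (double)(i % 1000), 1) != 0)
            break; /* EDOM: adding this weight would overflow the digest */
    }

    /* Optional: fold buffered nodes into merged centroids before querying. */
    td_compress(h);

    printf("p50=%f p99=%f cdf(500)=%f\n",
           td_quantile(h, 0.50), td_quantile(h, 0.99), td_cdf(h, 500.0));

    td_free(h);
    return 0;
}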