├── .circleci └── config.yml ├── .clang-format ├── .github ├── release-drafter-config.yml └── workflows │ └── release-drafter.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── LICENSE.md ├── Makefile ├── README.md ├── cmake └── UseCodeCoverage.cmake ├── examples ├── CMakeLists.txt └── quantile_example.c ├── src ├── .gitignore ├── CMakeLists.txt ├── td_malloc.h ├── tdigest.c └── tdigest.h └── tests ├── CMakeLists.txt ├── benchmark └── histogram_benchmark.cpp └── unit ├── minunit.h └── td_test.c /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Use the latest 2.1 version of CircleCI pipeline process engine. See: https://circleci.com/docs/2.0/configuration-reference 2 | version: 2.1 3 | 4 | jobs: 5 | lint: 6 | docker: 7 | - image: redislabsmodules/llvm-toolset:latest 8 | steps: 9 | - checkout 10 | - run: 11 | name: lint 12 | command: | 13 | make lint 14 | 15 | sanitize: 16 | docker: 17 | - image: redislabsmodules/llvm-toolset:latest 18 | steps: 19 | - checkout 20 | - run: 21 | name: Install CMAKE 22 | command: 'apt update -q && apt install -y cmake' 23 | - run: 24 | name: Pull Submodules 25 | command: git submodule update --init --recursive 26 | - run: 27 | name: Build & Test with sanitizers 28 | command: | 29 | make sanitize 30 | 31 | static-analysis-infer: 32 | docker: 33 | - image: redisbench/infer-linux64:1.0.0 34 | steps: 35 | - checkout 36 | - run: 37 | name: Submodule checkout 38 | command: git submodule update --init --recursive 39 | - run: 40 | name: run fbinfer 41 | command: | 42 | CC=clang CXX=clang++ INFER=infer make static-analysis 43 | build: 44 | docker: 45 | - image: "debian:bullseye" 46 | steps: 47 | - run: 48 | name: Installing SUDO 49 | command: 'apt update && apt install -y sudo && rm -rf /var/lib/apt/lists/*' 50 | - run: 51 | name: Installing GCC 52 | command: 'apt update && apt install -y gcc g++' 53 | - run: 54 | name: Install CMAKE 55 | command: 'apt install -y cmake' 56 | - run: 57 | name: Installing LCOV 58 | command: 'apt install -y lcov' 59 | - run: 60 | name: Installing CURL 61 | command: 'apt install -y curl' 62 | - run: 63 | name: Installing GIT 64 | command: 'apt install -y git' 65 | - checkout 66 | - run: 67 | name: Pull Submodules 68 | command: git submodule update --init --recursive 69 | - run: 70 | name: Build & Test 71 | command: | 72 | make clean 73 | make coverage 74 | cd build && bash <(curl -s https://codecov.io/bash) -f coverage.info -X gcov -x gcov-7 || echo "Codecov did not collect coverage reports" 75 | - run: 76 | name: Install benchmark dependencies 77 | command: | 78 | apt update 79 | apt install python3-pip -y 80 | pip3 install redisbench-admin 81 | - run: 82 | name: Benchmark 83 | command: | 84 | make bench 85 | redisbench-admin export \ 86 | --results-format google.benchmark \ 87 | --github_repo $CIRCLE_PROJECT_REPONAME \ 88 | --github_org $CIRCLE_PROJECT_USERNAME \ 89 | --github_branch $CIRCLE_BRANCH \ 90 | --benchmark-result-file results.json 91 | 92 | 93 | 94 | workflows: 95 | commit: 96 | jobs: 97 | - lint 98 | - build: 99 | context: common 100 | - sanitize 101 | nightly: 102 | triggers: 103 | - schedule: 104 | cron: "0 0 * * *" 105 | filters: 106 | branches: 107 | only: 108 | - master 109 | jobs: 110 | - build: 111 | context: common 112 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | IndentWidth: 4 2 | ColumnLimit: 100 3 | 
SortIncludes: false 4 | AlignEscapedNewlinesLeft: false 5 | SpacesBeforeTrailingComments: 1 -------------------------------------------------------------------------------- /.github/release-drafter-config.yml: -------------------------------------------------------------------------------- 1 | name-template: 'Version $NEXT_PATCH_VERSION' 2 | tag-template: 'v$NEXT_PATCH_VERSION' 3 | categories: 4 | - title: 'Features' 5 | labels: 6 | - 'feature' 7 | - 'enhancement' 8 | - title: 'Bug Fixes' 9 | labels: 10 | - 'fix' 11 | - 'bugfix' 12 | - 'bug' 13 | - title: 'Maintenance' 14 | label: 'chore' 15 | change-template: '- $TITLE (#$NUMBER)' 16 | exclude-labels: 17 | - 'skip-changelog' 18 | template: | 19 | ## Changes 20 | 21 | $CHANGES 22 | -------------------------------------------------------------------------------- /.github/workflows/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name: Release Drafter 2 | 3 | on: 4 | push: 5 | # branches to consider in the event; optional, defaults to all 6 | branches: 7 | - master 8 | 9 | jobs: 10 | update_release_draft: 11 | runs-on: ubuntu-latest 12 | steps: 13 | # Drafts your next Release notes as Pull Requests are merged into "master" 14 | - uses: release-drafter/release-drafter@v5 15 | with: 16 | # (Optional) specify config name to use, relative to .github/. Default: release-drafter.yml 17 | config-name: release-drafter-config.yml 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | README_cache 3 | src/*.o 4 | src/*.so 5 | src/*.dll 6 | CMakeLists.txt.user 7 | CMakeCache.txt 8 | CMakeFiles 9 | CMakeScripts 10 | Testing 11 | cmake-build-debug 12 | cmake_install.cmake 13 | install_manifest.txt 14 | compile_commands.json 15 | CTestTestfile.cmake 16 | _deps 17 | build/* 18 | .vscode 19 | tests/vendor/* 20 | 21 | # perf related 22 | perf.data* 23 | 24 | 25 | # fb infer static analysis 26 | infer-out/* 27 | 28 | # Prerequisites 29 | *.d 30 | 31 | # Object files 32 | *.o 33 | *.ko 34 | *.obj 35 | *.elf 36 | 37 | # Linker output 38 | *.ilk 39 | *.map 40 | *.exp 41 | 42 | # Precompiled Headers 43 | *.gch 44 | *.pch 45 | 46 | # Libraries 47 | *.lib 48 | *.a 49 | *.la 50 | *.lo 51 | 52 | # Shared objects (inc. 
Windows DLLs) 53 | *.dll 54 | *.so 55 | *.so.* 56 | *.dylib 57 | 58 | # Executables 59 | *.exe 60 | *.out 61 | *.app 62 | *.i*86 63 | *.x86_64 64 | *.hex 65 | 66 | # Debug files 67 | *.dSYM/ 68 | *.su 69 | *.idb 70 | *.pdb 71 | 72 | # Kernel Module Compile Results 73 | *.mod* 74 | *.cmd 75 | .tmp_versions/ 76 | modules.order 77 | Module.symvers 78 | Mkfile.old 79 | dkms.conf 80 | 81 | # IDE 82 | .idea -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tests/vendor/google/benchmark"] 2 | path = tests/vendor/google/benchmark 3 | url = https://github.com/google/benchmark.git 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ## cmake flags 2 | cmake_minimum_required (VERSION 3.0) 3 | project(tdigest) 4 | 5 | # CMake modules should be included in ${CMAKE_SOURCE_DIR}/cmake 6 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake) 7 | 8 | # --- Build options --- 9 | option(BUILD_SHARED "Build shared library" ON) 10 | option(BUILD_STATIC "Build static library" ON) 11 | option(BUILD_BENCHMARK "Build benchmark" ON) 12 | option(BUILD_TESTS "Build tests" ON) 13 | OPTION(ENABLE_CODECOVERAGE "Enable code coverage testing support" OFF) 14 | OPTION(ENABLE_PROFILE "Enable code profiling support" OFF) 15 | option(BUILD_EXAMPLES "Build examples" ON) 16 | 17 | # --- Build properties --- 18 | 19 | # Set a default build type if none was specified 20 | set(default_build_type "Release") 21 | 22 | IF(NOT CMAKE_BUILD_TYPE) 23 | message(STATUS "Setting build type to '${default_build_type}' as none was specified.") 24 | set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE 25 | STRING "Choose the type of build." FORCE) 26 | # Set the possible values of build type for cmake-gui 27 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS 28 | "Debug" "Release" "MinSizeRel" "RelWithDebInfo") 29 | ENDIF() 30 | 31 | 32 | if(ENABLE_SANITIZERS) 33 | message(STATUS "Forcing build type to Debug to run coverage.") 34 | set(CMAKE_BUILD_TYPE "Debug" CACHE 35 | STRING "Choose the type of build." FORCE) 36 | set (CMAKE_C_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wshadow -Wpointer-arith -Wcast-qual -Wunused -Wstrict-prototypes -Wmissing-prototypes -Wwrite-strings -Werror -fno-omit-frame-pointer -fsanitize=address") 37 | set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wshadow -Wpointer-arith -Wcast-qual -Wunused -Wstrict-prototypes -Wmissing-prototypes -Wwrite-strings -Werror -fno-omit-frame-pointer -fsanitize=address") 38 | set (CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") 39 | ENDIF() 40 | 41 | if(ENABLE_CODECOVERAGE) 42 | message(STATUS "Forcing build type to Debug to run coverage.") 43 | set(CMAKE_BUILD_TYPE "Debug" CACHE 44 | STRING "Choose the type of build." 
FORCE) 45 | # --- System Libraries --- 46 | include(GNUInstallDirs) 47 | include(UseCodeCoverage) 48 | ENDIF() 49 | 50 | # Generate position-independent code (-fPIC on UNIX) 51 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 52 | 53 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 -std=c99") 54 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") 55 | 56 | if(ENABLE_PROFILE) 57 | message(STATUS "Enabling profile flags.") 58 | string (REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) 59 | string (REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) 60 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -g -ggdb -fno-omit-frame-pointer") 61 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -g -ggdb -fno-omit-frame-pointer") 62 | # enable vectorization report flags 63 | # using Clang 64 | if (CMAKE_C_COMPILER_ID MATCHES "Clang") 65 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Rpass-analysis=loop-vectorize -Rpass=loop-vectorize -Rpass-missed=loop-vectorize") 66 | 67 | # using GCC 68 | elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU") 69 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -ftree-vectorize -fopt-info-vec-all") 70 | 71 | # using Intel C++ 72 | elseif (CMAKE_C_COMPILER_ID STREQUAL "Intel") 73 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -qopt-report=5 -qopt-report-phase=vec") 74 | 75 | # using Visual Studio C++ 76 | elseif (CMAKE_C_COMPILER_ID STREQUAL "MSVC") 77 | # TBD 78 | endif() 79 | endif(ENABLE_PROFILE) 80 | 81 | # --- Build directories --- 82 | add_subdirectory("src") 83 | 84 | # --- Documentation --- 85 | # TODO 86 | 87 | # --- Unit Tests --- 88 | ENABLE_TESTING() 89 | 90 | if(BUILD_TESTS OR BUILD_BENCHMARK) 91 | add_subdirectory("tests") 92 | endif() 93 | 94 | # --- Examples --- 95 | if(BUILD_EXAMPLES) 96 | add_subdirectory("examples") 97 | endif() 98 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2019 2 | COPYRIGHT HOLDER: Bob Rudis 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2019 Bob Rudis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #---------------------------------------------------------------------------------------------------- 2 | # simple Makefile for T-Digest, relies on cmake to do the actual build. 3 | # Use CMAKE_LIBRARY_OPTIONS,CMAKE_LIBRARY_SHARED_OPTIONS,CMAKE_LIBRARY_STATIC_OPTIONS or CMAKE_FULL_OPTIONS argument to this Makefile to pass options to cmake. 4 | #---------------------------------------------------------------------------------------------------- 5 | 6 | CC?=gcc 7 | INFER?=./deps/infer 8 | INFER_DOCKER?=redisbench/infer-linux64:1.0.0 9 | ROOT=$(shell pwd) 10 | SRCDIR := $(ROOT)/src 11 | TESTDIR := $(ROOT)/tests/unit 12 | BENCHDIR := $(ROOT)/tests/benchmark 13 | 14 | 15 | ifndef CMAKE_LIBRARY_SHARED_OPTIONS 16 | CMAKE_LIBRARY_SHARED_OPTIONS=\ 17 | -DBUILD_SHARED=ON \ 18 | -DBUILD_STATIC=OFF \ 19 | -DENABLE_CODECOVERAGE=OFF \ 20 | -DBUILD_TESTS=OFF \ 21 | -DBUILD_BENCHMARK=OFF \ 22 | -DBUILD_EXAMPLES=OFF 23 | endif 24 | 25 | ifndef CMAKE_LIBRARY_STATIC_OPTIONS 26 | CMAKE_LIBRARY_STATIC_OPTIONS=\ 27 | -DBUILD_SHARED=OFF \ 28 | -DBUILD_STATIC=ON \ 29 | -DENABLE_CODECOVERAGE=OFF \ 30 | -DBUILD_TESTS=OFF \ 31 | -DBUILD_BENCHMARK=OFF \ 32 | -DBUILD_EXAMPLES=OFF 33 | endif 34 | 35 | ifndef CMAKE_LIBRARY_OPTIONS 36 | CMAKE_LIBRARY_OPTIONS=\ 37 | -DBUILD_SHARED=ON \ 38 | -DBUILD_STATIC=ON \ 39 | -DENABLE_CODECOVERAGE=OFF \ 40 | -DBUILD_TESTS=OFF \ 41 | -DBUILD_EXAMPLES=OFF 42 | endif 43 | 44 | ifndef CMAKE_FULL_OPTIONS 45 | CMAKE_FULL_OPTIONS=\ 46 | -DBUILD_SHARED=ON \ 47 | -DBUILD_STATIC=ON \ 48 | -DBUILD_TESTS=ON \ 49 | -DBUILD_BENCHMARK=ON \ 50 | -DBUILD_EXAMPLES=ON 51 | endif 52 | 53 | 54 | ifndef CMAKE_PROFILE_OPTIONS 55 | CMAKE_PROFILE_OPTIONS=\ 56 | -DBUILD_SHARED=ON \ 57 | -DBUILD_STATIC=OFF \ 58 | -DENABLE_CODECOVERAGE=OFF \ 59 | -DBUILD_TESTS=OFF \ 60 | -DBUILD_BENCHMARK=ON \ 61 | -DBUILD_EXAMPLES=OFF \ 62 | -DENABLE_PROFILE=ON 63 | endif 64 | 65 | 66 | ifndef CMAKE_SANITIZE_OPTIONS 67 | CMAKE_SANITIZE_OPTIONS=\ 68 | -DBUILD_SHARED=ON \ 69 | -DBUILD_STATIC=OFF \ 70 | -DENABLE_CODECOVERAGE=OFF \ 71 | -DBUILD_TESTS=ON \ 72 | -DBUILD_BENCHMARK=OFF \ 73 | -DBUILD_EXAMPLES=OFF \ 74 | -DENABLE_PROFILE=OFF \ 75 | -DENABLE_SANITIZERS=ON 76 | endif 77 | 78 | ifndef CMAKE_TEST_OPTIONS 79 | CMAKE_TEST_OPTIONS=\ 80 | -DBUILD_SHARED=ON \ 81 | -DBUILD_STATIC=ON \ 82 | -DBUILD_TESTS=ON \ 83 | -DENABLE_CODECOVERAGE=ON \ 84 | -DBUILD_BENCHMARK=OFF \ 85 | -DBUILD_EXAMPLES=OFF 86 | endif 87 | 88 | ifndef CMAKE_BENCHMARK_OPTIONS 89 | CMAKE_BENCHMARK_OPTIONS=\ 90 | -DBUILD_SHARED=ON \ 91 | -DBUILD_STATIC=OFF \ 92 | -DENABLE_CODECOVERAGE=OFF \ 93 | -DBUILD_TESTS=OFF \ 94 | -DBUILD_BENCHMARK=ON \ 95 | -DBUILD_EXAMPLES=OFF \ 96 | -DENABLE_PROFILE=OFF 97 | endif 98 | 99 | default: full 100 | 101 | # just build the static library. Do not build tests or benchmarks 102 | library_static: 103 | ( mkdir -p build; cd build ; cmake $(CMAKE_LIBRARY_STATIC_OPTIONS) .. ; $(MAKE) ) 104 | 105 | # just build the shared library. Do not build tests or benchmarks 106 | library_shared: 107 | ( mkdir -p build; cd build ; cmake $(CMAKE_LIBRARY_SHARED_OPTIONS) .. ; $(MAKE) ) 108 | 109 | # just build the static and shared libraries. Do not build tests or benchmarks 110 | library_all: 111 | ( mkdir -p build; cd build ; cmake $(CMAKE_LIBRARY_OPTIONS) .. 
; $(MAKE) ) 112 | 113 | # just build the static and shared libraries and produce measurements 114 | # of accuracy versus compression factor for fixed data size 115 | # TODO: 116 | 117 | # just build the static and shared libraries and tests 118 | unit_tests: 119 | ( mkdir -p build; cd build ; cmake $(CMAKE_TEST_OPTIONS) .. ; $(MAKE) ; $(MAKE) test) 120 | 121 | test: 122 | $(MAKE) unit_tests 123 | 124 | coverage: 125 | ( mkdir -p build; cd build ; cmake $(CMAKE_TEST_OPTIONS) .. ; $(MAKE) ; $(MAKE) test; make coverage; ) 126 | 127 | format: 128 | clang-format -style=file -i $(SRCDIR)/*.c 129 | clang-format -style=file -i $(SRCDIR)/*.h 130 | clang-format -style=file -i $(TESTDIR)/*.c 131 | clang-format -style=file -i $(TESTDIR)/*.h 132 | clang-format -style=file -i $(BENCHDIR)/*.cpp 133 | 134 | lint: 135 | clang-format -style=file -Werror -n $(SRCDIR)/*.c 136 | clang-format -style=file -Werror -n $(SRCDIR)/*.h 137 | clang-format -style=file -Werror -n $(TESTDIR)/*.c 138 | clang-format -style=file -Werror -n $(TESTDIR)/*.h 139 | clang-format -style=file -Werror -n $(BENCHDIR)/*.cpp 140 | 141 | # build all 142 | full: 143 | ( mkdir -p build; cd build ; cmake $(CMAKE_FULL_OPTIONS) .. ; $(MAKE) ) 144 | 145 | # static-analysis-docker: 146 | # $(MAKE) clean 147 | # docker run -v $(ROOT)/:/t-digest-c/ --user "$(username):$(usergroup)" $(INFER_DOCKER) bash -c "cd t-digest-c && CC=clang infer run --keep-going --fail-on-issue --biabduction -- make test" 148 | 149 | clean: distclean 150 | 151 | distclean: 152 | rm -rf build/* 153 | 154 | sanitize: clean 155 | ( mkdir -p build; cd build ; cmake $(CMAKE_SANITIZE_OPTIONS) .. ; $(MAKE) VERBOSE=1 ) 156 | $(SHOW) build/tests/td_test 157 | 158 | profile: clean 159 | ( mkdir -p build; cd build ; cmake $(CMAKE_PROFILE_OPTIONS) .. ; $(MAKE) VERBOSE=1 2> $(basename $@).compiler_stedrr_output.txt ) 160 | 161 | bench: clean 162 | ( mkdir -p build; cd build ; cmake $(CMAKE_BENCHMARK_OPTIONS) .. ; $(MAKE) VERBOSE=1 ) 163 | $(SHOW) build/tests/histogram_benchmark --benchmark_min_time=5 --benchmark_out=results.json --benchmark_out_format=json 164 | 165 | bench-quantile: clean 166 | ( mkdir -p build; cd build ; cmake $(CMAKE_BENCHMARK_OPTIONS) .. ; $(MAKE) VERBOSE=1 ) 167 | $(SHOW) build/tests/histogram_benchmark --benchmark_min_time=5 --benchmark_filter="BM_td_quantile_lognormal_dist_given_array*|BM_td_quantiles_*" 168 | 169 | perf-stat-bench: 170 | ( mkdir -p build; cd build ; cmake $(CMAKE_PROFILE_OPTIONS) .. ; $(MAKE) VERBOSE=1 ) 171 | $(SHOW) perf stat build/tests/histogram_benchmark --benchmark_min_time=10 172 | 173 | perf-record-bench: clean 174 | ( mkdir -p build; cd build ; cmake $(CMAKE_PROFILE_OPTIONS) .. 
; $(MAKE) VERBOSE=1 ) 175 | $(SHOW) perf record -g -o perf.data.td_add \ 176 | build/tests/histogram_benchmark 177 | 178 | perf-report-bench: 179 | $(SHOW) perf report -g "graph,0.5,caller" -i perf.data.td_add 180 | 181 | perf-report-bench-pprof: 182 | go tool pprof -web perf.data.td_add 183 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ![License](https://img.shields.io/badge/License-MIT-blue.svg) 3 | [![CircleCI](https://circleci.com/gh/RedisBloom/t-digest-c.svg?style=svg)](https://circleci.com/gh/RedisBloom/t-digest-c) 4 | [![codecov](https://codecov.io/gh/RedisBloom/t-digest-c/branch/master/graph/badge.svg)](https://codecov.io/gh/RedisBloom/t-digest-c) 5 | 6 | # T-Digest 7 | 8 | Adaptive histogram based on something like streaming k-means crossed with Q-digest. 9 | 10 | 11 | This implementation is a descendant of Ted Dunning's MergingDigest, available at: 12 | [https://github.com/tdunning/t-digest/](https://github.com/tdunning/t-digest/) 13 | 14 | 15 | And contains the work of Andrew Werner, originally available at: 16 | [https://github.com/ajwerner/tdigestc](https://github.com/ajwerner/tdigestc) 17 | 18 | ## Description 19 | 20 | The t-Digest construction algorithm uses a variant of 1-dimensional 21 | k-means clustering to produce a very compact data structure that allows 22 | accurate estimation of quantiles. This t-Digest data structure can be 23 | used to estimate quantiles, compute other rank statistics or even to 24 | estimate related measures like trimmed means. The advantage of the 25 | t-Digest over previous digests for this purpose is that the t-Digest 26 | handles data with full floating point resolution. The accuracy of 27 | quantile estimates produced by t-Digests can be orders of magnitude more 28 | accurate than those produced by previous digest algorithms. Methods are 29 | provided to create and update t-Digests and retrieve quantiles from the 30 | accumulated distributions. 31 | 32 | See [the original paper by Ted Dunning & Otmar 33 | Ertl](https://arxiv.org/abs/1902.04023) for more details on t-Digests. 34 | 35 | ## What’s Inside 36 | 37 | The following functions are implemented (a minimal usage sketch follows the list): 38 | 39 | - `td_add`: Add a value to the t-Digest with the specified count 40 | - `td_new`: Allocate a new histogram 41 | - `td_reset`: Empty out a histogram and re-initialize it 42 | - `td_free`: Frees the memory associated with the t-Digest 43 | - `td_compress`: Re-examines the t-Digest to determine whether some centroids are redundant 44 | - `td_merge`: Merge one t-Digest into another 45 | - `td_cdf`: Returns the fraction of all points added which are ≤ x. 46 | - `td_quantile`: Returns an estimate of the cutoff such that a specified fraction of the data added to the t-Digest would be less than or equal to the cutoff. 47 | - `td_quantiles`: Returns estimates of the cutoffs such that the specified fractions of the data added to the t-Digest would be less than or equal to the given cutoffs. 48 | - `td_size`: Return the number of points that have been added to the t-Digest 49 | - `td_centroid_count`: Return the number of centroids being used by the t-Digest 50 | - `td_min`: Get the minimum value from the histogram. Will return __DBL_MAX__ if the histogram is empty 51 | - `td_max`: Get the maximum value from the histogram. Will return -__DBL_MAX__ if the histogram is empty 52 | - `td_trimmed_mean`: Returns the trimmed mean ignoring values outside given cutoff upper and lower limits 53 | - `td_trimmed_mean_symmetric`: Returns the trimmed mean ignoring values outside a given symmetric cutoff limit 54 |
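A minimal usage sketch of the API above (not part of the repository sources; error handling is omitted and the compression value of 100 is an arbitrary choice):

```c
#include <stdio.h>
#include "tdigest.h"

int main(void) {
    td_histogram_t *t = td_new(100);   /* compression factor of 100 */
    for (int i = 1; i <= 1000; i++) {
        td_add(t, (double)i, 1);       /* add value i with weight 1 */
    }
    td_compress(t);                    /* fold any buffered points into the centroids */
    printf("median estimate: %f\n", td_quantile(t, 0.5));
    printf("cdf(250) estimate: %f\n", td_cdf(t, 250.0));
    td_free(t);
    return 0;
}
```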
55 | ## Build notes 56 | 57 | ``` 58 | # Build 59 | git clone https://github.com/RedisBloom/t-digest-c.git 60 | cd t-digest-c/ 61 | git submodule update --init --recursive 62 | make 63 | ``` 64 | 65 | ## Testing 66 | Assuming you've followed the previous build steps, it should be as easy as: 67 | ``` 68 | # Run the unit tests 69 | make test 70 | ``` 71 | 72 | ## Benchmarking 73 | 74 | Assuming you've followed the previous build steps, it should be as easy as: 75 | ``` 76 | # Run the benchmark 77 | make bench 78 | ``` 79 | 80 | ## Code of Conduct 81 | 82 | Please note that this project is released with a Contributor Code of 83 | Conduct. By participating in this project you agree to abide by its 84 | terms. 85 | -------------------------------------------------------------------------------- /cmake/UseCodeCoverage.cmake: -------------------------------------------------------------------------------- 1 | # - Enable Code Coverage 2 | # 3 | # Variables you may define are: 4 | # CODECOV_HTMLOUTPUTDIR - the name of the directory where HTML results are placed. Defaults to "coverage_results" 5 | # CODECOV_XMLOUTPUTFILE - the name of the file where XML results are placed. Defaults to "coverage.xml" 6 | # CODECOV_GCOVR_OPTIONS - additional options given to gcovr commands. 7 | # 8 | 9 | if(ENABLE_CODECOVERAGE) 10 | 11 | if ( NOT CMAKE_BUILD_TYPE STREQUAL "Debug" ) 12 | message( WARNING "Code coverage results with an optimised (non-Debug) build may be misleading" ) 13 | endif ( NOT CMAKE_BUILD_TYPE STREQUAL "Debug" ) 14 | 15 | if ( NOT DEFINED CODECOV_OUTPUTFILE ) 16 | set( CODECOV_OUTPUTFILE coverage.info ) 17 | endif ( NOT DEFINED CODECOV_OUTPUTFILE ) 18 | 19 | if ( NOT DEFINED CODECOV_HTMLOUTPUTDIR ) 20 | set( CODECOV_HTMLOUTPUTDIR coverage_results ) 21 | endif ( NOT DEFINED CODECOV_HTMLOUTPUTDIR ) 22 | 23 | if ( CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX ) 24 | find_program( CODECOV_GCOV gcov ) 25 | find_program( CODECOV_LCOV lcov ) 26 | find_program( CODECOV_GENHTML genhtml ) 27 | add_definitions( -fprofile-arcs -ftest-coverage ) 28 | link_libraries( gcov ) 29 | set( CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS} --coverage ) 30 | add_custom_target( coverage_init ALL ${CODECOV_LCOV} --base-directory . --directory ${CMAKE_BINARY_DIR}/src --output-file ${CODECOV_OUTPUTFILE} --capture --initial ) 31 | add_custom_target( coverage ${CODECOV_LCOV} --base-directory .
--directory ${CMAKE_BINARY_DIR}/src --output-file ${CODECOV_OUTPUTFILE} --capture COMMAND genhtml -o ${CODECOV_HTMLOUTPUTDIR} ${CODECOV_OUTPUTFILE} ) 32 | endif ( CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX ) 33 | 34 | endif(ENABLE_CODECOVERAGE) 35 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(quantile_example quantile_example.c) 2 | target_link_libraries(quantile_example tdigest) -------------------------------------------------------------------------------- /examples/quantile_example.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include "tdigest.h" 3 | 4 | #define STREAM_SIZE 1000000 5 | 6 | static inline double randMToN(double M, double N) 7 | { 8 | return M + (rand() / (RAND_MAX / (N - M))); 9 | } 10 | 11 | 12 | int main() 13 | { 14 | 15 | td_histogram_t *mdigest = td_new(500); 16 | printf("compression is %f capacity is %d\n", mdigest->compression, mdigest->cap); 17 | double seeds[STREAM_SIZE]; 18 | for (int i = 0; i < STREAM_SIZE; ++i) 19 | { 20 | seeds[i] = randMToN(0, 10); 21 | } 22 | 23 | for (int i = 0; i < STREAM_SIZE; ++i) 24 | { 25 | td_add(mdigest, seeds[i], 1); 26 | } 27 | td_compress(mdigest); 28 | for (int i = 0; i < 10; ++i) 29 | { 30 | const double v = seeds[i]; 31 | printf("value %f is at percentile %f\n", v, td_cdf(mdigest, v)); 32 | } 33 | printf("\n"); 34 | for (int i = 0; i <= 100; i += 10) 35 | { 36 | printf("%d percentile has value %f\n", i, td_quantile(mdigest, i / 100.0)); 37 | } 38 | } -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | FILE(GLOB c_files "*.c") 2 | FILE(GLOB header_files "*.h") 3 | 4 | if (BUILD_SHARED) 5 | add_library(tdigest SHARED ${c_files} ${header_files}) 6 | target_link_libraries(tdigest m) 7 | target_include_directories(tdigest SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 8 | set_target_properties(tdigest PROPERTIES PUBLIC_HEADER "${header_files}") 9 | install(TARGETS tdigest DESTINATION lib${LIB_SUFFIX} PUBLIC_HEADER DESTINATION include) 10 | endif(BUILD_SHARED) 11 | 12 | if (BUILD_STATIC) 13 | add_library(tdigest_static STATIC ${c_files} ${header_files}) 14 | target_link_libraries(tdigest_static m) 15 | target_include_directories(tdigest_static SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 16 | set_target_properties(tdigest_static PROPERTIES PUBLIC_HEADER "${header_files}") 17 | install(TARGETS tdigest_static DESTINATION lib${LIB_SUFFIX} PUBLIC_HEADER DESTINATION include) 18 | endif(BUILD_STATIC) 19 | -------------------------------------------------------------------------------- /src/td_malloc.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Adaptive histogram based on something like streaming k-means crossed with Q-digest. 3 | * The implementation is a direct descendant of MergingDigest 4 | * https://github.com/tdunning/t-digest/ 5 | * 6 | * Copyright (c) 2021 Redis, All rights reserved. 7 | * 8 | * Allocator selection. 9 | * 10 | * This file is used in order to change the t-digest allocator at compile time.
11 | * Just define the following defines to what you want to use. Also add 12 | * the include of your alternate allocator if needed (not needed in order 13 | * to use the default libc allocator). */ 14 | 15 | #ifndef TD_ALLOC_H 16 | #define TD_ALLOC_H 17 | #define __td_malloc malloc 18 | #define __td_calloc calloc 19 | #define __td_realloc realloc 20 | #define __td_free free 21 | #endif 22 | -------------------------------------------------------------------------------- /src/tdigest.c: -------------------------------------------------------------------------------- 1 | #include <stdbool.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <math.h> 5 | #include "tdigest.h" 6 | #include <errno.h> 7 | #include <stdint.h> 8 | 9 | #ifndef TD_MALLOC_INCLUDE 10 | #define TD_MALLOC_INCLUDE "td_malloc.h" 11 | #endif 12 | 13 | #include TD_MALLOC_INCLUDE 14 | 15 | #define __td_max(x, y) (((x) > (y)) ? (x) : (y)) 16 | #define __td_min(x, y) (((x) < (y)) ? (x) : (y)) 17 | 18 | static inline double weighted_average_sorted(double x1, double w1, double x2, double w2) { 19 | const double x = (x1 * w1 + x2 * w2) / (w1 + w2); 20 | return __td_max(x1, __td_min(x, x2)); 21 | } 22 | 23 | static inline bool _tdigest_long_long_add_safe(long long a, long long b) { 24 | if (b < 0) { 25 | return (a >= __LONG_LONG_MAX__ - b); 26 | } else { 27 | return (a <= __LONG_LONG_MAX__ - b); 28 | } 29 | } 30 | 31 | static inline double weighted_average(double x1, double w1, double x2, double w2) { 32 | if (x1 <= x2) { 33 | return weighted_average_sorted(x1, w1, x2, w2); 34 | } else { 35 | return weighted_average_sorted(x2, w2, x1, w1); 36 | } 37 | } 38 | 39 | static void inline swap(double *arr, int i, int j) { 40 | const double temp = arr[i]; 41 | arr[i] = arr[j]; 42 | arr[j] = temp; 43 | } 44 | 45 | static void inline swap_l(long long *arr, int i, int j) { 46 | const long long temp = arr[i]; 47 | arr[i] = arr[j]; 48 | arr[j] = temp; 49 | } 50 | 51 | static unsigned int partition(double *means, long long *weights, unsigned int start, 52 | unsigned int end, unsigned int pivot_idx) { 53 | const double pivotMean = means[pivot_idx]; 54 | swap(means, pivot_idx, end); 55 | swap_l(weights, pivot_idx, end); 56 | 57 | int i = start - 1; 58 | 59 | for (unsigned int j = start; j < end; j++) { 60 | // If current element is smaller than the pivot 61 | if (means[j] < pivotMean) { 62 | // increment index of smaller element 63 | i++; 64 | swap(means, i, j); 65 | swap_l(weights, i, j); 66 | } 67 | } 68 | swap(means, i + 1, end); 69 | swap_l(weights, i + 1, end); 70 | return i + 1; 71 | } 72 | 73 | /** 74 | * Standard quick sort except that sorting rearranges parallel arrays 75 | * 76 | * @param means Values to sort on 77 | * @param weights The auxiliary values to sort.
78 | * @param start The beginning of the values to sort 79 | * @param end The value after the last value to sort 80 | */ 81 | static void td_qsort(double *means, long long *weights, unsigned int start, unsigned int end) { 82 | if (start < end) { 83 | // two elements can be directly compared 84 | if ((end - start) == 1) { 85 | if (means[start] > means[end]) { 86 | swap(means, start, end); 87 | swap_l(weights, start, end); 88 | } 89 | return; 90 | } 91 | // generating a random number as a pivot was very expensive vs the array size 92 | // const unsigned int pivot_idx = start + rand()%(end - start + 1); 93 | const unsigned int pivot_idx = (end + start) / 2; // central pivot 94 | const unsigned int new_pivot_idx = partition(means, weights, start, end, pivot_idx); 95 | if (new_pivot_idx > start) { 96 | td_qsort(means, weights, start, new_pivot_idx - 1); 97 | } 98 | td_qsort(means, weights, new_pivot_idx + 1, end); 99 | } 100 | } 101 | 102 | static inline size_t cap_from_compression(double compression) { 103 | if ((size_t)compression > ((SIZE_MAX / sizeof(double) / 6) - 10)) { 104 | return 0; 105 | } 106 | 107 | return (6 * (size_t)(compression)) + 10; 108 | } 109 | 110 | static inline bool should_td_compress(td_histogram_t *h) { 111 | return ((h->merged_nodes + h->unmerged_nodes) >= (h->cap - 1)); 112 | } 113 | 114 | static inline int next_node(td_histogram_t *h) { return h->merged_nodes + h->unmerged_nodes; } 115 | 116 | int td_compress(td_histogram_t *h); 117 | 118 | static inline int _check_overflow(const double v) { 119 | // double-precision overflow detected on h->unmerged_weight 120 | if (v == INFINITY) { 121 | return EDOM; 122 | } 123 | return 0; 124 | } 125 | 126 | static inline int _check_td_overflow(const double new_unmerged_weight, 127 | const double new_total_weight) { 128 | // double-precision overflow detected on h->unmerged_weight 129 | if (new_unmerged_weight == INFINITY) { 130 | return EDOM; 131 | } 132 | if (new_total_weight == INFINITY) { 133 | return EDOM; 134 | } 135 | const double denom = 2 * MM_PI * new_total_weight * log(new_total_weight); 136 | if (denom == INFINITY) { 137 | return EDOM; 138 | } 139 | 140 | return 0; 141 | } 142 | 143 | int td_centroid_count(td_histogram_t *h) { return next_node(h); } 144 | 145 | void td_reset(td_histogram_t *h) { 146 | if (!h) { 147 | return; 148 | } 149 | h->min = __DBL_MAX__; 150 | h->max = -h->min; 151 | h->merged_nodes = 0; 152 | h->merged_weight = 0; 153 | h->unmerged_nodes = 0; 154 | h->unmerged_weight = 0; 155 | h->total_compressions = 0; 156 | } 157 | 158 | int td_init(double compression, td_histogram_t **result) { 159 | 160 | const size_t capacity = cap_from_compression(compression); 161 | if (capacity < 1) { 162 | return 1; 163 | } 164 | td_histogram_t *histogram; 165 | histogram = (td_histogram_t *)__td_malloc(sizeof(td_histogram_t)); 166 | if (!histogram) { 167 | return 1; 168 | } 169 | histogram->cap = capacity; 170 | histogram->compression = (double)compression; 171 | td_reset(histogram); 172 | histogram->nodes_mean = (double *)__td_calloc(capacity, sizeof(double)); 173 | if (!histogram->nodes_mean) { 174 | td_free(histogram); 175 | return 1; 176 | } 177 | histogram->nodes_weight = (long long *)__td_calloc(capacity, sizeof(long long)); 178 | if (!histogram->nodes_weight) { 179 | td_free(histogram); 180 | return 1; 181 | } 182 | *result = histogram; 183 | 184 | return 0; 185 | } 186 | 187 | td_histogram_t *td_new(double compression) { 188 | td_histogram_t *mdigest = NULL; 189 | td_init(compression, &mdigest); 190 | return 
mdigest; 191 | } 192 | 193 | void td_free(td_histogram_t *histogram) { 194 | if (histogram->nodes_mean) { 195 | __td_free((void *)(histogram->nodes_mean)); 196 | } 197 | if (histogram->nodes_weight) { 198 | __td_free((void *)(histogram->nodes_weight)); 199 | } 200 | __td_free((void *)(histogram)); 201 | } 202 | 203 | int td_merge(td_histogram_t *into, td_histogram_t *from) { 204 | if (td_compress(into) != 0) 205 | return EDOM; 206 | if (td_compress(from) != 0) 207 | return EDOM; 208 | const int pos = from->merged_nodes + from->unmerged_nodes; 209 | for (int i = 0; i < pos; i++) { 210 | const double mean = from->nodes_mean[i]; 211 | const long long weight = from->nodes_weight[i]; 212 | if (td_add(into, mean, weight) != 0) { 213 | return EDOM; 214 | } 215 | } 216 | return 0; 217 | } 218 | 219 | long long td_size(td_histogram_t *h) { return h->merged_weight + h->unmerged_weight; } 220 | 221 | double td_cdf(td_histogram_t *h, double val) { 222 | td_compress(h); 223 | // no data to examine 224 | if (h->merged_nodes == 0) { 225 | return NAN; 226 | } 227 | // bellow lower bound 228 | if (val < h->min) { 229 | return 0; 230 | } 231 | // above upper bound 232 | if (val > h->max) { 233 | return 1; 234 | } 235 | if (h->merged_nodes == 1) { 236 | // exactly one centroid, should have max==min 237 | const double width = h->max - h->min; 238 | if (val - h->min <= width) { 239 | // min and max are too close together to do any viable interpolation 240 | return 0.5; 241 | } else { 242 | // interpolate if somehow we have weight > 0 and max != min 243 | return (val - h->min) / width; 244 | } 245 | } 246 | const int n = h->merged_nodes; 247 | // check for the left tail 248 | const double left_centroid_mean = h->nodes_mean[0]; 249 | const double left_centroid_weight = (double)h->nodes_weight[0]; 250 | const double merged_weight_d = (double)h->merged_weight; 251 | if (val < left_centroid_mean) { 252 | // note that this is different than h->nodes_mean[0] > min 253 | // ... 
this guarantees we divide by non-zero number and interpolation works 254 | const double width = left_centroid_mean - h->min; 255 | if (width > 0) { 256 | // must be a sample exactly at min 257 | if (val == h->min) { 258 | return 0.5 / merged_weight_d; 259 | } else { 260 | return (1 + (val - h->min) / width * (left_centroid_weight / 2 - 1)) / 261 | merged_weight_d; 262 | } 263 | } else { 264 | // this should be redundant with the check val < h->min 265 | return 0; 266 | } 267 | } 268 | // and the right tail 269 | const double right_centroid_mean = h->nodes_mean[n - 1]; 270 | const double right_centroid_weight = (double)h->nodes_weight[n - 1]; 271 | if (val > right_centroid_mean) { 272 | const double width = h->max - right_centroid_mean; 273 | if (width > 0) { 274 | if (val == h->max) { 275 | return 1 - 0.5 / merged_weight_d; 276 | } else { 277 | // there has to be a single sample exactly at max 278 | const double dq = (1 + (h->max - val) / width * (right_centroid_weight / 2 - 1)) / 279 | merged_weight_d; 280 | return 1 - dq; 281 | } 282 | } else { 283 | return 1; 284 | } 285 | } 286 | // we know that there are at least two centroids and mean[0] < x < mean[n-1] 287 | // that means that there are either one or more consecutive centroids all at exactly x 288 | // or there are consecutive centroids, c0 < x < c1 289 | double weightSoFar = 0; 290 | for (int it = 0; it < n - 1; it++) { 291 | // weightSoFar does not include weight[it] yet 292 | if (h->nodes_mean[it] == val) { 293 | // we have one or more centroids == x, treat them as one 294 | // dw will accumulate the weight of all of the centroids at x 295 | double dw = 0; 296 | while (it < n && h->nodes_mean[it] == val) { 297 | dw += (double)h->nodes_weight[it]; 298 | it++; 299 | } 300 | return (weightSoFar + dw / 2) / (double)h->merged_weight; 301 | } else if (h->nodes_mean[it] <= val && val < h->nodes_mean[it + 1]) { 302 | const double node_weight = (double)h->nodes_weight[it]; 303 | const double node_weight_next = (double)h->nodes_weight[it + 1]; 304 | const double node_mean = h->nodes_mean[it]; 305 | const double node_mean_next = h->nodes_mean[it + 1]; 306 | // landed between centroids ... 
check for floating point madness 307 | if (node_mean_next - node_mean > 0) { 308 | // note how we handle singleton centroids here 309 | // the point is that for singleton centroids, we know that their entire 310 | // weight is exactly at the centroid and thus shouldn't be involved in 311 | // interpolation 312 | double leftExcludedW = 0; 313 | double rightExcludedW = 0; 314 | if (node_weight == 1) { 315 | if (node_weight_next == 1) { 316 | // two singletons means no interpolation 317 | // left singleton is in, right is out 318 | return (weightSoFar + 1) / merged_weight_d; 319 | } else { 320 | leftExcludedW = 0.5; 321 | } 322 | } else if (node_weight_next == 1) { 323 | rightExcludedW = 0.5; 324 | } 325 | double dw = (node_weight + node_weight_next) / 2; 326 | 327 | // adjust endpoints for any singleton 328 | double dwNoSingleton = dw - leftExcludedW - rightExcludedW; 329 | 330 | double base = weightSoFar + node_weight / 2 + leftExcludedW; 331 | return (base + dwNoSingleton * (val - node_mean) / (node_mean_next - node_mean)) / 332 | merged_weight_d; 333 | } else { 334 | // this is simply caution against floating point madness 335 | // it is conceivable that the centroids will be different 336 | // but too near to allow safe interpolation 337 | double dw = (node_weight + node_weight_next) / 2; 338 | return (weightSoFar + dw) / merged_weight_d; 339 | } 340 | } else { 341 | weightSoFar += (double)h->nodes_weight[it]; 342 | } 343 | } 344 | return 1 - 0.5 / merged_weight_d; 345 | } 346 | 347 | static double td_internal_iterate_centroids_to_index(const td_histogram_t *h, const double index, 348 | const double left_centroid_weight, 349 | const int total_centroids, double *weightSoFar, 350 | int *node_pos) { 351 | if (left_centroid_weight > 1 && index < left_centroid_weight / 2) { 352 | // there is a single sample at min so we interpolate with less weight 353 | return h->min + (index - 1) / (left_centroid_weight / 2 - 1) * (h->nodes_mean[0] - h->min); 354 | } 355 | 356 | // usually the last centroid will have unit weight so this test will make it moot 357 | if (index > h->merged_weight - 1) { 358 | return h->max; 359 | } 360 | 361 | // if the right-most centroid has more than one sample, we still know 362 | // that one sample occurred at max so we can do some interpolation 363 | const double right_centroid_weight = (double)h->nodes_weight[total_centroids - 1]; 364 | const double right_centroid_mean = h->nodes_mean[total_centroids - 1]; 365 | if (right_centroid_weight > 1 && 366 | (double)h->merged_weight - index <= right_centroid_weight / 2) { 367 | return h->max - ((double)h->merged_weight - index - 1) / (right_centroid_weight / 2 - 1) * 368 | (h->max - right_centroid_mean); 369 | } 370 | 371 | for (; *node_pos < total_centroids - 1; (*node_pos)++) { 372 | const int i = *node_pos; 373 | const double node_weight = (double)h->nodes_weight[i]; 374 | const double node_weight_next = (double)h->nodes_weight[i + 1]; 375 | const double node_mean = h->nodes_mean[i]; 376 | const double node_mean_next = h->nodes_mean[i + 1]; 377 | const double dw = (node_weight + node_weight_next) / 2; 378 | if (*weightSoFar + dw > index) { 379 | // centroids i and i+1 bracket our current point 380 | // check for unit weight 381 | double leftUnit = 0; 382 | if (node_weight == 1) { 383 | if (index - *weightSoFar < 0.5) { 384 | // within the singleton's sphere 385 | return node_mean; 386 | } else { 387 | leftUnit = 0.5; 388 | } 389 | } 390 | double rightUnit = 0; 391 | if (node_weight_next == 1) { 392 | if (*weightSoFar + dw - 
index <= 0.5) { 393 | // no interpolation needed near singleton 394 | return node_mean_next; 395 | } 396 | rightUnit = 0.5; 397 | } 398 | const double z1 = index - *weightSoFar - leftUnit; 399 | const double z2 = *weightSoFar + dw - index - rightUnit; 400 | return weighted_average(node_mean, z2, node_mean_next, z1); 401 | } 402 | *weightSoFar += dw; 403 | } 404 | 405 | // weightSoFar = totalWeight - weight[total_centroids-1]/2 (very nearly) 406 | // so we interpolate out to max value ever seen 407 | const double z1 = index - h->merged_weight - right_centroid_weight / 2.0; 408 | const double z2 = right_centroid_weight / 2 - z1; 409 | return weighted_average(right_centroid_mean, z1, h->max, z2); 410 | } 411 | 412 | double td_quantile(td_histogram_t *h, double q) { 413 | td_compress(h); 414 | // q should be in [0,1] 415 | if (q < 0.0 || q > 1.0 || h->merged_nodes == 0) { 416 | return NAN; 417 | } 418 | // with one data point, all quantiles lead to Rome 419 | if (h->merged_nodes == 1) { 420 | return h->nodes_mean[0]; 421 | } 422 | 423 | // if values were stored in a sorted array, index would be the offset we are interested in 424 | const double index = q * (double)h->merged_weight; 425 | 426 | // beyond the boundaries, we return min or max 427 | // usually, the first centroid will have unit weight so this will make it moot 428 | if (index < 1) { 429 | return h->min; 430 | } 431 | 432 | // we know that there are at least two centroids now 433 | const int n = h->merged_nodes; 434 | 435 | // if the left centroid has more than one sample, we still know 436 | // that one sample occurred at min so we can do some interpolation 437 | const double left_centroid_weight = (double)h->nodes_weight[0]; 438 | 439 | // in between extremes we interpolate between centroids 440 | double weightSoFar = left_centroid_weight / 2; 441 | int i = 0; 442 | return td_internal_iterate_centroids_to_index(h, index, left_centroid_weight, n, &weightSoFar, 443 | &i); 444 | } 445 | 446 | int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length) { 447 | td_compress(h); 448 | 449 | if (NULL == quantiles || NULL == values) { 450 | return EINVAL; 451 | } 452 | 453 | const int n = h->merged_nodes; 454 | if (n == 0) { 455 | for (size_t i = 0; i < length; i++) { 456 | values[i] = NAN; 457 | } 458 | return 0; 459 | } 460 | if (n == 1) { 461 | for (size_t i = 0; i < length; i++) { 462 | const double requested_quantile = quantiles[i]; 463 | 464 | // q should be in [0,1] 465 | if (requested_quantile < 0.0 || requested_quantile > 1.0) { 466 | values[i] = NAN; 467 | } else { 468 | // with one data point, all quantiles lead to Rome 469 | values[i] = h->nodes_mean[0]; 470 | } 471 | } 472 | return 0; 473 | } 474 | 475 | // we know that there are at least two centroids now 476 | // if the left centroid has more than one sample, we still know 477 | // that one sample occurred at min so we can do some interpolation 478 | const double left_centroid_weight = (double)h->nodes_weight[0]; 479 | 480 | // in between extremes we interpolate between centroids 481 | double weightSoFar = left_centroid_weight / 2; 482 | int node_pos = 0; 483 | 484 | // to avoid allocations we use the values array for intermediate computation 485 | // i.e. 
to store the expected cumulative count at each percentile 486 | for (size_t qpos = 0; qpos < length; qpos++) { 487 | const double index = quantiles[qpos] * (double)h->merged_weight; 488 | values[qpos] = td_internal_iterate_centroids_to_index(h, index, left_centroid_weight, n, 489 | &weightSoFar, &node_pos); 490 | } 491 | return 0; 492 | } 493 | 494 | static double td_internal_trimmed_mean(const td_histogram_t *h, const double leftmost_weight, 495 | const double rightmost_weight) { 496 | double count_done = 0; 497 | double trimmed_sum = 0; 498 | double trimmed_count = 0; 499 | for (int i = 0; i < h->merged_nodes; i++) { 500 | 501 | const double n_weight = (double)h->nodes_weight[i]; 502 | // Assume the whole centroid falls into the range 503 | double count_add = n_weight; 504 | 505 | // If we haven't reached the low threshold yet, skip appropriate part of the centroid. 506 | count_add -= __td_min(__td_max(0, leftmost_weight - count_done), count_add); 507 | 508 | // If we have reached the upper threshold, ignore the overflowing part of the centroid. 509 | 510 | count_add = __td_min(__td_max(0, rightmost_weight - count_done), count_add); 511 | 512 | // consider the whole centroid processed 513 | count_done += n_weight; 514 | 515 | // increment the sum / count 516 | trimmed_sum += h->nodes_mean[i] * count_add; 517 | trimmed_count += count_add; 518 | 519 | // break once we cross the high threshold 520 | if (count_done >= rightmost_weight) 521 | break; 522 | } 523 | 524 | return trimmed_sum / trimmed_count; 525 | } 526 | 527 | double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut) { 528 | td_compress(h); 529 | // proportion_to_cut should be in [0,1] 530 | if (h->merged_nodes == 0 || proportion_to_cut < 0.0 || proportion_to_cut > 1.0) { 531 | return NAN; 532 | } 533 | // with one data point, all values lead to Rome 534 | if (h->merged_nodes == 1) { 535 | return h->nodes_mean[0]; 536 | } 537 | 538 | /* translate the percentiles to counts */ 539 | const double leftmost_weight = floor((double)h->merged_weight * proportion_to_cut); 540 | const double rightmost_weight = ceil((double)h->merged_weight * (1.0 - proportion_to_cut)); 541 | 542 | return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight); 543 | } 544 | 545 | double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut) { 546 | td_compress(h); 547 | // leftmost_cut and rightmost_cut should be in [0,1] 548 | if (h->merged_nodes == 0 || leftmost_cut < 0.0 || leftmost_cut > 1.0 || rightmost_cut < 0.0 || 549 | rightmost_cut > 1.0) { 550 | return NAN; 551 | } 552 | // with one data point, all values lead to Rome 553 | if (h->merged_nodes == 1) { 554 | return h->nodes_mean[0]; 555 | } 556 | 557 | /* translate the percentiles to counts */ 558 | const double leftmost_weight = floor((double)h->merged_weight * leftmost_cut); 559 | const double rightmost_weight = ceil((double)h->merged_weight * rightmost_cut); 560 | 561 | return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight); 562 | } 563 | 564 | int td_add(td_histogram_t *h, double mean, long long weight) { 565 | if (should_td_compress(h)) { 566 | const int overflow_res = td_compress(h); 567 | if (overflow_res != 0) 568 | return overflow_res; 569 | } 570 | const int pos = next_node(h); 571 | if (pos >= h->cap) 572 | return EDOM; 573 | if (_tdigest_long_long_add_safe(h->unmerged_weight, weight) == false) 574 | return EDOM; 575 | const long long new_unmerged_weight = h->unmerged_weight + weight; 576 | if 
(_tdigest_long_long_add_safe(new_unmerged_weight, h->merged_weight) == false) 577 | return EDOM; 578 | const long long new_total_weight = new_unmerged_weight + h->merged_weight; 579 | // double-precision overflow detected 580 | const int overflow_res = 581 | _check_td_overflow((double)new_unmerged_weight, (double)new_total_weight); 582 | if (overflow_res != 0) 583 | return overflow_res; 584 | 585 | if (mean < h->min) { 586 | h->min = mean; 587 | } 588 | if (mean > h->max) { 589 | h->max = mean; 590 | } 591 | h->nodes_mean[pos] = mean; 592 | h->nodes_weight[pos] = weight; 593 | h->unmerged_nodes++; 594 | h->unmerged_weight = new_unmerged_weight; 595 | return 0; 596 | } 597 | 598 | int td_compress(td_histogram_t *h) { 599 | if (h->unmerged_nodes == 0) { 600 | return 0; 601 | } 602 | int N = h->merged_nodes + h->unmerged_nodes; 603 | td_qsort(h->nodes_mean, h->nodes_weight, 0, N - 1); 604 | const double total_weight = (double)h->merged_weight + (double)h->unmerged_weight; 605 | // double-precision overflow detected 606 | const int overflow_res = _check_td_overflow((double)h->unmerged_weight, (double)total_weight); 607 | if (overflow_res != 0) 608 | return overflow_res; 609 | if (total_weight <= 1) 610 | return 0; 611 | const double denom = 2 * MM_PI * total_weight * log(total_weight); 612 | if (_check_overflow(denom) != 0) 613 | return EDOM; 614 | 615 | // Compute the normalizer given compression and number of points. 616 | const double normalizer = h->compression / denom; 617 | if (_check_overflow(normalizer) != 0) 618 | return EDOM; 619 | int cur = 0; 620 | double weight_so_far = 0; 621 | 622 | for (int i = 1; i < N; i++) { 623 | const double proposed_weight = (double)h->nodes_weight[cur] + (double)h->nodes_weight[i]; 624 | const double z = proposed_weight * normalizer; 625 | // quantile up to cur 626 | const double q0 = weight_so_far / total_weight; 627 | // quantile up to cur + i 628 | const double q2 = (weight_so_far + proposed_weight) / total_weight; 629 | // Convert a quantile to the k-scale 630 | const bool should_add = (z <= (q0 * (1 - q0))) && (z <= (q2 * (1 - q2))); 631 | // next point will fit 632 | // so merge into existing centroid 633 | if (should_add) { 634 | h->nodes_weight[cur] += h->nodes_weight[i]; 635 | const double delta = h->nodes_mean[i] - h->nodes_mean[cur]; 636 | const double weighted_delta = (delta * h->nodes_weight[i]) / h->nodes_weight[cur]; 637 | h->nodes_mean[cur] += weighted_delta; 638 | } else { 639 | weight_so_far += h->nodes_weight[cur]; 640 | cur++; 641 | h->nodes_weight[cur] = h->nodes_weight[i]; 642 | h->nodes_mean[cur] = h->nodes_mean[i]; 643 | } 644 | if (cur != i) { 645 | h->nodes_weight[i] = 0; 646 | h->nodes_mean[i] = 0.0; 647 | } 648 | } 649 | h->merged_nodes = cur + 1; 650 | h->merged_weight = total_weight; 651 | h->unmerged_nodes = 0; 652 | h->unmerged_weight = 0; 653 | h->total_compressions++; 654 | return 0; 655 | } 656 | 657 | double td_min(td_histogram_t *h) { return h->min; } 658 | 659 | double td_max(td_histogram_t *h) { return h->max; } 660 | 661 | int td_compression(td_histogram_t *h) { return h->compression; } 662 | 663 | const long long *td_centroids_weight(td_histogram_t *h) { return h->nodes_weight; } 664 | 665 | const double *td_centroids_mean(td_histogram_t *h) { return h->nodes_mean; } 666 | 667 | long long td_centroids_weight_at(td_histogram_t *h, int pos) { return h->nodes_weight[pos]; } 668 | 669 | double td_centroids_mean_at(td_histogram_t *h, int pos) { 670 | if (pos < 0 || pos > h->merged_nodes) { 671 | return NAN; 672 | } 673 
| return h->nodes_mean[pos]; 674 | } 675 | -------------------------------------------------------------------------------- /src/tdigest.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <stdlib.h> 3 | 4 | /** 5 | * Adaptive histogram based on something like streaming k-means crossed with Q-digest. 6 | * The implementation is a direct descendant of MergingDigest 7 | * https://github.com/tdunning/t-digest/ 8 | * 9 | * Copyright (c) 2021 Redis, All rights reserved. 10 | * Copyright (c) 2018 Andrew Werner, All rights reserved. 11 | * 12 | * The special characteristics of this algorithm are: 13 | * 14 | * - smaller summaries than Q-digest 15 | * 16 | * - provides part per million accuracy for extreme quantiles and typically <1000 ppm accuracy 17 | * for middle quantiles 18 | * 19 | * - fast 20 | * 21 | * - simple 22 | * 23 | * - easy to adapt for use with map-reduce 24 | */ 25 | 26 | #define MM_PI 3.14159265358979323846 27 | 28 | struct td_histogram { 29 | // compression is a setting used to configure the size of centroids when merged. 30 | double compression; 31 | 32 | double min; 33 | double max; 34 | 35 | // cap is the total size of nodes 36 | int cap; 37 | // merged_nodes is the number of merged nodes at the front of nodes. 38 | int merged_nodes; 39 | // unmerged_nodes is the number of buffered nodes. 40 | int unmerged_nodes; 41 | 42 | // we run the merge in reverse every other merge to avoid left-to-right bias in merging 43 | long long total_compressions; 44 | 45 | long long merged_weight; 46 | long long unmerged_weight; 47 | 48 | double *nodes_mean; 49 | long long *nodes_weight; 50 | }; 51 | 52 | typedef struct td_histogram td_histogram_t; 53 | 54 | #ifdef __cplusplus 55 | extern "C" { 56 | #endif 57 | 58 | /** 59 | * Allocate the memory, initialise the t-digest, and return the histogram. 60 | * @param compression The compression parameter. 61 | * 100 is a common value for normal uses. 62 | * 1000 is extremely large. 63 | * The number of centroids retained will be a smallish (usually less than 10) multiple of this 64 | * number. 65 | * @return the histogram on success, NULL if allocation failed. 66 | */ 67 | td_histogram_t *td_new(double compression); 68 | 69 | /** 70 | * Allocate the memory and initialise the t-digest. 71 | * 72 | * @param compression The compression parameter. 73 | * 100 is a common value for normal uses. 74 | * 1000 is extremely large. 75 | * The number of centroids retained will be a smallish (usually less than 10) multiple of this 76 | * number. 77 | * @param result Output parameter to capture allocated histogram. 78 | * @return 0 on success, 1 if allocation failed. 79 | */ 80 | int td_init(double compression, td_histogram_t **result); 81 | 82 | /** 83 | * Frees the memory associated with the t-digest. 84 | * 85 | * @param h The histogram you want to free. 86 | */ 87 | void td_free(td_histogram_t *h); 88 | 89 | /** 90 | * Reset a histogram to zero - empty out a histogram and re-initialise it 91 | * 92 | * If you want to re-use an existing histogram, but reset everything back to zero, this 93 | * is the routine to use. 94 | * 95 | * @param h The histogram you want to reset to empty. 96 | * 97 | */ 98 | void td_reset(td_histogram_t *h); 99 | 100 | /** 101 | * Adds a sample to a histogram. 102 | * 103 | * @param val The value to add. 104 | * @param weight The weight of this point. 105 | * @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided 106 | * weight.
107 | * 108 | */ 109 | int td_add(td_histogram_t *h, double val, long long weight); 110 | 111 | /** 112 | * Re-examines a t-digest to determine whether some centroids are redundant. If your data are 113 | * perversely ordered, this may be a good idea. Even if not, this may save 20% or so in space. 114 | * 115 | * The cost is roughly the same as adding as many data points as there are centroids. This 116 | * is typically < 10 * compression, but could be as high as 100 * compression. 117 | * This is a destructive operation that is not thread-safe. 118 | * 119 | * @param h The histogram you want to compress. 120 | * @return 0 on success, EDOM if overflow was detected during the compression. 121 | * If overflow is detected the histogram is not changed. 122 | * 123 | */ 124 | int td_compress(td_histogram_t *h); 125 | 126 | /** 127 | * Merges all of the values from 'from' into 'this' histogram. 128 | * 129 | * @param h "This" pointer 130 | * @param from Histogram to copy values from. 131 | * @return 0 on success, EDOM if overflow was detected as a consequence of merging the 132 | * provided histogram. If overflow is detected the original histogram is not modified. 133 | */ 134 | int td_merge(td_histogram_t *h, td_histogram_t *from); 135 | 136 | /** 137 | * Returns the fraction of all points added which are ≤ x. 138 | * 139 | * @param x The cutoff for the cdf. 140 | * @return The fraction of all data which is less than or equal to x. 141 | */ 142 | double td_cdf(td_histogram_t *h, double x); 143 | 144 | /** 145 | * Returns an estimate of the cutoff such that a specified fraction of the data 146 | * added to this TDigest would be less than or equal to the cutoff. 147 | * 148 | * @param q The desired fraction 149 | * @return The value x such that cdf(x) == q. 150 | */ 151 | double td_quantile(td_histogram_t *h, double q); 152 | 153 | /** 154 | * Returns estimates of the cutoffs such that the specified fractions of the data 155 | * added to this TDigest would be less than or equal to the cutoffs. 156 | * 157 | * @param quantiles The ordered percentiles array to get the values for. 158 | * @param values Destination array containing the values at the given quantiles. 159 | * The values array should be allocated by the caller. 160 | * @return 0 on success, EINVAL if the provided quantiles or destination array is null. 161 | */ 162 | int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length); 163 | 164 | /** 165 | * Returns the trimmed mean ignoring values outside given cutoff upper and lower limits. 166 | * 167 | * @param leftmost_cut Fraction to cut off of the left tail of the distribution. 168 | * @param rightmost_cut Fraction to cut off of the right tail of the distribution. 169 | * @return The trimmed mean ignoring values outside given cutoff upper and lower limits. 170 | */ 171 | double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut); 172 | 173 | /** 174 | * Returns the trimmed mean ignoring values outside a given symmetric cutoff limit. 175 | * 176 | * @param proportion_to_cut Fraction to cut off of the left and right tails of the distribution. 177 | * @return The trimmed mean ignoring values outside the given cutoff upper and lower limits. 178 | */ 179 | double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut); 180 | 181 | /** 182 | * Returns the current compression factor. 183 | * 184 | * @return The compression factor originally used to set up the TDigest.
185 | */ 186 | int td_compression(td_histogram_t *h); 187 | 188 | /** 189 | * Returns the number of points that have been added to this TDigest. 190 | * 191 | * @return The sum of the weights on all centroids. 192 | */ 193 | long long td_size(td_histogram_t *h); 194 | 195 | /** 196 | * Returns the number of centroids being used by this TDigest. 197 | * 198 | * @return The number of centroids being used. 199 | */ 200 | int td_centroid_count(td_histogram_t *h); 201 | 202 | /** 203 | * Get minimum value from the histogram. Will return __DBL_MAX__ if the histogram 204 | * is empty. 205 | * 206 | * @param h "This" pointer 207 | */ 208 | double td_min(td_histogram_t *h); 209 | 210 | /** 211 | * Get maximum value from the histogram. Will return - __DBL_MAX__ if the histogram 212 | * is empty. 213 | * 214 | * @param h "This" pointer 215 | */ 216 | double td_max(td_histogram_t *h); 217 | 218 | /** 219 | * Get the full centroids weight array for 'this' histogram. 220 | * 221 | * @param h "This" pointer 222 | * 223 | * @return The full centroids weight array. 224 | */ 225 | const long long *td_centroids_weight(td_histogram_t *h); 226 | 227 | /** 228 | * Get the full centroids mean array for 'this' histogram. 229 | * 230 | * @param h "This" pointer 231 | * 232 | * @return The full centroids mean array. 233 | */ 234 | const double *td_centroids_mean(td_histogram_t *h); 235 | 236 | /** 237 | * Get the centroid weight for 'this' histogram and 'pos'. 238 | * 239 | * @param h "This" pointer 240 | * @param pos centroid position. 241 | * 242 | * @return The centroid weight. 243 | */ 244 | long long td_centroids_weight_at(td_histogram_t *h, int pos); 245 | 246 | /** 247 | * Get the centroid mean for 'this' histogram and 'pos'. 248 | * 249 | * @param h "This" pointer 250 | * @param pos centroid position. 251 | * 252 | * @return The centroid mean. 
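 *
 * A minimal iteration sketch over the centroid accessors above (illustrative
 * only; assumes td_compress(h) was called first so every node is merged):
 *
 *     for (int i = 0; i < td_centroid_count(h); i++) {
 *         double    mean   = td_centroids_mean_at(h, i);
 *         long long weight = td_centroids_weight_at(h, i);
 *         // consume the (mean, weight) pair, e.g. for serialisation
 *     }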
253 | */ 254 | double td_centroids_mean_at(td_histogram_t *h, int pos); 255 | 256 | #ifdef __cplusplus 257 | } 258 | #endif 259 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | if (BUILD_BENCHMARK) 3 | if (UNIX) 4 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O2 -g -ggdb -fno-omit-frame-pointer") 5 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -g -ggdb -fno-omit-frame-pointer") 6 | set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Suppressing benchmark's tests" FORCE) 7 | add_subdirectory(vendor/google/benchmark) 8 | include_directories(vendor/google/benchmark/include) 9 | add_executable(histogram_benchmark benchmark/histogram_benchmark.cpp) 10 | target_link_libraries(histogram_benchmark tdigest benchmark::benchmark) 11 | else() 12 | message(WARNING 13 | "google.benchmark - microbenchmarks disabled on WIN32 platforms") 14 | endif() 15 | endif() 16 | if (BUILD_TESTS) 17 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c99") 18 | add_executable(td_test unit/td_test.c unit/minunit.h) 19 | target_link_libraries(td_test tdigest m) 20 | enable_testing() 21 | add_test(td_test td_test) 22 | endif() 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /tests/benchmark/histogram_benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "tdigest.h" 3 | #include 4 | #include 5 | 6 | #ifdef _WIN32 7 | #pragma comment(lib, "Shlwapi.lib") 8 | #ifdef _DEBUG 9 | #pragma comment(lib, "benchmarkd.lib") 10 | #else 11 | #pragma comment(lib, "benchmark.lib") 12 | #endif 13 | #endif 14 | 15 | int64_t min_value = 1; 16 | int64_t min_compression = 100; 17 | int64_t max_compression = 500; 18 | int64_t step_compression_unit = 100; 19 | 20 | static void generate_arguments_pairs(benchmark::internal::Benchmark *b) { 21 | for (int64_t compression = min_compression; compression <= max_compression; 22 | compression += step_compression_unit) { 23 | b = b->ArgPair((double)compression, INT64_C(10000000)); 24 | } 25 | } 26 | 27 | static void BM_td_add_uniform_dist(benchmark::State &state) { 28 | const double compression = state.range(0); 29 | const int64_t stream_size = state.range(1); 30 | td_histogram_t *mdigest = td_new(compression); 31 | std::vector input; 32 | input.resize(stream_size, 0); 33 | std::mt19937_64 rng; 34 | rng.seed(std::random_device()()); 35 | std::uniform_real_distribution dist(0, 1); 36 | 37 | for (double &i : input) { 38 | i = dist(rng); 39 | } 40 | 41 | while (state.KeepRunning()) { 42 | for (int i = 0; i < stream_size; ++i) { 43 | td_add(mdigest, input[i], 1); 44 | } 45 | td_compress(mdigest); 46 | // read/write barrier 47 | benchmark::ClobberMemory(); 48 | state.SetItemsProcessed(stream_size); 49 | // Set the counter as a thread-average quantity. It will 50 | // be presented divided by the number of threads ( in our case just one thread ). 
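        // The counters below are overwritten on every benchmark iteration, so
        // the reported Centroid_Count and Total_Compressions describe the
        // digest state after the most recent add + compress pass.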
51 | state.counters["Centroid_Count"] = 52 | benchmark::Counter(td_centroid_count(mdigest), benchmark::Counter::kAvgThreads); 53 | state.counters["Total_Compressions"] = 54 | benchmark::Counter(mdigest->total_compressions, benchmark::Counter::kAvgThreads); 55 | } 56 | } 57 | 58 | static void BM_td_add_lognormal_dist(benchmark::State &state) { 59 | const double compression = state.range(0); 60 | const int64_t stream_size = state.range(1); 61 | td_histogram_t *mdigest = td_new(compression); 62 | std::vector input; 63 | input.resize(stream_size, 0); 64 | std::mt19937_64 rng; 65 | rng.seed(std::random_device()()); 66 | std::lognormal_distribution dist(1, 0.5); 67 | 68 | for (double &i : input) { 69 | i = dist(rng); 70 | } 71 | 72 | while (state.KeepRunning()) { 73 | for (int i = 0; i < stream_size; ++i) { 74 | td_add(mdigest, input[i], 1); 75 | } 76 | td_compress(mdigest); 77 | // read/write barrier 78 | benchmark::ClobberMemory(); 79 | state.SetItemsProcessed(stream_size); 80 | // Set the counter as a thread-average quantity. It will 81 | // be presented divided by the number of threads ( in our case just one thread ). 82 | state.counters["Centroid_Count"] = 83 | benchmark::Counter(td_centroid_count(mdigest), benchmark::Counter::kAvgThreads); 84 | state.counters["Total_Compressions"] = 85 | benchmark::Counter(mdigest->total_compressions, benchmark::Counter::kAvgThreads); 86 | } 87 | } 88 | 89 | static void BM_td_quantile_lognormal_dist(benchmark::State &state) { 90 | const double compression = state.range(0); 91 | const int64_t stream_size = state.range(1); 92 | td_histogram_t *mdigest = td_new(compression); 93 | std::vector input; 94 | input.resize(stream_size, 0); 95 | std::mt19937_64 rng; 96 | rng.seed(std::random_device()()); 97 | std::uniform_real_distribution dist(0, 1); 98 | std::lognormal_distribution distSamples(1, 0.5); 99 | 100 | for (double &i : input) { 101 | i = dist(rng); 102 | td_add(mdigest, distSamples(rng), 1); 103 | } 104 | td_compress(mdigest); 105 | 106 | while (state.KeepRunning()) { 107 | for (int i = 0; i < stream_size; ++i) { 108 | td_quantile(mdigest, input[i]); 109 | } 110 | // read/write barrier 111 | benchmark::ClobberMemory(); 112 | state.SetItemsProcessed(stream_size); 113 | // Set the counter as a thread-average quantity. It will 114 | // be presented divided by the number of threads ( in our case just one thread ). 
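        // input[] holds uniform(0,1) draws that are passed directly as the
        // quantile argument q, so each iteration sweeps queries across the
        // full quantile range of the pre-built lognormal digest.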
115 | state.counters["Centroid_Count"] = 116 | benchmark::Counter(td_centroid_count(mdigest), benchmark::Counter::kAvgThreads); 117 | state.counters["Total_Compressions"] = 118 | benchmark::Counter(mdigest->total_compressions, benchmark::Counter::kAvgThreads); 119 | } 120 | } 121 | 122 | static void BM_td_merge_lognormal_dist(benchmark::State &state) { 123 | const double compression = state.range(0); 124 | const int64_t stream_size = 100000; 125 | td_histogram_t *mdigest = td_new(compression); 126 | td_histogram_t *mdigest2 = td_new(compression); 127 | std::vector input; 128 | input.resize(stream_size, 0); 129 | std::mt19937_64 rng; 130 | rng.seed(std::random_device()()); 131 | std::uniform_real_distribution dist(0, 1); 132 | std::lognormal_distribution distSamples(1, 0.5); 133 | 134 | for (double &i : input) { 135 | i = dist(rng); 136 | td_add(mdigest, distSamples(rng), 1); 137 | td_add(mdigest2, distSamples(rng), 1); 138 | } 139 | td_compress(mdigest); 140 | 141 | while (state.KeepRunning()) { 142 | for (int i = 0; i < stream_size; ++i) { 143 | td_merge(mdigest, mdigest2); 144 | } 145 | // read/write barrier 146 | benchmark::ClobberMemory(); 147 | state.SetItemsProcessed(stream_size); 148 | // Set the counter as a thread-average quantity. It will 149 | // be presented divided by the number of threads ( in our case just one thread ). 150 | state.counters["Centroid_Count"] = 151 | benchmark::Counter(td_centroid_count(mdigest), benchmark::Counter::kAvgThreads); 152 | state.counters["Total_Compressions"] = 153 | benchmark::Counter(mdigest->total_compressions, benchmark::Counter::kAvgThreads); 154 | } 155 | } 156 | 157 | static void BM_td_trimmed_mean_symmetric_lognormal_dist(benchmark::State &state) { 158 | const double compression = state.range(0); 159 | const int64_t stream_size = state.range(1); 160 | td_histogram_t *mdigest = td_new(compression); 161 | std::vector input; 162 | input.resize(stream_size, 0); 163 | std::mt19937_64 rng; 164 | rng.seed(std::random_device()()); 165 | std::uniform_real_distribution dist(0, 1); 166 | std::lognormal_distribution distSamples(1, 0.5); 167 | 168 | for (double &i : input) { 169 | i = dist(rng); 170 | td_add(mdigest, distSamples(rng), 1); 171 | } 172 | td_compress(mdigest); 173 | 174 | while (state.KeepRunning()) { 175 | for (int i = 0; i < stream_size; ++i) { 176 | td_trimmed_mean_symmetric(mdigest, input[i]); 177 | } 178 | // read/write barrier 179 | benchmark::ClobberMemory(); 180 | state.SetItemsProcessed(stream_size); 181 | // Set the counter as a thread-average quantity. It will 182 | // be presented divided by the number of threads ( in our case just one thread ). 
183 | state.counters["Centroid_Count"] = 184 | benchmark::Counter(td_centroid_count(mdigest), benchmark::Counter::kAvgThreads); 185 | state.counters["Total_Compressions"] = 186 | benchmark::Counter(mdigest->total_compressions, benchmark::Counter::kAvgThreads); 187 | } 188 | } 189 | 190 | static void BM_td_quantile_lognormal_dist_given_array(benchmark::State &state) { 191 | const double compression = state.range(0); 192 | const int64_t stream_size = state.range(1); 193 | td_histogram_t *mdigest = td_new(compression); 194 | std::vector input; 195 | input.resize(stream_size, 0); 196 | std::mt19937_64 rng; 197 | rng.seed(12345); 198 | std::lognormal_distribution distSamples(1, 0.5); 199 | const double percentile_list[4] = {50.0, 95.0, 99.0, 99.9}; 200 | 201 | for (double &i : input) { 202 | td_add(mdigest, distSamples(rng), 1); 203 | } 204 | td_compress(mdigest); 205 | int64_t items_processed = 0; 206 | for (auto _ : state) { 207 | for (auto percentile : percentile_list) { 208 | benchmark::DoNotOptimize(td_quantile(mdigest, percentile)); 209 | // read/write barrier 210 | benchmark::ClobberMemory(); 211 | } 212 | items_processed += 4; 213 | // read/write barrier 214 | benchmark::ClobberMemory(); 215 | state.SetItemsProcessed(stream_size); 216 | // Set the counter as a thread-average quantity. It will 217 | // be presented divided by the number of threads ( in our case just one thread ). 218 | state.counters["Centroid_Count"] = 219 | benchmark::Counter(td_centroid_count(mdigest), benchmark::Counter::kAvgThreads); 220 | state.counters["Total_Compressions"] = 221 | benchmark::Counter(mdigest->total_compressions, benchmark::Counter::kAvgThreads); 222 | } 223 | } 224 | 225 | static void BM_td_quantiles_lognormal_dist_given_array(benchmark::State &state) { 226 | const double compression = state.range(0); 227 | const int64_t stream_size = state.range(1); 228 | td_histogram_t *mdigest = td_new(compression); 229 | std::vector input; 230 | input.resize(stream_size, 0); 231 | std::mt19937_64 rng; 232 | rng.seed(12345); 233 | std::lognormal_distribution distSamples(1, 0.5); 234 | const double percentile_list[4] = {50.0, 95.0, 99.0, 99.9}; 235 | double values[4] = {.0}; 236 | 237 | for (double &i : input) { 238 | td_add(mdigest, distSamples(rng), 1); 239 | } 240 | td_compress(mdigest); 241 | int64_t items_processed = 0; 242 | for (auto _ : state) { 243 | benchmark::DoNotOptimize(td_quantiles(mdigest, percentile_list, values, 4)); 244 | items_processed += 4; 245 | // read/write barrier 246 | benchmark::ClobberMemory(); 247 | state.SetItemsProcessed(stream_size); 248 | // Set the counter as a thread-average quantity. It will 249 | // be presented divided by the number of threads ( in our case just one thread ). 
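        // Note: percentile_list holds 50.0, 95.0, 99.0 and 99.9, while the
        // header documents the quantile argument as a fraction in [0, 1] and
        // the unit tests report NaN for out-of-range values, so the two
        // *_given_array benchmarks appear to exercise the out-of-range path.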
250 | state.counters["Centroid_Count"] = 251 | benchmark::Counter(td_centroid_count(mdigest), benchmark::Counter::kAvgThreads); 252 | state.counters["Total_Compressions"] = 253 | benchmark::Counter(mdigest->total_compressions, benchmark::Counter::kAvgThreads); 254 | } 255 | } 256 | 257 | // Register the functions as a benchmark 258 | BENCHMARK(BM_td_add_uniform_dist)->Apply(generate_arguments_pairs); 259 | BENCHMARK(BM_td_add_lognormal_dist)->Apply(generate_arguments_pairs); 260 | BENCHMARK(BM_td_quantile_lognormal_dist)->Apply(generate_arguments_pairs); 261 | BENCHMARK(BM_td_quantile_lognormal_dist_given_array)->Apply(generate_arguments_pairs); 262 | BENCHMARK(BM_td_quantiles_lognormal_dist_given_array)->Apply(generate_arguments_pairs); 263 | BENCHMARK(BM_td_merge_lognormal_dist)->Apply(generate_arguments_pairs); 264 | BENCHMARK(BM_td_trimmed_mean_symmetric_lognormal_dist)->Apply(generate_arguments_pairs); 265 | 266 | BENCHMARK_MAIN(); -------------------------------------------------------------------------------- /tests/unit/minunit.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 David Siñuela Pastor, siu.4coders@gmail.com 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining 5 | * a copy of this software and associated documentation files (the 6 | * "Software"), to deal in the Software without restriction, including 7 | * without limitation the rights to use, copy, modify, merge, publish, 8 | * distribute, sublicense, and/or sell copies of the Software, and to 9 | * permit persons to whom the Software is furnished to do so, subject to 10 | * the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be 13 | * included in all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 19 | * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 20 | * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 21 | * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | */ 23 | // clang-format off 24 | #ifndef MINUNIT_MINUNIT_H 25 | #define MINUNIT_MINUNIT_H 26 | 27 | #ifdef __cplusplus 28 | extern "C" { 29 | #endif 30 | 31 | #if defined(_WIN32) 32 | #include 33 | #if defined(_MSC_VER) && _MSC_VER < 1900 34 | #define snprintf _snprintf 35 | #define __func__ __FUNCTION__ 36 | #endif 37 | 38 | #elif defined(__unix__) || defined(__unix) || defined(unix) || \ 39 | (defined(__APPLE__) && defined(__MACH__)) 40 | 41 | /* Change POSIX C SOURCE version for pure c99 compilers */ 42 | #if !defined(_POSIX_C_SOURCE) || _POSIX_C_SOURCE < 200112L 43 | #undef _POSIX_C_SOURCE 44 | #define _POSIX_C_SOURCE 200112L 45 | #endif 46 | 47 | #include /* POSIX flags */ 48 | #include /* clock_gettime(), time() */ 49 | #include /* gethrtime(), gettimeofday() */ 50 | #include 51 | #include 52 | #include 53 | 54 | #if defined(__MACH__) && defined(__APPLE__) 55 | #include 56 | #include 57 | #endif 58 | 59 | #if __GNUC__ >= 5 && !defined(__STDC_VERSION__) 60 | #define __func__ __extension__ __FUNCTION__ 61 | #endif 62 | 63 | #else 64 | #error "Unable to define timers for an unknown OS." 
65 | #endif 66 | 67 | #include 68 | #include 69 | 70 | /* Maximum length of last message */ 71 | #define MINUNIT_MESSAGE_LEN 1024 72 | /* Accuracy with which floats are compared */ 73 | #define MINUNIT_EPSILON 1E-12 74 | 75 | /* Misc. counters */ 76 | static int minunit_run = 0; 77 | static int minunit_assert = 0; 78 | static int minunit_fail = 0; 79 | static int minunit_status = 0; 80 | 81 | /* Timers */ 82 | static double minunit_real_timer = 0; 83 | static double minunit_proc_timer = 0; 84 | 85 | /* Last message */ 86 | static char minunit_last_message[MINUNIT_MESSAGE_LEN]; 87 | 88 | /* Test setup and teardown function pointers */ 89 | static void (*minunit_setup)(void) = NULL; 90 | static void (*minunit_teardown)(void) = NULL; 91 | 92 | /* Definitions */ 93 | #define MU_TEST(method_name) static void method_name(void) 94 | #define MU_TEST_SUITE(suite_name) static void suite_name(void) 95 | 96 | #define MU__SAFE_BLOCK(block) \ 97 | do { \ 98 | block \ 99 | } while (0) 100 | 101 | /* Run test suite and unset setup and teardown functions */ 102 | #define MU_RUN_SUITE(suite_name) \ 103 | MU__SAFE_BLOCK(suite_name(); minunit_setup = NULL; minunit_teardown = NULL;) 104 | 105 | /* Configure setup and teardown functions */ 106 | #define MU_SUITE_CONFIGURE(setup_fun, teardown_fun) \ 107 | MU__SAFE_BLOCK(minunit_setup = setup_fun; minunit_teardown = teardown_fun;) 108 | 109 | /* Test runner */ 110 | #define MU_RUN_TEST(test) \ 111 | MU__SAFE_BLOCK( \ 112 | if (minunit_real_timer == 0 && minunit_proc_timer == 0) { \ 113 | minunit_real_timer = mu_timer_real(); \ 114 | minunit_proc_timer = mu_timer_cpu(); \ 115 | } if (minunit_setup) (*minunit_setup)(); \ 116 | minunit_status = 0; test(); minunit_run++; if (minunit_status) { \ 117 | minunit_fail++; \ 118 | printf("F"); \ 119 | printf("\n%s\n", minunit_last_message); \ 120 | } fflush(stdout); \ 121 | if (minunit_teardown)(*minunit_teardown)();) 122 | 123 | /* Report */ 124 | #define MU_REPORT() \ 125 | MU__SAFE_BLOCK(double minunit_end_real_timer; double minunit_end_proc_timer; \ 126 | printf("\n\n%d tests, %d assertions, %d failures\n", minunit_run, \ 127 | minunit_assert, minunit_fail); \ 128 | minunit_end_real_timer = mu_timer_real(); \ 129 | minunit_end_proc_timer = mu_timer_cpu(); \ 130 | printf("\nFinished in %.8f seconds (real) %.8f seconds (proc)\n\n", \ 131 | minunit_end_real_timer - minunit_real_timer, \ 132 | minunit_end_proc_timer - minunit_proc_timer);) 133 | #define MU_EXIT_CODE minunit_fail 134 | 135 | /* Assertions */ 136 | #define mu_check(test) \ 137 | MU__SAFE_BLOCK( \ 138 | minunit_assert++; if (!(test)) { \ 139 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, "%s failed:\n\t%s:%d: %s", \ 140 | __func__, __FILE__, __LINE__, #test); \ 141 | minunit_status = 1; \ 142 | return; \ 143 | } else { printf("."); }) 144 | 145 | #define mu_fail(message) \ 146 | MU__SAFE_BLOCK(minunit_assert++; \ 147 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, "%s failed:\n\t%s:%d: %s", \ 148 | __func__, __FILE__, __LINE__, message); \ 149 | minunit_status = 1; return;) 150 | 151 | #define mu_assert(test, message) \ 152 | MU__SAFE_BLOCK( \ 153 | minunit_assert++; if (!(test)) { \ 154 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, "%s failed:\n\t%s:%d: %s", \ 155 | __func__, __FILE__, __LINE__, message); \ 156 | minunit_status = 1; \ 157 | return; \ 158 | } else { printf("."); }) 159 | 160 | #define mu_assert_long_eq(expected, result) \ 161 | MU__SAFE_BLOCK( \ 162 | long long minunit_tmp_e; long long minunit_tmp_r; minunit_assert++; \ 
163 | minunit_tmp_e = (expected); minunit_tmp_r = (result); \ 164 | if (minunit_tmp_e != minunit_tmp_r) { \ 165 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, \ 166 | "%s failed:\n\t%s:%d: %lld expected but was %lld", __func__, __FILE__, \ 167 | __LINE__, minunit_tmp_e, minunit_tmp_r); \ 168 | minunit_status = 1; \ 169 | return; \ 170 | } else { printf("."); }) 171 | 172 | #define mu_assert_int_eq(expected, result) \ 173 | MU__SAFE_BLOCK( \ 174 | int minunit_tmp_e; int minunit_tmp_r; minunit_assert++; minunit_tmp_e = (expected); \ 175 | minunit_tmp_r = (result); if (minunit_tmp_e != minunit_tmp_r) { \ 176 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, \ 177 | "%s failed:\n\t%s:%d: %d expected but was %d", __func__, __FILE__, __LINE__, \ 178 | minunit_tmp_e, minunit_tmp_r); \ 179 | minunit_status = 1; \ 180 | return; \ 181 | } else { printf("."); }) 182 | 183 | #define mu_assert_double_eq(expected, result) \ 184 | MU__SAFE_BLOCK( \ 185 | double minunit_tmp_e; double minunit_tmp_r; minunit_assert++; minunit_tmp_e = (expected); \ 186 | minunit_tmp_r = (result); if (fabs(minunit_tmp_e - minunit_tmp_r) > MINUNIT_EPSILON) { \ 187 | int minunit_significant_figures = 1 - log10(MINUNIT_EPSILON); \ 188 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, \ 189 | "%s failed:\n\t%s:%d: %.*g expected but was %.*g", __func__, __FILE__, \ 190 | __LINE__, minunit_significant_figures, minunit_tmp_e, \ 191 | minunit_significant_figures, minunit_tmp_r); \ 192 | minunit_status = 1; \ 193 | return; \ 194 | } else { printf("."); }) 195 | 196 | #define mu_assert_double_eq_epsilon(expected, result, epsilon) \ 197 | MU__SAFE_BLOCK( \ 198 | double minunit_tmp_e; double minunit_tmp_r; minunit_assert++; minunit_tmp_e = (expected); \ 199 | minunit_tmp_r = (result); if (fabs(minunit_tmp_e - minunit_tmp_r) > epsilon) { \ 200 | int minunit_significant_figures = 1 - log10(epsilon); \ 201 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, \ 202 | "%s failed:\n\t%s:%d: %.*g expected but was %.*g ( using epsilon %.*g )", \ 203 | __func__, __FILE__, __LINE__, minunit_significant_figures, minunit_tmp_e, \ 204 | minunit_significant_figures, minunit_tmp_r, minunit_significant_figures, \ 205 | epsilon); \ 206 | minunit_status = 1; \ 207 | return; \ 208 | } else { printf("."); }) 209 | 210 | #define mu_assert_string_eq(expected, result) \ 211 | MU__SAFE_BLOCK( \ 212 | const char *minunit_tmp_e = expected; const char *minunit_tmp_r = result; \ 213 | minunit_assert++; \ 214 | if (!minunit_tmp_e) { minunit_tmp_e = ""; } if (!minunit_tmp_r) { \ 215 | minunit_tmp_r = ""; \ 216 | } if (strcmp(minunit_tmp_e, minunit_tmp_r)) { \ 217 | snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, \ 218 | "%s failed:\n\t%s:%d: '%s' expected but was '%s'", __func__, __FILE__, \ 219 | __LINE__, minunit_tmp_e, minunit_tmp_r); \ 220 | minunit_status = 1; \ 221 | return; \ 222 | } else { printf("."); }) 223 | 224 | /* 225 | * The following two functions were written by David Robert Nadeau 226 | * from http://NadeauSoftware.com/ and distributed under the 227 | * Creative Commons Attribution 3.0 Unported License 228 | */ 229 | 230 | /** 231 | * Returns the real time, in seconds, or -1.0 if an error occurred. 232 | * 233 | * Time is measured since an arbitrary and OS-dependent start time. 234 | * The returned real time is only useful for computing an elapsed time 235 | * between two calls to this function. 236 | */ 237 | static double mu_timer_real(void) { 238 | #if defined(_WIN32) 239 | /* Windows 2000 and later. 
---------------------------------- */ 240 | LARGE_INTEGER Time; 241 | LARGE_INTEGER Frequency; 242 | 243 | QueryPerformanceFrequency(&Frequency); 244 | QueryPerformanceCounter(&Time); 245 | 246 | Time.QuadPart *= 1000000; 247 | Time.QuadPart /= Frequency.QuadPart; 248 | 249 | return (double)Time.QuadPart / 1000000.0; 250 | 251 | #elif (defined(__hpux) || defined(hpux)) || \ 252 | ((defined(__sun__) || defined(__sun) || defined(sun)) && \ 253 | (defined(__SVR4) || defined(__svr4__))) 254 | /* HP-UX, Solaris. ------------------------------------------ */ 255 | return (double)gethrtime() / 1000000000.0; 256 | 257 | #elif defined(__MACH__) && defined(__APPLE__) 258 | /* OSX. ----------------------------------------------------- */ 259 | static double timeConvert = 0.0; 260 | if (timeConvert == 0.0) { 261 | mach_timebase_info_data_t timeBase; 262 | (void)mach_timebase_info(&timeBase); 263 | timeConvert = (double)timeBase.numer / (double)timeBase.denom / 1000000000.0; 264 | } 265 | return (double)mach_absolute_time() * timeConvert; 266 | 267 | #elif defined(_POSIX_VERSION) 268 | /* POSIX. --------------------------------------------------- */ 269 | struct timeval tm; 270 | #if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) 271 | { 272 | struct timespec ts; 273 | #if defined(CLOCK_MONOTONIC_PRECISE) 274 | /* BSD. --------------------------------------------- */ 275 | const clockid_t id = CLOCK_MONOTONIC_PRECISE; 276 | #elif defined(CLOCK_MONOTONIC_RAW) 277 | /* Linux. ------------------------------------------- */ 278 | const clockid_t id = CLOCK_MONOTONIC_RAW; 279 | #elif defined(CLOCK_HIGHRES) 280 | /* Solaris. ----------------------------------------- */ 281 | const clockid_t id = CLOCK_HIGHRES; 282 | #elif defined(CLOCK_MONOTONIC) 283 | /* AIX, BSD, Linux, POSIX, Solaris. ----------------- */ 284 | const clockid_t id = CLOCK_MONOTONIC; 285 | #elif defined(CLOCK_REALTIME) 286 | /* AIX, BSD, HP-UX, Linux, POSIX. ------------------- */ 287 | const clockid_t id = CLOCK_REALTIME; 288 | #else 289 | const clockid_t id = (clockid_t)-1; /* Unknown. */ 290 | #endif /* CLOCK_* */ 291 | if (id != (clockid_t)-1 && clock_gettime(id, &ts) != -1) 292 | return (double)ts.tv_sec + (double)ts.tv_nsec / 1000000000.0; 293 | /* Fall thru. */ 294 | } 295 | #endif /* _POSIX_TIMERS */ 296 | 297 | /* AIX, BSD, Cygwin, HP-UX, Linux, OSX, POSIX, Solaris. ----- */ 298 | gettimeofday(&tm, NULL); 299 | return (double)tm.tv_sec + (double)tm.tv_usec / 1000000.0; 300 | #else 301 | return -1.0; /* Failed. */ 302 | #endif 303 | } 304 | 305 | /** 306 | * Returns the amount of CPU time used by the current process, 307 | * in seconds, or -1.0 if an error occurred. 308 | */ 309 | static double mu_timer_cpu(void) { 310 | #if defined(_WIN32) 311 | /* Windows -------------------------------------------------- */ 312 | FILETIME createTime; 313 | FILETIME exitTime; 314 | FILETIME kernelTime; 315 | FILETIME userTime; 316 | 317 | /* This approach has a resolution of 1/64 second. 
Unfortunately, Windows' API does not offer 318 | * better */ 319 | if (GetProcessTimes(GetCurrentProcess(), &createTime, &exitTime, &kernelTime, &userTime) != 0) { 320 | ULARGE_INTEGER userSystemTime; 321 | memcpy(&userSystemTime, &userTime, sizeof(ULARGE_INTEGER)); 322 | return (double)userSystemTime.QuadPart / 10000000.0; 323 | } 324 | 325 | #elif defined(__unix__) || defined(__unix) || defined(unix) || \ 326 | (defined(__APPLE__) && defined(__MACH__)) 327 | /* AIX, BSD, Cygwin, HP-UX, Linux, OSX, and Solaris --------- */ 328 | 329 | #if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) 330 | /* Prefer high-res POSIX timers, when available. */ 331 | { 332 | clockid_t id; 333 | struct timespec ts; 334 | #if _POSIX_CPUTIME > 0 335 | /* Clock ids vary by OS. Query the id, if possible. */ 336 | if (clock_getcpuclockid(0, &id) == -1) 337 | #endif 338 | #if defined(CLOCK_PROCESS_CPUTIME_ID) 339 | /* Use known clock id for AIX, Linux, or Solaris. */ 340 | id = CLOCK_PROCESS_CPUTIME_ID; 341 | #elif defined(CLOCK_VIRTUAL) 342 | /* Use known clock id for BSD or HP-UX. */ 343 | id = CLOCK_VIRTUAL; 344 | #else 345 | id = (clockid_t)-1; 346 | #endif 347 | if (id != (clockid_t)-1 && clock_gettime(id, &ts) != -1) 348 | return (double)ts.tv_sec + (double)ts.tv_nsec / 1000000000.0; 349 | } 350 | #endif 351 | 352 | #if defined(RUSAGE_SELF) 353 | { 354 | struct rusage rusage; 355 | if (getrusage(RUSAGE_SELF, &rusage) != -1) 356 | return (double)rusage.ru_utime.tv_sec + (double)rusage.ru_utime.tv_usec / 1000000.0; 357 | } 358 | #endif 359 | 360 | #if defined(_SC_CLK_TCK) 361 | { 362 | const double ticks = (double)sysconf(_SC_CLK_TCK); 363 | struct tms tms; 364 | if (times(&tms) != (clock_t)-1) 365 | return (double)tms.tms_utime / ticks; 366 | } 367 | #endif 368 | 369 | #if defined(CLOCKS_PER_SEC) 370 | { 371 | clock_t cl = clock(); 372 | if (cl != (clock_t)-1) 373 | return (double)cl / (double)CLOCKS_PER_SEC; 374 | } 375 | #endif 376 | 377 | #endif 378 | 379 | return -1; /* Failed. */ 380 | } 381 | 382 | #ifdef __cplusplus 383 | } 384 | #endif 385 | 386 | #endif /* MINUNIT_MINUNIT_H */ 387 | // clang-format on 388 | -------------------------------------------------------------------------------- /tests/unit/td_test.c: -------------------------------------------------------------------------------- 1 | /** 2 | * td_test.c 3 | * Written by Filipe Oliveira and released to the public domain, 4 | * as explained at http://creativecommons.org/publicdomain/zero/1.0/ 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include "tdigest.h" 14 | 15 | #include "minunit.h" 16 | 17 | #define STREAM_SIZE 1000000 18 | 19 | #define MAX(x, y) (((x) > (y)) ? (x) : (y)) 20 | #define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) 21 | 22 | static double randfrom(double M, double N) { return M + (rand() / (RAND_MAX / (N - M))); } 23 | 24 | int tests_run = 0; 25 | 26 | td_histogram_t *histogram = NULL; 27 | 28 | static void load_histograms(void) { 29 | const int compression = 500; 30 | 31 | int i; 32 | if (histogram) { 33 | td_free(histogram); 34 | } 35 | histogram = td_new(compression); 36 | 37 | for (i = 0; i < STREAM_SIZE; i++) { 38 | mu_assert(td_add(histogram, randfrom(0, 10), 1) == 0, "Insertion"); 39 | } 40 | } 41 | 42 | MU_TEST(test_basic) { 43 | td_histogram_t *t = td_new(10); 44 | mu_assert(t != NULL, "created_histogram"); 45 | mu_assert_long_eq(0, t->unmerged_weight); 46 | mu_assert_long_eq(0, t->merged_weight); 47 | mu_assert(td_add(t, 0.0, 1) == 0, "Insertion"); 48 | // with one data point, all quantiles lead to Rome 49 | mu_assert_double_eq(0.0, td_quantile(t, .0)); 50 | mu_assert_double_eq(0.0, td_quantile(t, 0.5)); 51 | mu_assert_double_eq(0.0, td_quantile(t, 1)); 52 | mu_assert(td_add(t, 10.0, 1) == 0, "Insertion"); 53 | mu_assert_double_eq(0.0, td_min(t)); 54 | mu_assert_double_eq(10.0, td_max(t)); 55 | mu_assert_double_eq(2.0, td_size(t)); 56 | mu_assert(t != NULL, "Failed to allocate hdr_histogram"); 57 | mu_assert_double_eq(10.0, t->compression); 58 | mu_assert(td_compression(t) < t->cap, "False: buffer size < compression"); 59 | mu_assert_double_eq(0.0, td_quantile(t, .0)); 60 | mu_assert_double_eq(0.0, td_quantile(t, .1)); 61 | mu_assert_double_eq(10.0, td_quantile(t, .99)); 62 | td_reset(t); 63 | td_reset(NULL); 64 | td_free(t); 65 | } 66 | 67 | MU_TEST(test_overflow) { 68 | td_histogram_t *t = td_new(10); 69 | td_histogram_t *t2 = td_new(10); 70 | mu_assert(t != NULL, "created_histogram"); 71 | mu_assert(t2 != NULL, "created_histogram"); 72 | mu_assert_long_eq(0, t->unmerged_weight); 73 | mu_assert_long_eq(0, t->merged_weight); 74 | mu_assert_long_eq(0, t2->unmerged_weight); 75 | mu_assert_long_eq(0, t2->merged_weight); 76 | mu_assert(td_add(t, 5.0, __LONG_LONG_MAX__ - 1) == 0, "Insertion of __LONG_LONG_MAX__"); 77 | mu_assert(td_add(t, 5.0, __LONG_LONG_MAX__ - 1) == EDOM, 78 | "second insertion of __LONG_LONG_MAX__ should overflow"); 79 | mu_assert_long_eq(__LONG_LONG_MAX__ - 1, t->merged_weight + t->unmerged_weight); 80 | // overflow on merge 81 | mu_assert(td_add(t2, 5.0, __LONG_LONG_MAX__ - 1) == 0, "First insertion of __LONG_LONG_MAX__"); 82 | mu_assert_long_eq(__LONG_LONG_MAX__ - 1, t2->merged_weight + t2->unmerged_weight); 83 | mu_assert(td_add(t2, 1.0, 1) == 0, "Insertion of 1"); 84 | mu_assert(td_add(t2, 5.0, __LONG_LONG_MAX__ - 1) == EDOM, 85 | "Second insertion of __LONG_LONG_MAX__"); 86 | td_free(t); 87 | td_free(t2); 88 | } 89 | 90 | MU_TEST(test_overflow_merge) { 91 | td_histogram_t *x = td_new(1000); 92 | td_histogram_t *y = td_new(1000); 93 | td_histogram_t *z = td_new(10); 94 | mu_assert(x != NULL, "created_histogram"); 95 | mu_assert(y != NULL, "created_histogram"); 96 | mu_assert(z != NULL, "created_histogram"); 97 | mu_assert_long_eq(0, x->unmerged_weight); 98 | mu_assert_long_eq(0, x->merged_weight); 99 | mu_assert_long_eq(0, y->unmerged_weight); 100 | mu_assert_long_eq(0, y->merged_weight); 101 | mu_assert(td_add(x, 1, 1) == 0, "Insertion of 1"); 102 | mu_assert(td_add(x, 2, 1) == 0, "Insertion of 2"); 103 | mu_assert(td_add(x, 3, 1) == 0, "Insertion of 3"); 104 | mu_assert(td_add(x, 4, 1) == 0, "Insertion of 4"); 105 | mu_assert(td_add(x, 5, 1) == 0, "Insertion of 5"); 106 | mu_assert(td_add(x, 6, 1) == 0, "Insertion of 6"); 107 | mu_assert(td_add(x, 7, 1) == 
0, "Insertion of 7"); 108 | mu_assert(td_add(x, 8, 1) == 0, "Insertion of 8"); 109 | mu_assert(td_add(x, 9, 1) == 0, "Insertion of 9"); 110 | mu_assert(td_add(x, 10, 1) == 0, "Insertion of 10"); 111 | mu_assert(td_add(x, 11, 1) == 0, "Insertion of 11"); 112 | mu_assert(td_add(x, 12, 1) == 0, "Insertion of 12"); 113 | mu_assert(td_add(x, 13, 1) == 0, "Insertion of 13"); 114 | mu_assert(td_add(x, 14, 1) == 0, "Insertion of 14"); 115 | mu_assert(td_add(x, 15, 1) == 0, "Insertion of 15"); 116 | mu_assert(td_add(x, 16, 1) == 0, "Insertion of 16"); 117 | mu_assert(td_add(x, 17, 1) == 0, "Insertion of 17"); 118 | mu_assert(td_add(x, 18, 1) == 0, "Insertion of 18"); 119 | mu_assert(td_add(x, 19, 1) == 0, "Insertion of 19"); 120 | mu_assert(td_add(x, 20, 1) == 0, "Insertion of 20"); 121 | mu_assert(td_add(y, 101, 1) == 0, "Insertion of 101"); 122 | mu_assert(td_add(y, 102, 1) == 0, "Insertion of 102"); 123 | mu_assert(td_add(y, 103, 1) == 0, "Insertion of 103"); 124 | mu_assert(td_add(y, 104, 1) == 0, "Insertion of 104"); 125 | mu_assert(td_add(y, 105, 1) == 0, "Insertion of 105"); 126 | mu_assert(td_add(y, 106, 1) == 0, "Insertion of 106"); 127 | mu_assert(td_add(y, 107, 1) == 0, "Insertion of 107"); 128 | mu_assert(td_add(y, 108, 1) == 0, "Insertion of 108"); 129 | mu_assert(td_add(y, 109, 1) == 0, "Insertion of 109"); 130 | mu_assert(td_add(y, 110, 1) == 0, "Insertion of 110"); 131 | mu_assert(td_add(y, 111, 1) == 0, "Insertion of 111"); 132 | mu_assert(td_add(y, 112, 1) == 0, "Insertion of 112"); 133 | mu_assert(td_add(y, 113, 1) == 0, "Insertion of 113"); 134 | mu_assert(td_add(y, 114, 1) == 0, "Insertion of 114"); 135 | mu_assert(td_add(y, 115, 1) == 0, "Insertion of 115"); 136 | mu_assert(td_add(y, 116, 1) == 0, "Insertion of 116"); 137 | mu_assert(td_add(y, 117, 1) == 0, "Insertion of 117"); 138 | mu_assert(td_add(y, 118, 1) == 0, "Insertion of 118"); 139 | mu_assert(td_add(y, 119, 1) == 0, "Insertion of 119"); 140 | mu_assert(td_add(y, 120, 1) == 0, "Insertion of 120"); 141 | 142 | for (size_t i = 0; i < 10; i++) { 143 | td_histogram_t *zz = td_new(10); 144 | int self_merge_res = 0; 145 | mu_assert(td_merge(zz, x) == 0, "1st merge x into z"); 146 | mu_assert(td_merge(zz, y) == 0, "1st merge y into z"); 147 | mu_assert(td_merge(zz, x) == 0, "2nd merge x into z"); 148 | mu_assert(td_merge(zz, y) == 0, "2nd merge y into z"); 149 | mu_assert(td_merge(zz, x) == 0, "3rd merge x into z"); 150 | for (size_t j = 0; j < 5; j++) { 151 | self_merge_res = td_merge(zz, z); 152 | } 153 | td_free(z); 154 | z = zz; 155 | mu_assert((z->merged_weight + z->unmerged_weight) > 0, "assert z contains weight"); 156 | if (self_merge_res == EDOM) 157 | break; 158 | } 159 | 160 | td_free(x); 161 | td_free(y); 162 | td_free(z); 163 | } 164 | 165 | MU_TEST(test_quantile_interpolations) { 166 | td_histogram_t *t = td_new(10); 167 | mu_assert(t != NULL, "created_histogram"); 168 | mu_assert_long_eq(0, t->unmerged_weight); 169 | mu_assert_long_eq(0, t->merged_weight); 170 | mu_assert(td_add(t, 5.0, 2) == 0, "add"); 171 | mu_assert_long_eq(2, t->unmerged_weight); 172 | // with one data point, all quantiles lead to Rome 173 | mu_assert_double_eq(5.0, td_quantile(t, .0)); 174 | mu_assert_double_eq(5.0, td_quantile(t, 0.5)); 175 | mu_assert_double_eq(5.0, td_quantile(t, 1.0)); 176 | mu_assert(td_compress(t) == 0, "compress"); 177 | mu_assert_long_eq(0, t->unmerged_weight); 178 | mu_assert_long_eq(2, t->merged_weight); 179 | mu_assert(td_add(t, 100.0, 1) == 0, "Insertion"); 180 | // we know that there are at least two 
centroids now 181 | td_free(t); 182 | } 183 | 184 | MU_TEST(test_trimmed_mean_simple) { 185 | /* Used numpy to check results validity 186 | import numpy as np 187 | from scipy import stats 188 | x = [5,5,5,10,15,15,15] 189 | np.mean(x) 190 | 10.0 191 | stats.trim_mean(x, 0.0) 192 | 10.0 193 | */ 194 | td_histogram_t *t = td_new(100); 195 | mu_assert(t != NULL, "created_histogram"); 196 | mu_assert_long_eq(0, t->unmerged_weight); 197 | mu_assert_long_eq(0, t->merged_weight); 198 | // stats.trim_mean([], 0.49) 199 | // nan 200 | mu_assert_double_eq(NAN, td_trimmed_mean_symmetric(t, .49)); 201 | mu_assert_double_eq(NAN, td_trimmed_mean(t, 0.49, 0.51)); 202 | mu_assert(td_add(t, 5.0, 1) == 0, "Insertion"); 203 | // with one data point, all quantiles lead to Rome 204 | // stats.trim_mean(x, 0.49) 205 | mu_assert_double_eq(5, td_trimmed_mean_symmetric(t, .49)); 206 | mu_assert_double_eq(5, td_trimmed_mean(t, 0.49, 0.51)); 207 | // stats.trim_mean(x, 0.1) 208 | // 5.0 209 | mu_assert_double_eq(5, td_trimmed_mean_symmetric(t, .1)); 210 | mu_assert_double_eq(5, td_trimmed_mean(t, 0.1, 0.9)); 211 | // 5.0 212 | // stats.trim_mean(x, 0.0) 213 | mu_assert_double_eq(5, td_trimmed_mean_symmetric(t, .0)); 214 | mu_assert_double_eq(5, td_trimmed_mean(t, 0.0, 1)); 215 | // 5.0 216 | mu_assert(td_add(t, 5.0, 2) == 0, "Insertion"); 217 | mu_assert_double_eq(5, td_trimmed_mean_symmetric(t, .0)); 218 | mu_assert_double_eq(5, td_trimmed_mean(t, 0.0, 1)); 219 | mu_assert(td_add(t, 10.0, 1) == 0, "Insertion"); 220 | mu_assert(td_add(t, 15.0, 3) == 0, "Insertion"); 221 | // stats.trim_mean(x, 0.0) 222 | // 10.0 223 | mu_assert_double_eq(10, td_trimmed_mean_symmetric(t, .0)); 224 | mu_assert_double_eq(10, td_trimmed_mean(t, 0.0, 1)); 225 | // trimmed mean and mean should lead to 10 in here 226 | // stats.trim_mean(x, 0.1) 227 | // 10.0 228 | mu_assert_double_eq(10, td_trimmed_mean_symmetric(t, .1)); 229 | mu_assert_double_eq(10, td_trimmed_mean(t, .1, .9)); 230 | // trimmed mean and mean should lead to 10 in here 231 | // stats.trim_mean(x, 0.25) 232 | // 10.0 233 | mu_assert_double_eq(10, td_trimmed_mean_symmetric(t, .25)); 234 | mu_assert_double_eq(10, td_trimmed_mean(t, .25, .75)); 235 | td_free(t); 236 | } 237 | 238 | MU_TEST(test_trimmed_mean_complex) { 239 | /* Used numpy to check results validity 240 | import numpy as np 241 | from scipy import stats 242 | x = np.arange(20) 243 | stats.trim_mean(x, 0.1) 244 | 9.5 245 | */ 246 | td_histogram_t *t = td_new(100); 247 | mu_assert(t != NULL, "created_histogram"); 248 | mu_assert_long_eq(0, t->unmerged_weight); 249 | mu_assert_long_eq(0, t->merged_weight); 250 | for (int i = 0; i < 20; ++i) { 251 | mu_assert(td_add(t, (double)i, 1) == 0, "Insertion"); 252 | } 253 | // trimmed mean and mean should lead to 9.5 in here 254 | // stats.trim_mean(x, 0.25) 255 | // 9.5 256 | mu_assert_double_eq(9.5, td_trimmed_mean_symmetric(t, .25)); 257 | mu_assert_double_eq(9.5, td_trimmed_mean(t, .25, .75)); 258 | td_free(t); 259 | t = td_new(100); 260 | mu_assert(t != NULL, "created_histogram"); 261 | mu_assert_long_eq(0, t->unmerged_weight); 262 | mu_assert_long_eq(0, t->merged_weight); 263 | for (int i = 0; i < 200; ++i) { 264 | mu_assert(td_add(t, (double)i, 1) == 0, "Insertion"); 265 | } 266 | // trimmed mean and mean should lead to 99.5 in here 267 | // x = np.arange(200) 268 | // stats.trim_mean(x, 0.25) 269 | // 99.5 270 | mu_assert_double_eq_epsilon(99.5, td_trimmed_mean_symmetric(t, .25), 0.1); 271 | mu_assert_double_eq_epsilon(99.5, td_trimmed_mean(t, .25, .75), 0.1); 272 
| 273 | // Non symmetric trimmed means 274 | // trim_mean(x, 0.1, 0.75) 275 | // 84.5 276 | mu_assert_double_eq_epsilon(84.5, td_trimmed_mean(t, .1, 0.75), 0.1); 277 | // trim_mean(x, 0.0, 0.75) 278 | // 74.5 279 | mu_assert_double_eq_epsilon(74.5, td_trimmed_mean(t, .0, 0.75), 0.1); 280 | 281 | td_free(t); 282 | // x = [1,2,3,4,5,6,7,8,9,10,100,100,100] 283 | t = td_new(100); 284 | for (int i = 1; i < 11; ++i) { 285 | mu_assert(td_add(t, (double)i, 1) == 0, "Insertion"); 286 | } 287 | mu_assert(td_add(t, 100, 3) == 0, "Insertion"); 288 | // stats.trim_mean(x, 0.1) 289 | // 23.09090909090909 290 | mu_assert_double_eq_epsilon(23.09090909090909, td_trimmed_mean_symmetric(t, .1), 0.01); 291 | mu_assert_double_eq_epsilon(23.09090909090909, td_trimmed_mean(t, .1, .9), 0.01); 292 | // stats.trim_mean(x, 0.25) 293 | // 7.0 294 | mu_assert_double_eq_epsilon(7.0, td_trimmed_mean_symmetric(t, .25), 0.01); 295 | mu_assert_double_eq_epsilon(7.0, td_trimmed_mean(t, .25, .75), 0.01); 296 | td_free(t); 297 | } 298 | 299 | MU_TEST(test_compress_small) { 300 | td_histogram_t *t = td_new(100); 301 | mu_assert(t != NULL, "created_histogram"); 302 | mu_assert(td_add(t, 1.0, 1) == 0, "Insertion"); 303 | mu_assert_double_eq(1.0, td_min(t)); 304 | mu_assert_double_eq(1.0, td_max(t)); 305 | mu_assert_double_eq(1.0, td_size(t)); 306 | mu_assert_int_eq(1, td_centroid_count(t)); 307 | mu_assert_long_eq(0, t->total_compressions); 308 | mu_assert_double_eq(1.0, td_centroids_mean_at(t, 0)); 309 | mu_assert_long_eq(1, td_centroids_weight_at(t, 0)); 310 | mu_assert_int_eq(1, t->unmerged_nodes); 311 | mu_assert_int_eq(0, t->merged_nodes); 312 | mu_assert(td_compress(t) == 0, "compress"); 313 | mu_assert_long_eq(1, t->unmerged_nodes + t->merged_nodes); 314 | mu_assert_double_eq(1.0, td_centroids_mean_at(t, 0)); 315 | mu_assert_long_eq(1, td_centroids_weight_at(t, 0)); 316 | mu_assert_double_eq(1.0, td_quantile(t, 0.001)); 317 | mu_assert_double_eq(1.0, td_quantile(t, 0.01)); 318 | mu_assert_double_eq(1.0, td_quantile(t, 0.5)); 319 | mu_assert_double_eq(1.0, td_quantile(t, 0.99)); 320 | mu_assert_double_eq(1.0, td_quantile(t, 0.999)); 321 | td_free(t); 322 | } 323 | 324 | MU_TEST(test_compress_large) { 325 | td_histogram_t *t = td_new(100); 326 | mu_assert(t != NULL, "created_histogram"); 327 | for (int i = 1; i <= 1000; ++i) { 328 | mu_assert(td_add(t, (double)i, 1) == 0, "Insertion"); 329 | } 330 | 331 | mu_assert_double_eq(1.0, td_min(t)); 332 | mu_assert_double_eq(1000.0, td_max(t)); 333 | mu_assert_double_eq(1000.0, td_size(t)); 334 | // TODO: add this test cases 335 | // EXPECT_EQ(500500, digest.sum()); 336 | // EXPECT_EQ(500.5, digest.mean()); 337 | // mu_assert_double_eq(1.5, td_quantile(t, 0.001)); 338 | mu_assert_double_eq(10.5, td_quantile(t, 0.01)); 339 | // mu_assert_double_eq_epsilon(500.25, td_quantile(t, 0.5), 0.5); 340 | // TODO: swap this one by the bellow 341 | // mu_assert_double_eq(990.25, td_quantile(t, 0.99)); 342 | mu_assert_double_eq_epsilon(990.25, td_quantile(t, 0.99), 0.5); 343 | // mu_assert_double_eq(999.5, td_quantile(t, 0.999)); 344 | td_free(t); 345 | } 346 | 347 | MU_TEST(test_negative_values) { 348 | td_histogram_t *t = td_new(1000); 349 | mu_assert(t != NULL, "created_histogram"); 350 | for (int i = 1; i <= 100; ++i) { 351 | mu_assert(td_add(t, (double)i, 1) == 0, "Insertion"); 352 | mu_assert(td_add(t, -(double)i, 1) == 0, "Insertion"); 353 | } 354 | mu_assert_double_eq(-100.0, td_min(t)); 355 | mu_assert_double_eq(100.0, td_max(t)); 356 | mu_assert_double_eq(200.0, td_size(t)); 357 | 
mu_assert_double_eq(-100, td_quantile(t, 0.0)); 358 | mu_assert_double_eq(-100, td_quantile(t, 0.001)); 359 | // TODO: fix my epsilon 360 | mu_assert_double_eq_epsilon(-98.5, td_quantile(t, 0.01), 0.75); 361 | mu_assert_double_eq_epsilon(98.5, td_quantile(t, 0.99), 0.75); 362 | mu_assert_double_eq(100, td_quantile(t, 0.999)); 363 | mu_assert_double_eq(100, td_quantile(t, 1)); 364 | td_free(t); 365 | } 366 | 367 | MU_TEST(test_negative_values_merge) { 368 | td_histogram_t *d1 = td_new(100); 369 | td_histogram_t *d2 = td_new(100); 370 | mu_assert(d1 != NULL, "created_histogram"); 371 | mu_assert(d2 != NULL, "created_histogram"); 372 | for (int i = 1; i <= 100; ++i) { 373 | mu_assert(td_add(d1, (double)i, 1) == 0, "Insertion"); 374 | mu_assert(td_add(d2, -(double)i, 1) == 0, "Insertion"); 375 | } 376 | td_merge(d1, d2); 377 | mu_assert_double_eq(-100.0, td_min(d1)); 378 | mu_assert_double_eq(100.0, td_max(d1)); 379 | mu_assert_double_eq(200.0, td_size(d1)); 380 | mu_assert_double_eq(-100, td_quantile(d1, 0.0)); 381 | mu_assert_double_eq(-100, td_quantile(d1, 0.001)); 382 | // TODO: fix my epsilon 383 | mu_assert_double_eq_epsilon(-98.5, td_quantile(d1, 0.01), 0.75); 384 | mu_assert_double_eq_epsilon(98.5, td_quantile(d1, 0.99), 0.75); 385 | mu_assert_double_eq(100, td_quantile(d1, 0.999)); 386 | mu_assert_double_eq(100, td_quantile(d1, 1)); 387 | td_free(d1); 388 | td_free(d2); 389 | } 390 | 391 | MU_TEST(test_large_outlier_test) { 392 | td_histogram_t *t = td_new(100); 393 | mu_assert(t != NULL, "created_histogram"); 394 | for (int i = 1; i <= 19; ++i) { 395 | mu_assert(td_add(t, (double)i, 1) == 0, "Insertion"); 396 | } 397 | mu_assert(td_add(t, 1000000, 1) == 0, "Insertion"); 398 | mu_assert(td_quantile(t, 0.5) < td_quantile(t, 0.9), 399 | "False: td_quantile(t, 0.5) < td_quantile(t, 0.9)"); 400 | td_free(t); 401 | } 402 | 403 | MU_TEST(test_nans) { 404 | td_histogram_t *t = td_new(1000); 405 | mu_assert(isnan(td_quantile(t, 0)), "empty value at 0"); 406 | mu_assert(isnan(td_quantile(t, 0.5)), "empty value at .5"); 407 | mu_assert(isnan(td_quantile(t, 1)), "empty value at 1"); 408 | mu_assert(isnan(td_centroids_mean_at(t, 1)), "td_centroids_mean_at on pos > h->merged_nodes"); 409 | mu_assert(isnan(td_centroids_mean_at(t, -1)), "td_centroids_mean_at on pos < 0"); 410 | mu_assert(td_add(t, 1, 1) == 0, "Insertion"); 411 | mu_assert(isnan(td_quantile(t, -.1)), "value at -0.1"); 412 | mu_assert(isnan(td_quantile(t, 1.1)), "value at 1.1"); 413 | td_free(t); 414 | } 415 | 416 | MU_TEST(test_two_interp) { 417 | td_histogram_t *t = td_new(1000); 418 | mu_assert(td_add(t, 1, 1) == 0, "Insertion"); 419 | mu_assert(td_add(t, 10, 1) == 0, "Insertion"); 420 | mu_assert(isfinite(td_quantile(t, .9)), "test_two_interp: value at .9"); 421 | td_reset(t); 422 | // if the left centroid has more than one sample, we still know 423 | // that one sample occurred at min so we can do some interpolation 424 | mu_assert(td_add(t, 1, 10) == 0, "Insertion"); 425 | mu_assert(td_add(t, 10, 1) == 0, "Insertion"); 426 | mu_assert_double_eq(1.0, td_quantile(t, .1)); 427 | td_reset(t); 428 | // if the right-most centroid has more than one sample, we still know 429 | // that one sample occurred at max so we can do some interpolation 430 | mu_assert(td_add(t, 1, 1) == 0, "Insertion"); 431 | mu_assert(td_add(t, 10, 10) == 0, "Insertion"); 432 | mu_assert_double_eq(10.0, td_quantile(t, .9)); 433 | td_reset(t); 434 | // in between extremes we interpolate between centroids 435 | mu_assert(td_add(t, 1, 1) == 0, "Insertion"); 436 | 
mu_assert(td_add(t, 5, 1) == 0, "Insertion"); 437 | mu_assert(td_add(t, 10, 1) == 0, "Insertion"); 438 | // centroids i and i+1 bracket our current point 439 | // check for unit weight 440 | // within the singleton's sphere 441 | // left 442 | mu_assert_double_eq(5.0, td_quantile(t, .5)); 443 | td_reset(t); 444 | // in between extremes we interpolate between centroids 445 | mu_assert(td_add(t, 1, 1) == 0, "Insertion"); // q0 446 | mu_assert(td_add(t, 4, 1) == 0, "Insertion"); // q20 447 | mu_assert(td_add(t, 8, 1) == 0, "Insertion"); // q40 448 | mu_assert(td_add(t, 12, 1) == 0, "Insertion"); // q60 449 | mu_assert(td_add(t, 16, 1) == 0, "Insertion"); // q80 450 | mu_assert(td_add(t, 20, 1) == 0, "Insertion"); // q100 451 | // centroids i and i+1 bracket our current point 452 | // check for unit weight 453 | // within the singleton's sphere 454 | // TODO: check for right 455 | // mu_assert_double_eq(4.0, td_quantile(t, .20) ); 456 | // mu_assert_double_eq(8.0, td_quantile(t, .40) ); 457 | // mu_assert_double_eq(12.0, td_quantile(t, .60) ); 458 | // mu_assert_double_eq(7.0, td_quantile(t, .70) ); 459 | // mu_assert_double_eq(8.0, td_quantile(t, .75) ); 460 | td_free(t); 461 | } 462 | 463 | MU_TEST(test_cdf) { 464 | td_histogram_t *t = td_new(100); 465 | mu_assert(isnan(td_cdf(t, 1.1)), "no data to examine"); 466 | // interpolate if somehow we have weight > 0 and max != min 467 | mu_assert(td_add(t, 1, 1) == 0, "Insertion"); 468 | // bellow lower bound 469 | mu_assert_double_eq(0, td_cdf(t, 0)); 470 | // exactly one centroid, should have max==min 471 | // min and max are too close together to do any viable interpolation 472 | mu_assert_double_eq(0.5, td_cdf(t, 1)); 473 | // above upper bound 474 | mu_assert_double_eq(1.0, td_cdf(t, 2)); 475 | mu_assert(td_add(t, 10, 1) == 0, "Insertion"); 476 | mu_assert_double_eq(.25, td_cdf(t, 1)); 477 | mu_assert_double_eq(.5, td_cdf(t, 5.5)); 478 | // // TODO: fix this 479 | // mu_assert_double_eq(1,td_cdf(t, 10)); 480 | td_free(t); 481 | } 482 | 483 | MU_TEST(test_td_size) { 484 | load_histograms(); 485 | mu_assert(td_size(histogram) == STREAM_SIZE, "td_size(histogram) != STREAM_SIZE"); 486 | } 487 | 488 | MU_TEST(test_td_max) { 489 | load_histograms(); 490 | mu_assert_double_eq_epsilon(10.0, td_max(histogram), 0.001); 491 | } 492 | 493 | MU_TEST(test_td_min) { 494 | load_histograms(); 495 | mu_assert_double_eq_epsilon(0.0, td_min(histogram), 0.001); 496 | } 497 | 498 | MU_TEST(test_td_init) { 499 | td_histogram_t *t; 500 | // overflow detected 501 | // mu_assert_long_eq(1, td_init(10000000000000000, &t)); 502 | t = NULL; 503 | // bellow overflow 504 | mu_assert_long_eq(0, td_init(1000, &t)); 505 | td_free(t); 506 | 507 | mu_assert_long_eq(0, td_init(1000000, &t)); 508 | td_free(t); 509 | 510 | mu_assert_long_eq(0, td_init(100000000, &t)); 511 | td_free(t); 512 | } 513 | 514 | MU_TEST(test_quantiles) { 515 | load_histograms(); 516 | mu_assert_double_eq_epsilon(0.0, td_quantile(histogram, 0.0), 0.001); 517 | mu_assert_double_eq_epsilon(1.0, td_quantile(histogram, 0.1), 0.02); 518 | mu_assert_double_eq_epsilon(2.0, td_quantile(histogram, 0.2), 0.02); 519 | mu_assert_double_eq_epsilon(3.0, td_quantile(histogram, 0.3), 0.03); 520 | mu_assert_double_eq_epsilon(4.0, td_quantile(histogram, 0.4), 0.04); 521 | mu_assert_double_eq_epsilon(5.0, td_quantile(histogram, 0.5), 0.05); 522 | mu_assert_double_eq_epsilon(6.0, td_quantile(histogram, 0.6), 0.04); 523 | mu_assert_double_eq_epsilon(7.0, td_quantile(histogram, 0.7), 0.03); 524 | 
mu_assert_double_eq_epsilon(8.0, td_quantile(histogram, 0.8), 0.02); 525 | mu_assert_double_eq_epsilon(9.0, td_quantile(histogram, 0.9), 0.02); 526 | mu_assert_double_eq_epsilon(9.99, td_quantile(histogram, 0.999), 0.01); 527 | mu_assert_double_eq_epsilon(9.999, td_quantile(histogram, 0.9999), 0.01); 528 | mu_assert_double_eq_epsilon(9.9999, td_quantile(histogram, 0.99999), 0.01); 529 | mu_assert_double_eq_epsilon(10.0, td_quantile(histogram, 1), 0.001); 530 | } 531 | 532 | MU_TEST(test_quantiles_multiple) { 533 | load_histograms(); 534 | const size_t quantiles_arr_size = 14; 535 | double values[14] = {0.0}; 536 | double percentiles[14] = {0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 537 | 0.7, 0.8, 0.9, 0.999, 0.9999, 0.99999, 1.0}; 538 | mu_assert(td_quantiles(histogram, NULL, values, quantiles_arr_size) == EINVAL, 539 | "td_quantiles on NULL percentiles should return EINVAL"); 540 | mu_assert(td_quantiles(histogram, percentiles, NULL, quantiles_arr_size) == EINVAL, 541 | "td_quantiles on NULL values should return EINVAL"); 542 | mu_assert(td_quantiles(histogram, percentiles, values, quantiles_arr_size) == 0, 543 | "td_quantiles return should be 0"); 544 | mu_assert_double_eq_epsilon(0.0, values[0], 0.001); 545 | mu_assert_double_eq_epsilon(1.0, values[1], 0.02); 546 | mu_assert_double_eq_epsilon(2.0, values[2], 0.02); 547 | mu_assert_double_eq_epsilon(3.0, values[3], 0.03); 548 | mu_assert_double_eq_epsilon(4.0, values[4], 0.04); 549 | mu_assert_double_eq_epsilon(5.0, values[5], 0.05); 550 | mu_assert_double_eq_epsilon(6.0, values[6], 0.04); 551 | mu_assert_double_eq_epsilon(7.0, values[7], 0.03); 552 | mu_assert_double_eq_epsilon(8.0, values[8], 0.02); 553 | mu_assert_double_eq_epsilon(9.0, values[9], 0.02); 554 | mu_assert_double_eq_epsilon(9.99, values[10], 0.01); 555 | mu_assert_double_eq_epsilon(9.999, values[11], 0.01); 556 | mu_assert_double_eq_epsilon(9.9999, values[12], 0.01); 557 | mu_assert_double_eq_epsilon(10.0, values[13], 0.001); 558 | td_free(histogram); 559 | td_histogram_t *t = td_new(100); 560 | mu_assert(td_quantiles(t, percentiles, values, quantiles_arr_size) == 0, 561 | "td_quantiles return should be 0"); 562 | for (int i = 0; i < quantiles_arr_size; ++i) { 563 | mu_assert(isnan(values[i]), "no data to examine"); 564 | } 565 | mu_assert(td_add(t, 1, 1) == 0, "Insertion"); 566 | // with one data point, all quantiles lead to Rome 567 | mu_assert(td_quantiles(t, percentiles, values, quantiles_arr_size) == 0, 568 | "td_quantiles return should be 0"); 569 | for (int i = 0; i < quantiles_arr_size; ++i) { 570 | mu_assert_double_eq_epsilon(1.0, values[i], 0.02); 571 | } 572 | // q should be in [0,1] 573 | double percentiles_nans[14] = {-10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 574 | 10.7, 10.8, 10.9, -0.999, -0.9999, -0.99999, -1.0}; 575 | mu_assert(td_quantiles(t, percentiles_nans, values, quantiles_arr_size) == 0, 576 | "td_quantiles return should be 0"); 577 | for (int i = 0; i < quantiles_arr_size; ++i) { 578 | mu_assert(isnan(values[i]), " q should be in [0,1]"); 579 | } 580 | td_free(t); 581 | } 582 | 583 | MU_TEST_SUITE(test_suite) { 584 | MU_RUN_TEST(test_basic); 585 | MU_RUN_TEST(test_td_init); 586 | MU_RUN_TEST(test_compress_small); 587 | MU_RUN_TEST(test_compress_large); 588 | MU_RUN_TEST(test_nans); 589 | MU_RUN_TEST(test_negative_values); 590 | MU_RUN_TEST(test_negative_values_merge); 591 | MU_RUN_TEST(test_large_outlier_test); 592 | MU_RUN_TEST(test_two_interp); 593 | MU_RUN_TEST(test_cdf); 594 | MU_RUN_TEST(test_td_size); 595 | MU_RUN_TEST(test_td_max); 596 | 
MU_RUN_TEST(test_td_min); 597 | MU_RUN_TEST(test_quantiles); 598 | MU_RUN_TEST(test_quantiles_multiple); 599 | MU_RUN_TEST(test_quantile_interpolations); 600 | MU_RUN_TEST(test_trimmed_mean_simple); 601 | MU_RUN_TEST(test_trimmed_mean_complex); 602 | MU_RUN_TEST(test_overflow); 603 | MU_RUN_TEST(test_overflow_merge); 604 | } 605 | 606 | int main(int argc, char *argv[]) { 607 | MU_RUN_SUITE(test_suite); 608 | MU_REPORT(); 609 | return MU_EXIT_CODE; 610 | } 611 | --------------------------------------------------------------------------------
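A minimal end-to-end sketch of the public API shown in src/tdigest.h (illustrative only, not a file in the repository; link against the tdigest library and libm):

#include <stdio.h>
#include "tdigest.h"

int main(void) {
    /* 100 is the commonly used compression value per the header docs. */
    td_histogram_t *h = td_new(100);
    if (h == NULL)
        return 1;

    /* Feed synthetic samples, each with unit weight. */
    for (int i = 0; i < 1000000; i++) {
        if (td_add(h, (double)(i % 1000), 1) != 0)
            break; /* EDOM: adding this weight would overflow the digest */
    }

    /* Optional: fold buffered nodes into merged centroids before querying. */
    td_compress(h);

    printf("p50=%f p99=%f cdf(500)=%f\n",
           td_quantile(h, 0.50), td_quantile(h, 0.99), td_cdf(h, 500.0));

    td_free(h);
    return 0;
}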