├── .gitattributes
├── .github
└── workflows
│ ├── bench.yaml
│ └── test.yaml
├── .gitignore
├── .typos.toml
├── Cargo.toml
├── LICENSE
├── README.md
├── assets
└── sample.png
├── examples
├── Cargo.toml
├── benches
│ ├── common.rs
│ ├── input.txt
│ ├── search-btree.rs
│ ├── search-ord.rs
│ ├── search-vec.rs
│ ├── tango-async.rs
│ ├── tango-faster.rs
│ ├── tango-slower.rs
│ └── test_funcs.rs
├── build.rs
├── criterion-ordsearch.sh
└── ordsearch.sh
├── notebooks
├── bootstrap.ipynb
├── empirical-observations.ipynb
├── exp.ipynb
└── ordsearch.ipynb
├── pair-test.gnuplot
├── render-plot.sh
├── rust-toolchain.toml
├── scripts
├── aws-bench.sh
├── aws-results.sh
├── calibrate.sh
├── criterion.sh
├── describe.py
├── linear-sampling-test.sh
├── ordsearch.sh
├── sensitivity-test.sh
└── tango.sh
├── tango-bench
├── Cargo.toml
├── benches
│ └── tango.rs
├── build.rs
└── src
│ ├── cli.rs
│ ├── dylib.rs
│ ├── lib.rs
│ ├── linux.rs
│ └── plot.gnuplot
└── test.sh
/.gitattributes:
--------------------------------------------------------------------------------
1 | notebooks/*.ipynb linguist-detectable=false
2 |
--------------------------------------------------------------------------------
/.github/workflows/bench.yaml:
--------------------------------------------------------------------------------
1 | name: Benchmarks
2 |
3 | on: workflow_dispatch
4 |
5 | jobs:
6 | bench:
7 | runs-on: ubuntu-22.04
8 | steps:
9 | - uses: actions/checkout@v3
10 |
11 | - name: Prepare Environment
12 | run: |
13 | rustup update nightly
14 | rustup default nightly
15 | cargo install cargo-export --version 0.2.0
16 |
17 | - uses: actions/cache@v3
18 | with:
19 | path: |
20 | ~/.cargo/bin/
21 | ~/.cargo/registry/index/
22 | ~/.cargo/registry/cache/
23 | ~/.cargo/git/db/
24 | ./target/
25 | ./baseline-branch/target/
26 | key: Bench/${{ runner.os }}
27 |
28 | - name: Building Benchmarks
29 | run: cargo export target/benchmarks -- bench --bench='search-*'
30 |
31 | - name: Run Benchmarks
32 | run: |
33 | set -eo pipefail
34 |
35 | mkdir -p target/dumps
36 | target/benchmarks/search_ord --color=never compare target/benchmarks/search_ord \
37 | -t 1 -o -d target/dumps | tee target/benchmark.txt
38 |
39 | - uses: actions/upload-artifact@v3
40 | with:
41 | name: benchmark-results
42 | path: |
43 | target/benchmark.txt
44 | target/dumps/*.csv
45 |
--------------------------------------------------------------------------------
/.github/workflows/test.yaml:
--------------------------------------------------------------------------------
1 | name: Test
2 |
3 | on: push
4 |
5 | jobs:
6 | lint:
7 | runs-on: ubuntu-22.04
8 | steps:
9 | - uses: actions/checkout@v4
10 | - uses: dtolnay/rust-toolchain@stable
11 | with:
12 | components: "clippy, rustfmt"
13 | - uses: olix0r/cargo-action-fmt/setup@v2
14 | - uses: Swatinem/rust-cache@v2
15 | - name: Running clippy
16 | run: cargo clippy --all-targets --all-features -p tango-bench --message-format=json | cargo-action-fmt
17 | - name: Checking formatting
18 | run: cargo fmt -- --check --color always
19 | - name: Typo
20 | uses: crate-ci/typos@master
21 | test:
22 | strategy:
23 | matrix:
24 | include:
25 | - os: ubuntu-22.04
26 | - os: macos-12
27 | - os: windows-2019
28 | runs-on: ${{ matrix.os }}
29 | steps:
30 | - uses: actions/checkout@v4
31 | - uses: dtolnay/rust-toolchain@stable
32 | - uses: Swatinem/rust-cache@v2
33 |
34 | - name: Run Tests
35 | run: cargo test
36 |
37 | bench:
38 | needs: [test]
39 | runs-on: ubuntu-22.04
40 | steps:
41 | - uses: actions/checkout@v3
42 | - uses: actions/checkout@v3
43 | with:
44 | ref: dev
45 | path: baseline-branch
46 | - uses: dtolnay/rust-toolchain@stable
47 | - uses: Swatinem/rust-cache@v2
48 | - uses: taiki-e/install-action@v2
49 | with:
50 | tool: cargo-export
51 |
52 | - name: Building Benchmarks
53 | run: |
54 | cargo export target/benchmarks -- bench --bench=tango
55 | cd baseline-branch
56 | cargo export target/benchmarks -- bench --bench=tango
57 |
58 | - name: Run Benchmarks
59 | run: |
60 | set -eo pipefail
61 |
62 | target/benchmarks/tango --color=never compare baseline-branch/target/benchmarks/tango \
63 | -v -t 1 --fail-threshold 10 | tee target/benchmark.txt
64 |
65 | - uses: actions/upload-artifact@v3
66 | with:
67 | name: benchmark.txt
68 | path: target/benchmark.txt
69 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | Cargo.lock
3 | **/*.rs.bk
4 | *.svg
5 | /graphs
6 | /*.csv
7 | .ipynb_checkpoints
8 | /local
9 | /.fleet
10 | /.vscode
11 | /.ttr.yaml
12 | /*.drawio
13 |
--------------------------------------------------------------------------------
/.typos.toml:
--------------------------------------------------------------------------------
1 | [files]
2 | extend-exclude = ["examples/benches/input.txt", "notebooks/*.ipynb"]
3 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [workspace]
2 | resolver = "2"
3 |
4 | members = [
5 | "tango-bench",
6 | "examples"
7 | ]
8 |
9 | [profile.bench]
10 | debug = true
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Denis Bazhenov
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tango.rs
2 |
3 |
4 |

5 |

6 |
7 |
8 | It used to be that benchmarking required a significant amount of time and numerous iterations to arrive at meaningful results, which was particularly arduous when trying to detect subtle changes, such as those within the range of a few percentage points.
9 |
10 | Introducing Tango.rs, a novel benchmarking framework that employs [paired benchmarking](https://www.bazhenov.me/posts/paired-benchmarking/) to assess code performance. This approach capitalizes on the fact that it's far more efficient to measure the performance difference between two simultaneously executing functions compared to two functions executed consecutively.
11 |
12 | 
13 |
14 | Features:
15 |
16 | - very high sensitivity to changes, which allows converging on results quicker than the traditional (pointwise) approach. Often a fraction of a second is enough;
17 | - ability to compare different versions of the same code from different VCS commits (A/B-benchmarking);
18 | - async support using tokio.rs;
19 | - macOS, Linux and Windows support;
20 |
21 | ## 1 second, 1 percent, 1 error
22 |
23 | Compared to traditional pointwise benchmarking, paired benchmarking is significantly more sensitive to changes. This heightened sensitivity enables the early detection of statistically significant performance variations.
24 |
25 | Tango is designed to have the capability to detect a 1% change in performance within just 1 second in at least 9 out of 10 test runs.
26 |
27 | ## Prerequisites
28 |
29 | 1. Rust and Cargo toolchain installed (Rust stable is supported on Linux/macOS, nightly is required for Windows)
30 | 1. (_Optional_) [`cargo-export`](https://github.com/bazhenov/cargo-export) installed
31 |
32 | ## Getting started
33 |
34 | 1. Add cargo dependency and create new benchmark:
35 |
36 | ```toml
37 | [dev-dependencies]
38 | tango-bench = "0.6"
39 |
40 | [[bench]]
41 | name = "factorial"
42 | harness = false
43 | ```
44 |
45 | 1. Allow rustc to export symbols for dynamic linking from benchmarks
46 |
47 | - **(Linux/macOS)** Add build script (`build.rs`) with following content
48 |
49 | ```rust,ignore
50 | fn main() {
51 | println!("cargo:rustc-link-arg-benches=-rdynamic");
52 | println!("cargo:rerun-if-changed=build.rs");
53 | }
54 | ```
55 |
56 | - **(Windows, nightly required)** Add following code to cargo config (`.cargo/config`)
57 |
58 | ```toml
59 | [build]
60 | rustflags = ["-Zexport-executable-symbols"]
61 | ```
62 |
63 | 1. Add `benches/factorial.rs` with the following content:
64 |
65 | ```rust,no_run
66 | use std::hint::black_box;
67 | use tango_bench::{benchmark_fn, tango_benchmarks, tango_main, IntoBenchmarks};
68 |
69 | pub fn factorial(mut n: usize) -> usize {
70 | let mut result = 1usize;
71 | while n > 0 {
72 | result = result.wrapping_mul(black_box(n));
73 | n -= 1;
74 | }
75 | result
76 | }
77 |
78 | fn factorial_benchmarks() -> impl IntoBenchmarks {
79 | [
80 | benchmark_fn("factorial", |b| b.iter(|| factorial(500))),
81 | ]
82 | }
83 |
84 | tango_benchmarks!(factorial_benchmarks());
85 | tango_main!();
86 | ```
87 |
88 | 1. Build and export benchmark to `target/benchmarks` directory:
89 |
90 | ```console
91 | $ cargo export target/benchmarks -- bench --bench=factorial
92 | ```
93 |
94 | 1. Now lets try to modify `factorial.rs` and make factorial faster :)
95 |
96 | ```rust,ignore
97 | fn factorial_benchmarks() -> impl IntoBenchmarks {
98 | [
99 | benchmark_fn("factorial", |b| b.iter(|| factorial(495))),
100 | ]
101 | }
102 | ```
103 |
104 | 1. Now we can compare new version with already built one:
105 |
106 | ```console
107 | $ cargo bench -q --bench=factorial -- compare target/benchmarks/factorial
108 | factorial [ 375.5 ns ... 369.0 ns ] -1.58%*
109 | ```
110 | The result shows that there is indeed a ~1% difference between `factorial(500)` and `factorial(495)`.
111 |
112 | Additional examples are available in `examples` directory.
113 |
114 | ## Async support
115 |
116 | To use Tango.rs in an asynchronous setup, follow these steps:
117 |
118 | 1. Add `tokio` and `tango-bench` dependencies to your `Cargo.toml`:
119 |
120 | ```toml
121 | [dev-dependencies]
122 | tango-bench = { version = "0.6", features = ["async-tokio"] }
123 |
124 | [[bench]]
125 | name = "async_factorial"
126 | harness = false
127 | ```
128 |
129 | 2. Create `benches/async_factorial.rs` with the following content:
130 |
131 | ```rust,no_run
132 | use std::hint::black_box;
133 | use tango_bench::{
134 | async_benchmark_fn, asynchronous::tokio::TokioRuntime, tango_benchmarks, tango_main,
135 | IntoBenchmarks,
136 | };
137 |
138 | pub async fn factorial(mut n: usize) -> usize {
139 | let mut result = 1usize;
140 | while n > 0 {
141 | result = result.wrapping_mul(black_box(n));
142 | n -= 1;
143 | }
144 | result
145 | }
146 |
147 | fn benchmarks() -> impl IntoBenchmarks {
148 | [async_benchmark_fn("async_factorial", TokioRuntime, |b| {
149 | b.iter(|| async { factorial(500).await })
150 | })]
151 | }
152 |
153 | tango_benchmarks!(benchmarks());
154 | tango_main!();
155 | ```
156 |
157 | 3. Build and use benchmarks as you do in synchronous case
158 |
159 | ```console
160 | $ cargo bench -q --bench=async_factorial -- compare
161 | ```
162 |
163 | ## Runner arguments
164 |
165 | There are several arguments you can pass to the `compare` command to change its behavior
166 |
167 | - `-t`, `--time` – how long to run each benchmark (in seconds)
168 | - `-s`, `--samples` – how many samples to gather from each benchmark
169 | - `-f` – filter benchmarks by name. Glob patterns are supported (eg. `*/bench_name/{2,4,8}/**`)
170 | - `-d [path]` – dump CSV with raw samples in a given directory
171 | - `--gnuplot` – generate plot for each benchmark (requires gnuplot to be installed)
172 | - `-o`, `--filter-outliers` – additionally filter outliers
173 | - `-p`, `--parallel` - run base/candidate functions in 2 different threads instead of interleaving in a single thread
174 | - `--fail-threshold` – fail if the new version is slower than the baseline by more than the given percentage
175 | - `--fail-fast` - fail after the first benchmark that exceeds the fail threshold, instead of after the whole suite
176 |
177 |
178 | ## Contributing
179 |
180 | The project is in its early stages so any help will be appreciated. Here are some ideas you might find interesting
181 |
182 | - find a way to provide a more user friendly API for registering functions in the system
183 | - if you're a library author, trying out tango and providing feedback will be very useful
184 |
--------------------------------------------------------------------------------
/assets/sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bazhenov/tango/2704f1fd0a74d5d33eb7beaeb35d525695aa9d0e/assets/sample.png
--------------------------------------------------------------------------------
/examples/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "tango-examples"
3 | version = "0.2.0"
4 | edition = "2021"
5 | autobenches = false
6 |
7 | [dependencies]
8 | tango-bench = { path = "../tango-bench" }
9 | rand = { version = "0.8", features = ["small_rng"] }
10 |
11 | [dev-dependencies]
12 | ordsearch = { version = "0.2.5" }
13 | num-traits = "0.2"
14 |
15 | [[bench]]
16 | name = "search-ord"
17 | harness = false
18 |
19 | [[bench]]
20 | name = "search-vec"
21 | harness = false
22 |
23 | [[bench]]
24 | name = "search-btree"
25 | harness = false
26 |
27 | [[bench]]
28 | name = "tango-faster"
29 | harness = false
30 |
31 | [[bench]]
32 | name = "tango-slower"
33 | harness = false
34 |
35 | [[bench]]
36 | name = "tango-async"
37 | harness = false
38 | required-features = ["async-tokio"]
39 |
40 | [features]
41 | prefetch = ["ordsearch/nightly"]
42 | align = []
43 | async-tokio = []
44 |
--------------------------------------------------------------------------------
/examples/benches/common.rs:
--------------------------------------------------------------------------------
1 | extern crate tango_bench;
2 |
3 | use std::{any::type_name, convert::TryFrom, fmt::Debug, iter, marker::PhantomData};
4 | use tango_bench::{benchmark_fn, IntoBenchmarks, MeasurementSettings, DEFAULT_SETTINGS};
5 |
6 | const SIZES: [usize; 14] = [
7 | 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536,
8 | ];
9 |
10 | struct Lcg(usize);
11 |
12 | impl Lcg {
13 | fn next>(&mut self, max_value: usize) -> T {
14 | self.0 = self.0.wrapping_mul(1664525).wrapping_add(1013904223);
15 | T::try_from((self.0 >> 32) % max_value).ok().unwrap()
16 | }
17 | }
18 |
19 | pub struct RandomCollection {
20 | rng: Lcg,
21 | size: usize,
22 | value_dup_factor: usize,
23 | phantom: PhantomData,
24 | }
25 |
26 | impl RandomCollection
27 | where
28 | C::Item: Ord + Copy + TryFrom,
29 | {
30 | pub fn new(size: usize, value_dup_factor: usize, seed: u64) -> Self {
31 | Self {
32 | rng: Lcg(seed as usize),
33 | size,
34 | value_dup_factor,
35 | phantom: PhantomData,
36 | }
37 | }
38 | }
39 |
40 | impl RandomCollection
41 | where
42 | C::Item: Ord + Copy + TryFrom + Debug,
43 | usize: TryFrom,
44 | {
45 | fn random_collection(&mut self) -> Sample {
46 | let vec = generate_sorted_vec(self.size, self.value_dup_factor);
47 | let max = usize::try_from(*vec.last().unwrap()).ok().unwrap();
48 |
49 | Sample {
50 | collection: C::from_sorted_vec(vec),
51 | max_value: max,
52 | }
53 | }
54 |
55 | fn next_needle(&mut self, sample: &Sample) -> C::Item {
56 | self.rng.next(sample.max_value + 1)
57 | }
58 | }
59 |
60 | fn generate_sorted_vec(size: usize, dup_factor: usize) -> Vec
61 | where
62 | T: Ord + Copy + TryFrom,
63 | {
64 | (0..)
65 | .map(|v| 2 * v)
66 | .map(|v| T::try_from(v))
67 | .map_while(Result::ok)
68 | .flat_map(|v| iter::repeat(v).take(dup_factor))
69 | .take(size)
70 | .collect()
71 | }
72 |
73 | pub struct Sample {
74 | collection: C,
75 | max_value: usize,
76 | }
77 |
78 | impl AsRef for Sample {
79 | fn as_ref(&self) -> &C {
80 | &self.collection
81 | }
82 | }
83 |
84 | pub trait FromSortedVec {
85 | type Item;
86 | fn from_sorted_vec(v: Vec) -> Self;
87 | }
88 |
89 | impl FromSortedVec for Vec {
90 | type Item = T;
91 |
92 | fn from_sorted_vec(v: Vec) -> Self {
93 | v
94 | }
95 | }
96 |
97 | /// Generate benchmarks for searching in a collection.
98 | pub fn search_benchmarks(f: F) -> impl IntoBenchmarks
99 | where
100 | C: FromSortedVec + 'static,
101 | F: Fn(&C, C::Item) -> Option + Copy + 'static,
102 | C::Item: Copy + Ord + TryFrom + Debug,
103 | usize: TryFrom,
104 | {
105 | let mut benchmarks = vec![];
106 | for size in SIZES {
107 | let name = format!("{}/{}/nodup", type_name::(), size);
108 | benchmarks.push(benchmark_fn(name, move |b| {
109 | let mut rnd = RandomCollection::::new(size, 1, b.seed);
110 | let input = rnd.random_collection();
111 | b.iter(move || f(&input.collection, rnd.next_needle(&input)))
112 | }));
113 | }
114 | benchmarks
115 | }
116 |
117 | pub const SETTINGS: MeasurementSettings = MeasurementSettings {
118 | samples_per_haystack: usize::MAX,
119 | max_iterations_per_sample: 10_000,
120 | ..DEFAULT_SETTINGS
121 | };
122 |
--------------------------------------------------------------------------------
/examples/benches/search-btree.rs:
--------------------------------------------------------------------------------
1 | #![cfg_attr(feature = "align", feature(fn_align))]
2 |
3 | use common::{search_benchmarks, FromSortedVec};
4 | use std::{collections::BTreeSet, ops::Bound};
5 | use tango_bench::{tango_benchmarks, tango_main};
6 |
7 | mod common;
8 |
9 | impl FromSortedVec for BTreeSet {
10 | type Item = T;
11 |
12 | fn from_sorted_vec(v: Vec) -> Self {
13 | BTreeSet::from_iter(v)
14 | }
15 | }
16 |
17 | #[cfg_attr(feature = "align", repr(align(32)))]
18 | #[cfg_attr(feature = "align", inline(never))]
19 | fn search_btree(haystack: &BTreeSet, needle: T) -> Option {
20 | haystack
21 | .range((Bound::Included(needle), Bound::Unbounded))
22 | .next()
23 | .copied()
24 | }
25 |
26 | tango_benchmarks!(
27 | search_benchmarks(search_btree::),
28 | search_benchmarks(search_btree::),
29 | search_benchmarks(search_btree::),
30 | search_benchmarks(search_btree::)
31 | );
32 | tango_main!(common::SETTINGS);
33 |
--------------------------------------------------------------------------------
/examples/benches/search-ord.rs:
--------------------------------------------------------------------------------
1 | #![cfg_attr(feature = "align", feature(fn_align))]
2 |
3 | use common::{search_benchmarks, FromSortedVec};
4 | use ordsearch::OrderedCollection;
5 | use tango_bench::{tango_benchmarks, tango_main};
6 |
7 | mod common;
8 |
9 | impl FromSortedVec for OrderedCollection {
10 | type Item = T;
11 | fn from_sorted_vec(v: Vec) -> Self {
12 | OrderedCollection::from_sorted_iter(v)
13 | }
14 | }
15 |
16 | #[cfg_attr(feature = "align", repr(align(32)))]
17 | #[cfg_attr(feature = "align", inline(never))]
18 | fn search_ord(haystack: &OrderedCollection, needle: T) -> Option {
19 | haystack.find_gte(needle).copied()
20 | }
21 |
22 | tango_benchmarks!(
23 | search_benchmarks(search_ord::),
24 | search_benchmarks(search_ord::),
25 | search_benchmarks(search_ord::),
26 | search_benchmarks(search_ord::)
27 | );
28 | tango_main!(common::SETTINGS);
29 |
--------------------------------------------------------------------------------
/examples/benches/search-vec.rs:
--------------------------------------------------------------------------------
1 | #![cfg_attr(feature = "align", feature(fn_align))]
2 |
3 | use common::search_benchmarks;
4 | use tango_bench::{tango_benchmarks, tango_main};
5 |
6 | mod common;
7 |
8 | #[cfg_attr(feature = "align", repr(align(32)))]
9 | #[cfg_attr(feature = "align", inline(never))]
10 | #[allow(clippy::ptr_arg)]
11 | fn search_vec(haystack: &Vec, needle: T) -> Option {
12 | haystack
13 | .binary_search(&needle)
14 | .ok()
15 | .and_then(|idx| haystack.get(idx))
16 | .copied()
17 | }
18 |
19 | tango_benchmarks!(
20 | search_benchmarks(search_vec::),
21 | search_benchmarks(search_vec::),
22 | search_benchmarks(search_vec::),
23 | search_benchmarks(search_vec::)
24 | );
25 |
26 | tango_main!(common::SETTINGS);
27 |
--------------------------------------------------------------------------------
/examples/benches/tango-async.rs:
--------------------------------------------------------------------------------
1 | #![cfg_attr(feature = "align", feature(fn_align))]
2 |
3 | use crate::test_funcs::factorial;
4 | use tango_bench::{
5 | async_benchmark_fn, asynchronous::tokio::TokioRuntime, tango_benchmarks, tango_main,
6 | IntoBenchmarks,
7 | };
8 |
9 | mod test_funcs;
10 |
11 | fn num_benchmarks() -> impl IntoBenchmarks {
12 | [async_benchmark_fn("factorial_async", TokioRuntime, |b| {
13 | b.iter(|| async { factorial(500) })
14 | })]
15 | }
16 |
17 | tango_benchmarks!(num_benchmarks());
18 | tango_main!();
19 |
--------------------------------------------------------------------------------
/examples/benches/tango-faster.rs:
--------------------------------------------------------------------------------
1 | #![cfg_attr(feature = "align", feature(fn_align))]
2 |
3 | use crate::test_funcs::{factorial, sum};
4 | use std::rc::Rc;
5 | use tango_bench::{benchmark_fn, tango_benchmarks, tango_main, IntoBenchmarks};
6 | use test_funcs::{
7 | create_str_benchmark, sort_unstable, str_count, str_take, vec_benchmarks, IndexedString,
8 | INPUT_TEXT,
9 | };
10 |
11 | mod test_funcs;
12 |
13 | fn num_benchmarks() -> impl IntoBenchmarks {
14 | [
15 | benchmark_fn("sum", |b| b.iter(|| sum(4950))),
16 | benchmark_fn("factorial", |b| b.iter(|| factorial(495))),
17 | ]
18 | }
19 |
20 | fn str_benchmarks() -> impl IntoBenchmarks {
21 | let input = Rc::new(IndexedString::from(INPUT_TEXT));
22 | [
23 | create_str_benchmark("str_length/random", &input, str_count),
24 | create_str_benchmark("str_length/random_limited", &input, |s| str_take(4950, s)),
25 | ]
26 | }
27 |
28 | tango_benchmarks!(
29 | str_benchmarks(),
30 | num_benchmarks(),
31 | vec_benchmarks(sort_unstable)
32 | );
33 | tango_main!();
34 |
--------------------------------------------------------------------------------
/examples/benches/tango-slower.rs:
--------------------------------------------------------------------------------
1 | #![cfg_attr(feature = "align", feature(fn_align))]
2 |
3 | use crate::test_funcs::{factorial, sum};
4 | use std::rc::Rc;
5 | use tango_bench::{benchmark_fn, tango_benchmarks, tango_main, IntoBenchmarks};
6 | use test_funcs::{
7 | create_str_benchmark, sort_stable, str_count_rev, str_take, vec_benchmarks, IndexedString,
8 | INPUT_TEXT,
9 | };
10 |
11 | mod test_funcs;
12 |
13 | fn num_benchmarks() -> impl IntoBenchmarks {
14 | [
15 | benchmark_fn("sum", |b| b.iter(|| sum(5000))),
16 | benchmark_fn("factorial", |b| b.iter(|| factorial(500))),
17 | ]
18 | }
19 |
20 | fn str_benchmarks() -> impl IntoBenchmarks {
21 | let input = Rc::new(IndexedString::from(INPUT_TEXT));
22 | [
23 | create_str_benchmark("str_length/random", &input, str_count_rev),
24 | create_str_benchmark("str_length/random_limited", &input, |s| str_take(5000, s)),
25 | ]
26 | }
27 |
28 | tango_benchmarks!(
29 | str_benchmarks(),
30 | num_benchmarks(),
31 | vec_benchmarks(sort_stable)
32 | );
33 | tango_main!();
34 |
--------------------------------------------------------------------------------
/examples/benches/test_funcs.rs:
--------------------------------------------------------------------------------
1 | use rand::{distributions::Standard, rngs::SmallRng, Rng, SeedableRng};
2 | use std::{hint::black_box, rc::Rc};
3 | use tango_bench::{benchmark_fn, Benchmark, IntoBenchmarks};
4 |
5 | /// HTML page with a lot of chinese text to test UTF8 decoding speed
6 | #[allow(unused)]
7 | pub const INPUT_TEXT: &str = include_str!("./input.txt");
8 |
9 | #[allow(unused)]
10 | pub(crate) fn create_str_benchmark(
11 | name: &'static str,
12 | input: &Rc,
13 | f: fn(&str) -> usize,
14 | ) -> Benchmark {
15 | let input = Rc::clone(input);
16 | benchmark_fn(name, move |b| {
17 | let mut rng = SmallRng::seed_from_u64(b.seed);
18 | let input = Rc::clone(&input);
19 | b.iter(move || f(random_substring(&input, &mut rng)))
20 | })
21 | }
22 |
23 | fn random_substring<'a>(input: &'a IndexedString, rng: &mut impl Rng) -> &'a str {
24 | let length = 50_000;
25 | let indices = &input.indices;
26 | let start = rng.gen_range(0..indices.len() - length);
27 | let range = indices[start]..indices[start + length];
28 | &input.string[range]
29 | }
30 |
31 | pub(crate) struct IndexedString {
32 | string: String,
33 | indices: Vec,
34 | }
35 |
36 | impl From<&str> for IndexedString {
37 | fn from(value: &str) -> Self {
38 | Self {
39 | string: value.to_owned(),
40 | indices: build_char_indices(value),
41 | }
42 | }
43 | }
44 |
45 | fn build_char_indices(text: &str) -> Vec {
46 | text.char_indices().map(|(idx, _)| idx).collect()
47 | }
48 |
49 | #[cfg_attr(feature = "align", repr(align(32)))]
50 | #[cfg_attr(feature = "align", inline(never))]
51 | #[allow(unused)]
52 | pub fn sum(n: usize) -> usize {
53 | let mut sum = 0;
54 | for i in 0..black_box(n) {
55 | sum += black_box(i);
56 | }
57 | sum
58 | }
59 |
60 | #[cfg_attr(feature = "align", repr(align(32)))]
61 | #[cfg_attr(feature = "align", inline(never))]
62 | #[allow(unused)]
63 | pub fn factorial(mut n: usize) -> usize {
64 | let mut result = 1usize;
65 | while n > 0 {
66 | result = result.wrapping_mul(black_box(n));
67 | n -= 1;
68 | }
69 | result
70 | }
71 |
72 | #[cfg_attr(feature = "align", repr(align(32)))]
73 | #[cfg_attr(feature = "align", inline(never))]
74 | #[allow(unused)]
75 | #[allow(clippy::ptr_arg)]
76 | pub fn str_count_rev(s: &str) -> usize {
77 | let mut l = 0;
78 | for _ in s.chars().rev() {
79 | l += 1;
80 | }
81 | l
82 | }
83 |
84 | #[cfg_attr(feature = "align", repr(align(32)))]
85 | #[cfg_attr(feature = "align", inline(never))]
86 | #[allow(unused)]
87 | #[allow(clippy::ptr_arg)]
88 | pub fn str_count(s: &str) -> usize {
89 | let mut l = 0;
90 | for _ in s.chars() {
91 | l += 1;
92 | }
93 | l
94 | }
95 |
96 | #[cfg_attr(feature = "align", repr(align(32)))]
97 | #[cfg_attr(feature = "align", inline(never))]
98 | #[allow(unused)]
99 | #[allow(clippy::ptr_arg)]
100 | pub fn str_take(n: usize, s: &str) -> usize {
101 | s.chars().take(black_box(n)).count()
102 | }
103 |
104 | #[cfg_attr(feature = "align", repr(align(32)))]
105 | #[cfg_attr(feature = "align", inline(never))]
106 | #[allow(unused)]
107 | #[allow(clippy::ptr_arg)]
108 | pub fn sort_unstable(input: &Vec) -> T {
109 | let mut input = input.clone();
110 | input.sort_unstable();
111 | input[input.len() / 2]
112 | }
113 |
114 | #[cfg_attr(feature = "align", repr(align(32)))]
115 | #[cfg_attr(feature = "align", inline(never))]
116 | #[allow(unused)]
117 | #[allow(clippy::ptr_arg)]
118 | pub fn sort_stable(input: &Vec) -> T {
119 | let mut input = input.clone();
120 | input.sort();
121 | input[input.len() / 2]
122 | }
123 |
124 | #[allow(unused)]
125 | pub fn vec_benchmarks(f: impl Fn(&Vec) -> u64 + Copy + 'static) -> impl IntoBenchmarks {
126 | let mut benches = vec![];
127 | for size in [100, 1_000, 10_000, 100_000] {
128 | benches.push(benchmark_fn(format!("sort/{}", size), move |b| {
129 | let input: Vec = SmallRng::seed_from_u64(b.seed)
130 | .sample_iter(Standard)
131 | .take(1000)
132 | .collect();
133 | b.iter(move || f(&input))
134 | }))
135 | }
136 | benches
137 | }
138 |
--------------------------------------------------------------------------------
/examples/build.rs:
--------------------------------------------------------------------------------
1 | fn main() {
2 | println!("cargo:rustc-link-arg-benches=-rdynamic");
3 | println!("cargo:rerun-if-changed=build.rs");
4 | }
5 |
--------------------------------------------------------------------------------
/examples/criterion-ordsearch.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # This script runs the ordsearch criterion benchmarks and writes a log into the target folder
4 | # Should be executed in the ordsearch directory
5 |
6 | set -eo pipefail
7 |
8 | FILE=./target/criterion.txt
9 |
10 | if [ -f "${FILE}" ]; then
11 | rm -f "${FILE}"
12 | fi
13 |
14 | for i in {1..30}; do
15 | cargo +nightly bench \
16 | --bench=search_comparison \
17 | --features=nightly \
18 | "Search u8/(sorted_vec|ordsearch)/8$" >> "${FILE}"
19 | done
20 |
21 | for NAME in "u8/sorted_vec/8" "u8/ordsearch/8"; do
22 | echo "${NAME}"
23 | cat "${FILE}" | grep "${NAME}" | grep 'time:' | awk '{print $6}'
24 | done
25 |
26 |
27 |
--------------------------------------------------------------------------------
/examples/ordsearch.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -eo pipefail
4 |
5 | cargo export target/benchmarks -- bench --bench='search-*'
6 |
7 | echo "OrderedCollection vs Vec"
8 | target/benchmarks/search_vec compare target/benchmarks/search_ord $@
9 |
10 | echo "OrderedCollection vs BTree"
11 | target/benchmarks/search_btree compare target/benchmarks/search_ord $@
12 |
--------------------------------------------------------------------------------
/notebooks/empirical-observations.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "cbfc36d7-9096-4fb4-bbc8-ae2758a2e4db",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import scipy.stats as st\n",
11 | "import numpy as np\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import pandas as pd\n",
14 | "%config InlineBackend.figure_formats = ['svg']\n",
15 | "\n",
16 | "#def cum_var(input):\n",
17 | "# return np.array([np.var(input[:i+1]) for i in np.arange(0, len(input))])\n",
18 | "\n",
19 | "def cum_var(input):\n",
20 | " s = 0\n",
21 | " m = 0\n",
22 | " n = 0\n",
23 | " result = [0]\n",
24 | "\n",
25 | " for value in input:\n",
26 | " n += 1\n",
27 | " m_p = m;\n",
28 | " m += (value - m) / n\n",
29 | " s += (value - m) * (value - m_p)\n",
30 | " if n > 1:\n",
31 | " result.append(s / (n - 1))\n",
32 | " return np.array(result)"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "id": "cef6ca34-a74c-4175-b415-fce63bb04906",
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "d = pd.read_csv(\"../sum_50000-sum_50000.csv\", header=None, names=[\"base\", \"candidate\"])\n",
43 | "#d = pd.read_csv(\"../factorial_500-factorial_495.csv\", header=None, names=[\"base\", \"candidate\"])\n",
44 | "\n",
45 | "d"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "id": "e5a882b1-86fc-44b4-8ec3-fccbe5146be6",
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "fig = plt.figure()\n",
56 | "ax = fig.add_subplot()\n",
57 | "\n",
58 | "x = [np.min(d), np.max(d)]\n",
59 | "ax.set_yscale('log')\n",
60 | "#ax.set_xscale('log')\n",
61 | "ax.hist(d['candidate'] - d['base'], bins = 500)\n",
62 | "\n",
63 | "fig.show()"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "id": "534303db-4627-459c-accc-ff5195beac03",
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "fig = plt.figure()\n",
74 | "ax = fig.add_subplot()\n",
75 | "\n",
76 | "x = [np.min(d), np.max(d)]\n",
77 | "ax.set_yscale('log')\n",
78 | "ax.set_xscale('log')\n",
79 | "ax.plot(x, x, color='red', linewidth=0.3)\n",
80 | "ax.scatter(d['base'], d['candidate'], s=2)\n",
81 | "\n",
82 | "fig.show()"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "id": "578fca8c-e1b2-48e2-a175-a8d98c185d89",
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "diff = np.abs(d['candidate'] - d['base'])\n",
93 | "diff = diff[np.abs(diff).argsort()]\n",
94 | "\n",
95 | "forward_var = cum_var(diff)\n",
96 | "ratio = np.roll(forward_var, -1) / forward_var"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "id": "3ba7aa8a-7ae8-4eae-864f-95ec16183647",
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "#ax[1].set_ylim(0.9, 1.5)\n",
107 | "fig = plt.figure()\n",
108 | "ax = fig.subplots(2, 1)\n",
109 | "\n",
110 | "x = np.arange(0, len(diff))\n",
111 | "last_idx = 200\n",
112 | "#ax.set_yscale('log')\n",
113 | "#ax.set_xscale('log')\n",
114 | "#ax[0].set_ylim(-1000, 1000)\n",
115 | "factor = 10\n",
116 | "ax[0].plot(x[x % factor == 0], forward_var[x % factor == 0], linewidth=0.3)\n",
117 | "\n",
118 | "ax[1].plot(x[x % factor == 0][-1000:], ratio[x % factor == 0], linewidth=0.5, color='red')\n",
119 | "\n",
120 | "print(np.max(ratio[-100:]))\n",
121 | "\n",
122 | "fig.show()"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "id": "6e901144-0f45-450b-8f0d-e20ce89dfd7c",
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "print(ratio[ratio > 1.05])\n",
133 | "print(x[ratio > 1.05])\n",
134 | "#print(ratio[-last_idx:])"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "id": "46b772c6-377d-4615-a849-078b77f1713b",
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "print(diff[-800:])"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "id": "79837283-9a2c-49b8-bf61-a2142f13b9d1",
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "d = pd.read_csv(\"../sum_50000-sum_50000.csv\", header=None, names=[\"base\", \"candidate\"])\n",
155 | "#d = pd.read_csv(\"../factorial_500-factorial_500.csv\", header=None, names=[\"base\", \"candidate\"])\n",
156 | "\n",
157 | "diff = np.abs(d['candidate'] - d['base'])\n",
158 | "diff = diff[np.abs(diff).argsort()]\n",
159 | "\n",
160 | "forward_var = cum_var(diff)\n",
161 | "ratio = np.roll(forward_var, -1) / forward_var\n",
162 | "\n",
163 | "fig = plt.figure()\n",
164 | "ax = fig.subplots(1, 1)\n",
165 | "\n",
166 | "x = np.arange(0, len(diff))\n",
167 | "\n",
168 | "last = len(diff) // 1\n",
169 | "\n",
170 | "x = x[-last:]\n",
171 | "y = np.maximum(ratio[-last:], 1)\n",
172 | "y = y - 1\n",
173 | "ax.set_yscale('log')\n",
174 | "ax.plot(x, y, linewidth=0.5, color='red')\n",
175 | "ax.plot(x, 1 / (x - 1) * 10, linewidth=0.5, color='blue')\n",
176 | "\n",
177 | "fig.show()"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "id": "9079380c-8157-48f0-80b2-c8cf18b7b5b3",
184 | "metadata": {},
185 | "outputs": [],
186 | "source": [
187 | "np.sqrt(len(d))"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "id": "3cbc0903-8425-41ba-8f93-e71bb470ffbc",
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "len(d[d['base'] - d['candidate'] == 0])/len(d)"
198 | ]
199 | }
200 | ],
201 | "metadata": {
202 | "kernelspec": {
203 | "display_name": "Python 3 (ipykernel)",
204 | "language": "python",
205 | "name": "python3"
206 | },
207 | "language_info": {
208 | "codemirror_mode": {
209 | "name": "ipython",
210 | "version": 3
211 | },
212 | "file_extension": ".py",
213 | "mimetype": "text/x-python",
214 | "name": "python",
215 | "nbconvert_exporter": "python",
216 | "pygments_lexer": "ipython3",
217 | "version": "3.11.5"
218 | }
219 | },
220 | "nbformat": 4,
221 | "nbformat_minor": 5
222 | }
223 |
--------------------------------------------------------------------------------
/notebooks/ordsearch.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "e5023e12-a161-4793-899c-27f73e3833ad",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pandas as pd\n",
11 | "import scipy.stats as st\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "%config InlineBackend.figure_formats = ['svg']"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 152,
19 | "id": "78d35383-8799-4d53-935a-933bae392108",
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "D=pd.read_csv(\"../test-data\", sep=\";\", header=None)\n",
24 | "\n",
25 | "grouped = {}\n",
26 | "for row in D.iterrows():\n",
27 | " key = row[1][0]\n",
28 | " value = float(row[1][1])\n",
29 | " if not key in grouped:\n",
30 | " grouped[key] = []\n",
31 | " grouped[key].append(value)"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 153,
37 | "id": "61488a42-ac86-41a8-9ef1-241c30db2fd1",
38 | "metadata": {},
39 | "outputs": [
40 | {
41 | "data": {
42 | "image/svg+xml": [
43 | "\n",
44 | "\n",
46 | "\n"
1145 | ],
1146 | "text/plain": [
1147 | ""
1148 | ]
1149 | },
1150 | "metadata": {},
1151 | "output_type": "display_data"
1152 | }
1153 | ],
1154 | "source": [
1155 | "names = list(grouped.keys())\n",
1156 | "box_plot_data = list(grouped.values())\n",
1157 | "\n",
1158 | "fig, ax = plt.subplots(figsize=(6, 2))\n",
1159 | "\n",
1160 | "#fig.title(\"Fruit Growth Distribution\")\n",
1161 | "ax.boxplot(box_plot_data, labels=names, whis=(0.5,99.5), showfliers=False, vert=0, widths=0.8, sym='.', flierprops=dict(markersize=4))\n",
1162 | "# ax.set_xlim([-75,-64])\n",
1163 | "# ax.set_xticks(list(range(-90, -60, 1)))\n",
1164 | "plt.grid(visible=True, which='major', axis='x', color='grey', linestyle='--', linewidth=0.5)\n",
1165 | "#ax.set_yticks()\n",
1166 | "plt.plot();"
1167 | ]
1168 | }
1169 | ],
1170 | "metadata": {
1171 | "kernelspec": {
1172 | "display_name": "Python 3 (ipykernel)",
1173 | "language": "python",
1174 | "name": "python3"
1175 | },
1176 | "language_info": {
1177 | "codemirror_mode": {
1178 | "name": "ipython",
1179 | "version": 3
1180 | },
1181 | "file_extension": ".py",
1182 | "mimetype": "text/x-python",
1183 | "name": "python",
1184 | "nbconvert_exporter": "python",
1185 | "pygments_lexer": "ipython3",
1186 | "version": "3.11.5"
1187 | }
1188 | },
1189 | "nbformat": 4,
1190 | "nbformat_minor": 5
1191 | }
1192 |
--------------------------------------------------------------------------------
/pair-test.gnuplot:
--------------------------------------------------------------------------------
1 | set term svg enhanced size 1200,400 lw 1.5
2 | set output ARG2
3 | set grid
4 | # ARG1 = input CSV ($1/$2 = paired time measurements, $3 = iteration count — inferred from usage), ARG2 = output SVG
5 | set datafile separator ','
6 |
7 | set multiplot
8 |
9 | set ylabel "time (us.)"
10 | set xlabel "observation no"
11 |
12 | set title "Execution time"
13 | set size 0.6,1
14 | set origin 0,0
15 | plot ARG1 using ($1/$3 / 1000) title "base" with linespoints pt 1 ps 0.3 lw 0.8 lc 'dark-red', \
16 | ARG1 using (-$2/$3 / 1000) title "-candidate" with linespoints pt 1 ps 0.3 lw 0.8 lc 'dark-green', \
17 | ARG1 using (($2 - $1) / $3 / 1000) title "(candidate-baseline)" with lines lw 0.8 lc 'navy'
18 |
19 | set xtics autofreq
20 | set ytics autofreq
21 |
22 | set ylabel "time (us.) - candidate"
23 | set xlabel "time (us.) - base"
24 |
25 | f(x) = x
26 |
27 | unset title
28 | set size 0.4,1
29 | set origin 0.6,0
30 | unset key
31 |
32 | plot f(x) notitle with lines linestyle 1 lc "red" dt 4 lw 1, \
33 | ARG1 using ($1 / $3 / 1000):($2 / $3 / 1000) title "time to execute" with points pt 1 ps 0.5 lc rgb 'dark-red'
34 |
--------------------------------------------------------------------------------
/render-plot.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | gnuplot -c plot.gnuplot "$1" "plot.svg" && osascript -e 'tell application "Google Chrome" to tell the active tab of its first window to reload'
4 |
--------------------------------------------------------------------------------
/rust-toolchain.toml:
--------------------------------------------------------------------------------
1 | [toolchain]
2 | channel = "nightly"
3 |
--------------------------------------------------------------------------------
/scripts/aws-bench.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # This script automates an experiment on an AWS virtual machine
4 | #
5 | # The goal of an experiment is to measure performance variance reported by both harnesses (tango/criterion).
6 | # UTF8 counting routine is used as a test function. The first one is counting up to 5000 characters in a string
7 | # the second is up to 4950. We are expecting to see 1% difference in performance of those two functions
8 |
9 | CRITERION=./target/criterion.txt
10 | TANGO=./target/tango.txt
11 | TANGO_FILTERED=./target/tango-filtered.txt
12 |
13 | # Building and exporting all benchmarks. Align feature is used to disable inlining and to force 32-byte aligning
14 | # of the tested functions. Without this trick the performance of the functions on Intel platforms is heavily influenced
15 | # by code aligning.
16 | cargo +nightly export ./target/benchmarks -- bench --features=align --bench=criterion
17 | cargo +nightly export target/benchmarks -- bench --features=align --bench='tango-*'
18 |
19 | while :
20 | do
21 | date | tee -a "${CRITERION}" | tee -a "${TANGO}" | tee -a "${TANGO_FILTERED}"
22 |
23 | # Running criterion benchmarks
24 | ./target/benchmarks/criterion --bench str_length_495 \
25 | --warm-up-time 1 --measurement-time 1 | tee -a "${CRITERION}"
26 | ./target/benchmarks/criterion --bench str_length_500 \
27 | --warm-up-time 1 --measurement-time 1 | tee -a "${CRITERION}"
28 |
29 | # Running tango benchmarks
30 | ./target/benchmarks/tango_faster compare ./target/benchmarks/tango_slower \
31 | -t 2 -f 'str_length_limit' | tee -a "${TANGO}"
32 | ./target/benchmarks/tango_faster compare ./target/benchmarks/tango_slower \
33 | -t 2 -o -f 'str_length_limit' | tee -a "${TANGO_FILTERED}"
34 | done
35 |
--------------------------------------------------------------------------------
/scripts/aws-results.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -eo pipefail
3 |
4 | CRITERION=./target/criterion.txt
5 | TANGO=./target/tango.txt
6 | TANGO_FILTERED=./target/tango-filtered.txt
7 |
8 | if [ "$1" == "tango" ]; then
9 | cat "${TANGO}" | awk '{print $(NF)}' | egrep -o '(-|\+)[0-9]+\.[0-9]+'
10 | fi
11 |
12 | if [ "$1" == "tango-filtered" ]; then
13 | cat "${TANGO_FILTERED}" | awk '{print $(NF)}' | egrep -o '(-|\+)[0-9]+\.[0-9]+'
14 | fi
15 |
16 | if [ "$1" == "criterion" ]; then
17 | paste \
18 | <(cat "${CRITERION}" | grep -A1 "str_length_5000" | grep 'time:' | awk '{print $5}') \
19 | <(cat "${CRITERION}" | grep -A1 "str_length_4950" | grep 'time:' | awk '{print $5}') | \
20 | awk '{print ($2 - $1) / $1 * 100}'
21 | fi
22 |
--------------------------------------------------------------------------------
/scripts/calibrate.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # This script runs tango-faster/tango-slower pair of benchmarks several times
4 | # and reports how many times the results were statistically significant. Because
5 | # those benchmarks were intentionally constructed with a performance difference,
6 | # the higher the reported count the better (like [10/10]).
7 |
8 | set -eo pipefail
9 |
10 | cargo export target/benchmarks -- bench --bench='tango-*' --features=align
11 |
12 | CMD="target/benchmarks/tango_faster compare target/benchmarks/tango_slower $@"
13 | OUTPUT=""
14 | ITERATIONS=10
15 |
16 | for (( i=1; i<=ITERATIONS; i++ ))
17 | do
18 | echo -n "."
19 | OUTPUT=$(paste <(echo "$OUTPUT") <($CMD))
20 | done
21 | echo
22 |
23 | echo "Results:"
24 | echo "$OUTPUT" | awk -v iter="$ITERATIONS" -F ' {2,}' '{printf(" [%3d/%3d] %s\n", gsub(/\*/,"", $0), iter, $1)}'
25 |
--------------------------------------------------------------------------------
/scripts/criterion.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -eo pipefail
3 |
4 | FILE=./target/criterion.txt
5 |
6 | if [ -f "${FILE}" ]; then
7 | rm -f "${FILE}"
8 | fi
9 |
10 | cargo export ./target/benchmarks -- bench --bench=criterion
11 |
12 | time (
13 | for i in {1..30}; do
14 | ./target/benchmarks/criterion --bench str_length_495 \
15 | --warm-up-time 1 --measurement-time 1 >> "${FILE}"
16 | ./target/benchmarks/criterion --bench str_length_500 \
17 | --warm-up-time 1 --measurement-time 1 >> "${FILE}"
18 | done
19 | )
20 |
21 | paste \
22 | <(cat "${FILE}" | grep -A1 "str_length_500" | grep 'time:' | awk '{print $5}') \
23 | <(cat "${FILE}" | grep -A1 "str_length_495" | grep 'time:' | awk '{print $5}') | \
24 | awk '{print ($2 - $1) / $1 * 100}'
25 |
--------------------------------------------------------------------------------
/scripts/describe.py:
--------------------------------------------------------------------------------
1 | import colorama
2 | import pandas as pd
3 | import numpy as np
4 | import sys
5 | from colorama import Fore, Back, Style
6 |
7 | def main():
8 | colorama.init()
9 | max_length = [len(name) for name in sys.argv[1:]]
10 | for file in sys.argv[1:]:
11 | df = pd.read_csv(file, header=None)
12 | time = (df[1] - df[0]) / df[2]
13 |
14 | file_name = (file + " ").ljust(max(max_length) + 2, '.')
15 |
16 | print(f"{Fore.LIGHTBLACK_EX}{file_name} {Style.RESET_ALL}", end="")
17 | q1 = np.percentile(time, 25)
18 | q3 = np.percentile(time, 75)
19 | print(f"{np.percentile(time, 5):>10.1f} {time.mean():>10.1f} {np.percentile(time, 95):>10.1f} [IQR:{q3 - q1:>10.1f}]")
20 |
21 | if __name__ == "__main__":
22 | main();
23 |
--------------------------------------------------------------------------------
/scripts/linear-sampling-test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -eo pipefail
3 |
4 | cargo +nightly export ./target/benchmarks -- bench --bench='search-*'
5 |
6 | mkdir -p target/dump
7 | rm -f target/dump/*.csv
8 |
9 | for i in {1..30}; do
10 | for sampler in flat linear random; do
11 | printf "%10s : " "$sampler"
12 | ./target/benchmarks/search_ord compare ./target/benchmarks/search_vec -t 1 \
13 | -f 'search/u32/1024/nodup' -d target/dump --sampler="$sampler"
14 | mv "target/dump/search-u32-1024-nodup.csv" "target/dump/$sampler-$i.csv"
15 | done
16 | done
17 |
--------------------------------------------------------------------------------
/scripts/ordsearch.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -eo pipefail
3 |
4 | cargo +nightly export target/benchmarks -- bench --bench="search-*"
5 |
6 | pushd ../ordsearch/
7 | cargo +nightly export ../tango/target/benchmarks -- bench --bench=search_comparison --features=nightly
8 | popd
9 |
10 | rm -f target/tango.txt
11 | rm -f target/criterion.txt
12 |
13 | # Patching PIE executable if needed
14 | target/benchmarks/search_ord compare target/benchmarks/search_vec -f "*/u32/1024/nodup" -t 0.1 > /dev/null
15 | if [ -f target/benchmarks/search_vec.patched ]; then
16 | mv target/benchmarks/search_vec.patched target/benchmarks/search_vec
17 | chmod +x target/benchmarks/search_vec
18 | fi
19 |
20 | for i in {1..1000}; do
21 | # Tango benchmarks
22 | (
23 | for time in 0.1 0.3 0.5 1.0; do
24 | target/benchmarks/search_ord compare target/benchmarks/search_vec -f "*/u32/1024/nodup" -t "$time" -o \
25 | | awk -v OFS=';' -v FS=" " -v time="$time" '{print "tango/u32/1024/" time "s", $NF}' | tr -d '%*'
26 | done
27 | ) | tee -a target/tango.txt
28 |
29 | # Criterion benchmarks
30 | target/benchmarks/search_comparison --bench "Search u32/(ordsearch|sorted_vec)/1024" \
31 | | tee -a target/criterion.txt
32 | done
33 |
34 | # Reporting code
35 | paste \
36 | <(cat target/criterion.txt | grep -A1 'sorted_vec' | grep 'time:' | awk '{print $4}') \
37 | <(cat target/criterion.txt | grep -A1 'ordsearch' | grep 'time:' | awk '{print $4}') \
38 | | awk 'OFS=";" {print "criterion/u32/1024", ($2 - $1) / $1 * 100}' > target/criterion_u32_1024.txt
39 | (cat target/criterion_u32_1024.txt; cat target/tango.txt) > target/results.txt
40 |
41 | (cat target/criterion.txt | grep 'change:' | awk 'OFS=";" {print "criterion/u32/1024", $3}'; cat target/tango.txt ) | tr -d '%' > target/results.txt
42 |
--------------------------------------------------------------------------------
/scripts/sensitivity-test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -eo pipefail
3 | # A/A sensitivity test: compare a benchmark binary against itself and
4 | # summarize the dumped CSVs. Extra CLI arguments are forwarded to the binary.
5 | cargo +nightly export ./target/benchmarks -- bench --bench='tango-*'
6 | # Start from a clean dump directory so stale CSVs don't skew the summary.
7 | mkdir -p target/dump
8 | rm -f target/dump/*.csv
9 |
10 | TARGET=target/benchmarks/tango_faster
11 | # "$@" (quoted) forwards arguments verbatim; the unquoted $@ used previously
12 | # would word-split and glob-expand any argument containing spaces or wildcards.
13 | "$TARGET" compare "$TARGET" -d target/dump "$@"
14 | python3 ./scripts/describe.py target/dump/*.csv
15 |
--------------------------------------------------------------------------------
/scripts/tango.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -eo pipefail
3 |
4 | FILE=./target/tango.txt
5 |
6 | if [ -f "${FILE}" ]; then
7 | rm -f "${FILE}"
8 | fi
9 |
10 | cargo export target/benchmarks -- bench --bench='tango-*'
11 |
12 | time (
13 | for i in {1..30}; do
14 | ./target/benchmarks/tango_faster compare ./target/benchmarks/tango_slower \
15 | -t 1 -f 'str_length_limit' >> "${FILE}"
16 | done
17 | )
18 |
19 | cat "${FILE}" | awk '{print $(NF)}' | sed 's/%//'
20 |
--------------------------------------------------------------------------------
/tango-bench/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "tango-bench"
3 | version = "0.6.0"
4 | edition = "2021"
5 | license = "MIT"
6 | description = "Tango benchmarking harness"
7 | homepage = "https://github.com/bazhenov/tango"
8 | documentation = "https://docs.rs/tango-bench"
9 | repository = "https://github.com/bazhenov/tango"
10 | readme = "../README.md"
11 | categories = ["development-tools", "development-tools::profiling"]
12 | keywords = ["benchmarks", "performance"]
13 |
14 | [dependencies]
15 | anyhow = "1.0.75"
16 | clap = { version = "4.4.11", features = ["derive"] }
17 | colorz = { version = "1.1", features = ["supports-color"] }
18 | glob-match = "0.2"
19 | libloading = "0.8"
20 | log = "0.4.20"
21 | num-traits = "0.2"
22 | rand = { version = "0.8", features = ["small_rng"] }
23 | thiserror = "1.0.50"
24 | alloca = "0.4"
25 | tokio = { version = "1.37.0", features = ["rt"], optional = true }
26 |
27 | [target.'cfg(target_os = "linux")'.dependencies]
28 | goblin = "0.7.1"
29 | scroll = "0.11"
30 | tempfile = "3.8"
31 |
32 | [features]
33 | hw-timer = []
34 | async = []
35 | async-tokio = ['async', 'dep:tokio']
36 |
37 | [[bench]]
38 | name = "tango"
39 | harness = false
40 |
--------------------------------------------------------------------------------
/tango-bench/benches/tango.rs:
--------------------------------------------------------------------------------
1 | use rand::{distributions::Standard, rngs::SmallRng, Rng, SeedableRng};
2 | use tango_bench::{
3 |     benchmark_fn, iqr_variance_thresholds, tango_benchmarks, tango_main, IntoBenchmarks, Summary,
4 | };
5 |
6 | // NOTE(review): the `Vec<f64>` type parameters below were missing in the
7 | // checked-in text (likely stripped during an export); without an element
8 | // type the `collect()` calls cannot compile. f64 matches what
9 | // `Summary::from` / `iqr_variance_thresholds` consume — confirm against lib.rs.
10 |
11 | /// Measures the cost of computing a `Summary` over 1000 random samples.
12 | fn summary_benchmarks() -> impl IntoBenchmarks {
13 |     [benchmark_fn("summary", move |b| {
14 |         let rnd = SmallRng::seed_from_u64(b.seed);
15 |         let input: Vec<f64> = rnd.sample_iter(Standard).take(1000).collect();
16 |         b.iter(move || Summary::from(&input))
17 |     })]
18 | }
19 |
20 | /// Measures the cost of the IQR-based outlier-threshold computation.
21 | fn iqr_interquartile_range_benchmarks() -> impl IntoBenchmarks {
22 |     [benchmark_fn("iqr", move |b| {
23 |         let rnd = SmallRng::seed_from_u64(b.seed);
24 |         let input: Vec<f64> = rnd.sample_iter(Standard).take(1000).collect();
25 |         b.iter(move || iqr_variance_thresholds(input.clone()))
26 |     })]
27 | }
28 |
29 | /// Measures the overhead of the harness itself by timing an empty function.
30 | fn empty_benchmarks() -> impl IntoBenchmarks {
31 |     [benchmark_fn("measure_empty_function", move |p| {
32 |         let mut bench = benchmark_fn("_", |b| b.iter(|| 42));
33 |         let mut state = bench.prepare_state(p.seed);
34 |         p.iter(move || state.measure(1))
35 |     })]
36 | }
37 |
38 | tango_benchmarks!(
39 |     empty_benchmarks(),
40 |     summary_benchmarks(),
41 |     iqr_interquartile_range_benchmarks()
42 | );
43 | tango_main!();
44 |
--------------------------------------------------------------------------------
/tango-bench/build.rs:
--------------------------------------------------------------------------------
1 | fn main() {
2 | println!("cargo:rustc-link-arg-benches=-rdynamic");
3 | println!("cargo:rerun-if-changed=build.rs");
4 | }
5 |
--------------------------------------------------------------------------------
/tango-bench/src/cli.rs:
--------------------------------------------------------------------------------
1 | //! Contains functionality of a `cargo bench` harness
2 | use crate::{
3 | dylib::{FunctionIdx, Spi, SpiModeKind},
4 | CacheFirewall, Error, FlatSampleLength, LinearSampleLength, MeasurementSettings,
5 | RandomSampleLength, SampleLength, SampleLengthKind,
6 | };
7 | use anyhow::{bail, Context};
8 | use clap::Parser;
9 | use colorz::mode::{self, Mode};
10 | use core::fmt;
11 | use glob_match::glob_match;
12 | use std::{
13 | env::{self, args, temp_dir},
14 | fmt::Display,
15 | fs,
16 | io::{stderr, Write},
17 | num::NonZeroUsize,
18 | path::{Path, PathBuf},
19 | process::{Command, ExitCode, Stdio},
20 | str::FromStr,
21 | time::Duration,
22 | };
23 |
24 | pub type Result = anyhow::Result;
25 | pub(crate) type StdResult = std::result::Result;
26 |
27 | #[derive(Parser, Debug)]
28 | enum BenchmarkMode {
29 | List {
30 | #[command(flatten)]
31 | bench_flags: CargoBenchFlags,
32 | },
33 | Compare(PairedOpts),
34 | Solo(SoloOpts),
35 | }
36 |
37 | #[derive(Parser, Debug)]
38 | struct PairedOpts {
39 | #[command(flatten)]
40 | bench_flags: CargoBenchFlags,
41 |
42 | /// Path to the executable to test against. Tango will test against itself if no executable given
43 | path: Option,
44 |
45 | /// write CSV dumps of all the measurements in a given location
46 | #[arg(short = 'd', long = "dump")]
47 | path_to_dump: Option,
48 |
49 | /// generate gnuplot graphs for each test (requires --dump [path] to be specified)
50 | #[arg(short = 'g', long = "gnuplot")]
51 | gnuplot: bool,
52 |
53 | /// seed for the random number generator or omit to use a random seed
54 | #[arg(long = "seed")]
55 | seed: Option,
56 |
57 | /// Number of samples to take for each test
58 | #[arg(short = 's', long = "samples")]
59 | samples: Option,
60 |
61 | /// The strategy to decide the number of iterations to run for each sample (values: flat, linear, random)
62 | #[arg(long = "sampler")]
63 | sampler: Option,
64 |
65 | /// Duration of each sample in seconds
66 | #[arg(short = 't', long = "time")]
67 | time: Option,
68 |
69 | /// Fail if the difference between the two measurements is greater than the given threshold in percent
70 | #[arg(long = "fail-threshold")]
71 | fail_threshold: Option,
72 |
73 | /// Should we terminate early if --fail-threshold is exceeded
74 | #[arg(long = "fail-fast")]
75 | fail_fast: bool,
76 |
77 | /// Perform a read of dummy data between samples to minimize the effect of cache on the performance
78 | /// (size in Kbytes)
79 | #[arg(long = "cache-firewall")]
80 | cache_firewall: Option,
81 |
82 | /// Perform a randomized offset to the stack frame for each sample.
83 | /// (size in bytes)
84 | #[arg(long = "randomize-stack")]
85 | randomize_stack: Option,
86 |
87 | /// Delegate control back to the OS before each sample
88 | #[arg(long = "yield-before-sample")]
89 | yield_before_sample: Option,
90 |
91 | /// Filter tests by name (eg. '*/{sorted,unsorted}/[0-9]*')
92 | #[arg(short = 'f', long = "filter")]
93 | filter: Option,
94 |
95 | /// Report only statistically significant results
96 | #[arg(short = 'g', long = "significant-only", default_value_t = false)]
97 | significant_only: bool,
98 |
99 | /// Enable outlier detection
100 | #[arg(short = 'o', long = "filter-outliers")]
101 | filter_outliers: bool,
102 |
103 | /// Perform warmup iterations before taking measurements (1/10 of sample iterations)
104 | #[arg(long = "warmup")]
105 | warmup_enabled: Option,
106 |
107 | #[arg(short = 'p', long = "parallel")]
108 | parallel: bool,
109 |
110 | /// Quiet mode
111 | #[arg(short = 'q')]
112 | quiet: bool,
113 |
114 | #[arg(short = 'v', long = "verbose", default_value_t = false)]
115 | verbose: bool,
116 | }
117 |
118 | #[derive(Parser, Debug)]
119 | struct SoloOpts {
120 | #[command(flatten)]
121 | bench_flags: CargoBenchFlags,
122 |
123 | /// seed for the random number generator or omit to use a random seed
124 | #[arg(long = "seed")]
125 | seed: Option,
126 |
127 | /// Number of samples to take for each test
128 | #[arg(short = 's', long = "samples")]
129 | samples: Option,
130 |
131 | /// The strategy to decide the number of iterations to run for each sample (values: flat, linear, random)
132 | #[arg(long = "sampler")]
133 | sampler: Option,
134 |
135 | /// Duration of each sample in seconds
136 | #[arg(short = 't', long = "time")]
137 | time: Option,
138 |
139 | /// Perform a read of dummy data between samples to minimize the effect of cache on the performance
140 | /// (size in Kbytes)
141 | #[arg(long = "cache-firewall")]
142 | cache_firewall: Option,
143 |
144 | /// Perform a randomized offset to the stack frame for each sample.
145 | /// (size in bytes)
146 | #[arg(long = "randomize-stack")]
147 | randomize_stack: Option,
148 |
149 | /// Delegate control back to the OS before each sample
150 | #[arg(long = "yield-before-sample")]
151 | yield_before_sample: Option,
152 |
153 | /// Filter tests by name (eg. '*/{sorted,unsorted}/[0-9]*')
154 | #[arg(short = 'f', long = "filter")]
155 | filter: Option,
156 |
157 | /// Perform warmup iterations before taking measurements (1/10 of sample iterations)
158 | #[arg(long = "warmup")]
159 | warmup_enabled: Option,
160 |
161 | /// Quiet mode
162 | #[arg(short = 'q')]
163 | quiet: bool,
164 |
165 | #[arg(short = 'v', long = "verbose", default_value_t = false)]
166 | verbose: bool,
167 | }
168 |
169 | #[derive(Parser, Debug)]
170 | #[command(author, version, about, long_about = None)]
171 | struct Opts {
172 | #[command(subcommand)]
173 | subcommand: Option,
174 |
175 | #[command(flatten)]
176 | bench_flags: CargoBenchFlags,
177 |
178 | #[arg(long = "color", default_value = "detect")]
179 | coloring_mode: String,
180 | }
181 |
182 | impl FromStr for SampleLengthKind {
183 | type Err = Error;
184 |
185 | fn from_str(s: &str) -> StdResult {
186 | match s {
187 | "flat" => Ok(SampleLengthKind::Flat),
188 | "linear" => Ok(SampleLengthKind::Linear),
189 | "random" => Ok(SampleLengthKind::Random),
190 | _ => Err(Error::UnknownSamplerType),
191 | }
192 | }
193 | }
194 |
195 | /// Definition of the flags required to comply with `cargo bench` calling conventions.
196 | #[derive(Parser, Debug, Clone)]
197 | struct CargoBenchFlags {
198 | #[arg(long = "bench", default_value_t = true)]
199 | bench: bool,
200 | }
201 |
202 | pub fn run(settings: MeasurementSettings) -> Result {
203 | let opts = Opts::parse();
204 |
205 | match Mode::from_str(&opts.coloring_mode) {
206 | Ok(coloring_mode) => mode::set_coloring_mode(coloring_mode),
207 | Err(_) => eprintln!("[WARN] Invalid coloring mode: {}", opts.coloring_mode),
208 | }
209 |
210 | let subcommand = opts.subcommand.unwrap_or(BenchmarkMode::List {
211 | bench_flags: opts.bench_flags,
212 | });
213 |
214 | match subcommand {
215 | BenchmarkMode::List { bench_flags: _ } => {
216 | let spi = Spi::for_self(SpiModeKind::Synchronous).ok_or(Error::SpiSelfWasMoved)?;
217 | for func in spi.tests() {
218 | println!("{}", func.name);
219 | }
220 | Ok(ExitCode::SUCCESS)
221 | }
222 | BenchmarkMode::Compare(opts) => paired_test::run_test(opts, settings),
223 | BenchmarkMode::Solo(opts) => solo_test::run_test(opts, settings),
224 | }
225 | }
226 |
227 | // Automatically removes a file when goes out of scope
228 | struct AutoDelete(PathBuf);
229 |
230 | impl std::ops::Deref for AutoDelete {
231 | type Target = PathBuf;
232 |
233 | fn deref(&self) -> &Self::Target {
234 | &self.0
235 | }
236 | }
237 |
238 | impl Drop for AutoDelete {
239 | fn drop(&mut self) {
240 | if let Err(e) = fs::remove_file(&self.0) {
241 | eprintln!("Failed to delete file {}: {}", self.0.display(), e);
242 | }
243 | }
244 | }
245 |
246 | fn create_loop_mode(samples: Option, time: Option) -> Result {
247 | let loop_mode = match (samples, time) {
248 | (Some(samples), None) => LoopMode::Samples(samples.into()),
249 | (None, Some(time)) => LoopMode::Time(Duration::from_millis((time * 1000.) as u64)),
250 | (None, None) => LoopMode::Time(Duration::from_millis(100)),
251 | (Some(_), Some(_)) => bail!("-t and -s are mutually exclusive"),
252 | };
253 | Ok(loop_mode)
254 | }
255 |
256 | #[derive(Clone, Copy)]
257 | enum LoopMode {
258 | Samples(usize),
259 | Time(Duration),
260 | }
261 |
262 | impl LoopMode {
263 | fn should_continue(&self, iter_no: usize, loop_time: Duration) -> bool {
264 | match self {
265 | LoopMode::Samples(samples) => iter_no < *samples,
266 | LoopMode::Time(duration) => loop_time < *duration,
267 | }
268 | }
269 | }
270 |
271 | mod solo_test {
272 | use super::*;
273 | use crate::{dylib::Spi, CacheFirewall, Summary};
274 | use alloca::with_alloca;
275 | use rand::{distributions, rngs::SmallRng, Rng, SeedableRng};
276 | use std::thread;
277 |
278 | pub(super) fn run_test(opts: SoloOpts, mut settings: MeasurementSettings) -> Result {
279 | let SoloOpts {
280 | bench_flags: _,
281 | quiet: _,
282 | verbose: _,
283 | filter,
284 | samples,
285 | time,
286 | seed,
287 | sampler,
288 | cache_firewall,
289 | yield_before_sample,
290 | warmup_enabled,
291 | randomize_stack,
292 | } = opts;
293 |
294 | let mut spi_self = Spi::for_self(SpiModeKind::Synchronous).ok_or(Error::SpiSelfWasMoved)?;
295 |
296 | settings.cache_firewall = cache_firewall;
297 | settings.randomize_stack = randomize_stack;
298 |
299 | if let Some(warmup_enabled) = warmup_enabled {
300 | settings.warmup_enabled = warmup_enabled;
301 | }
302 | if let Some(yield_before_sample) = yield_before_sample {
303 | settings.yield_before_sample = yield_before_sample;
304 | }
305 | if let Some(sampler) = sampler {
306 | settings.sampler_type = sampler;
307 | }
308 |
309 | let filter = filter.as_deref().unwrap_or("");
310 | let loop_mode = create_loop_mode(samples, time)?;
311 |
312 | let test_names = spi_self
313 | .tests()
314 | .iter()
315 | .map(|t| &t.name)
316 | .cloned()
317 | .collect::>();
318 | for func_name in test_names {
319 | if !filter.is_empty() && !glob_match(filter, &func_name) {
320 | continue;
321 | }
322 |
323 | let result = run_solo_test(&mut spi_self, &func_name, settings, seed, loop_mode)?;
324 |
325 | reporting::default_reporter_solo(&func_name, &result);
326 | }
327 |
328 | Ok(ExitCode::SUCCESS)
329 | }
330 |
331 | fn run_solo_test(
332 | spi: &mut Spi,
333 | test_name: &str,
334 | settings: MeasurementSettings,
335 | seed: Option,
336 | loop_mode: LoopMode,
337 | ) -> Result> {
338 | const TIME_SLICE_MS: u32 = 10;
339 |
340 | let firewall = settings
341 | .cache_firewall
342 | .map(|s| s * 1024)
343 | .map(CacheFirewall::new);
344 | let baseline_func = spi.lookup(test_name).ok_or(Error::InvalidTestName)?;
345 |
346 | let mut spi_func = TestedFunction::new(spi, baseline_func.idx);
347 |
348 | let seed = seed.unwrap_or_else(rand::random);
349 |
350 | spi_func.prepare_state(seed);
351 | let mut iterations_per_sample = (spi_func.estimate_iterations(TIME_SLICE_MS) / 2).max(1);
352 | let mut sampler = create_sampler(&settings, seed);
353 |
354 | let mut rng = SmallRng::seed_from_u64(seed);
355 | let stack_offset_distr = settings
356 | .randomize_stack
357 | .map(|offset| distributions::Uniform::new(0, offset));
358 |
359 | let mut i = 0;
360 |
361 | let mut sample_iterations = vec![];
362 |
363 | if let LoopMode::Samples(samples) = loop_mode {
364 | sample_iterations.reserve(samples);
365 | spi_func.samples.reserve(samples);
366 | }
367 |
368 | let mut loop_time = Duration::from_secs(0);
369 | let mut loop_iterations = 0;
370 | while loop_mode.should_continue(i, loop_time) {
371 | if loop_time > Duration::from_millis(100) {
372 | // correcting time slice estimates
373 | iterations_per_sample =
374 | loop_iterations * TIME_SLICE_MS as usize / loop_time.as_millis() as usize;
375 | }
376 | let iterations = sampler.next_sample_iterations(i, iterations_per_sample);
377 | loop_iterations += iterations;
378 | let warmup_iterations = settings.warmup_enabled.then(|| (iterations / 10).max(1));
379 |
380 | if settings.yield_before_sample {
381 | thread::yield_now();
382 | }
383 |
384 | let prepare_state_seed = (i % settings.samples_per_haystack == 0).then_some(seed);
385 |
386 | prepare_func(
387 | prepare_state_seed,
388 | &mut spi_func,
389 | warmup_iterations,
390 | firewall.as_ref(),
391 | );
392 |
393 | // Allocate a custom stack frame during runtime, to try to offset alignment of the stack.
394 | if let Some(distr) = stack_offset_distr {
395 | with_alloca(rng.sample(distr), |_| {
396 | spi_func.measure(iterations);
397 | });
398 | } else {
399 | spi_func.measure(iterations);
400 | }
401 |
402 | loop_time += Duration::from_nanos(spi_func.read_sample());
403 | sample_iterations.push(iterations);
404 | i += 1;
405 | }
406 |
407 | let samples = spi_func
408 | .samples
409 | .iter()
410 | .zip(sample_iterations.iter())
411 | .map(|(sample, iterations)| *sample as f64 / *iterations as f64)
412 | .collect::>();
413 | Ok(Summary::from(&samples).unwrap())
414 | }
415 | }
416 |
417 | mod paired_test {
418 | use super::*;
419 | use crate::{calculate_run_result, CacheFirewall, RunResult};
420 | use alloca::with_alloca;
421 | use fs::File;
422 | use rand::{distributions, rngs::SmallRng, Rng, SeedableRng};
423 | use std::{
424 | io::{self, BufWriter},
425 | mem, thread,
426 | };
427 |
428 | pub(super) fn run_test(
429 | opts: PairedOpts,
430 | mut settings: MeasurementSettings,
431 | ) -> Result {
432 | let PairedOpts {
433 | bench_flags: _,
434 | path,
435 | verbose,
436 | filter,
437 | samples,
438 | time,
439 | filter_outliers,
440 | path_to_dump,
441 | gnuplot,
442 | fail_threshold,
443 | fail_fast,
444 | significant_only,
445 | seed,
446 | sampler,
447 | cache_firewall,
448 | yield_before_sample,
449 | warmup_enabled,
450 | parallel,
451 | quiet,
452 | randomize_stack,
453 | } = opts;
454 | let mut path = path
455 | .or_else(|| args().next().map(PathBuf::from))
456 | .expect("No path given");
457 | if path.is_relative() {
458 | // Resolving paths relative to PWD if given
459 | if let Ok(pwd) = env::var("PWD") {
460 | path = PathBuf::from(pwd).join(path)
461 | }
462 | };
463 |
464 | #[cfg(target_os = "linux")]
465 | let path = crate::linux::patch_pie_binary_if_needed(&path)?.unwrap_or(path);
466 |
467 | let mode = if parallel {
468 | SpiModeKind::Asynchronous
469 | } else {
470 | SpiModeKind::Synchronous
471 | };
472 |
473 | let mut spi_self = Spi::for_self(mode).ok_or(Error::SpiSelfWasMoved)?;
474 | let mut spi_lib = Spi::for_library(path, mode);
475 |
476 | settings.filter_outliers = filter_outliers;
477 | settings.cache_firewall = cache_firewall;
478 | settings.randomize_stack = randomize_stack;
479 |
480 | if let Some(warmup_enabled) = warmup_enabled {
481 | settings.warmup_enabled = warmup_enabled;
482 | }
483 | if let Some(yield_before_sample) = yield_before_sample {
484 | settings.yield_before_sample = yield_before_sample;
485 | }
486 | if let Some(sampler) = sampler {
487 | settings.sampler_type = sampler;
488 | }
489 |
490 | let filter = filter.as_deref().unwrap_or("");
491 | let loop_mode = create_loop_mode(samples, time)?;
492 |
493 | let mut exit_code = ExitCode::SUCCESS;
494 |
495 | if let Some(path) = &path_to_dump {
496 | if !path.exists() {
497 | fs::create_dir_all(path)?;
498 | }
499 | }
500 | if gnuplot && path_to_dump.is_none() {
501 | eprintln!("warn: --gnuplot requires -d to be specified. No plots will be generated")
502 | }
503 |
504 | let mut sample_dumps = vec![];
505 |
506 | let test_names = spi_self
507 | .tests()
508 | .iter()
509 | .map(|t| &t.name)
510 | .cloned()
511 | .collect::>();
512 | for func_name in test_names {
513 | if !filter.is_empty() && !glob_match(filter, &func_name) {
514 | continue;
515 | }
516 |
517 | if spi_lib.lookup(&func_name).is_none() {
518 | if !quiet {
519 | writeln!(stderr(), "{} skipped...", &func_name)?;
520 | }
521 | continue;
522 | }
523 |
524 | let (result, sample_dump) = run_paired_test(
525 | &mut spi_lib,
526 | &mut spi_self,
527 | &func_name,
528 | settings,
529 | seed,
530 | loop_mode,
531 | path_to_dump.as_ref(),
532 | )?;
533 |
534 | if let Some(dump) = sample_dump {
535 | sample_dumps.push(dump);
536 | }
537 |
538 | if result.diff_estimate.significant || !significant_only {
539 | if verbose {
540 | reporting::verbose_reporter(&result);
541 | } else {
542 | reporting::default_reporter(&result);
543 | }
544 | }
545 |
546 | if result.diff_estimate.significant {
547 | if let Some(threshold) = fail_threshold {
548 | if result.diff_estimate.pct >= threshold {
549 | eprintln!(
550 | "[ERROR] Performance regressed {:+.1}% >= {:.1}% - test: {}",
551 | result.diff_estimate.pct, threshold, func_name
552 | );
553 | if fail_fast {
554 | return Ok(ExitCode::FAILURE);
555 | } else {
556 | exit_code = ExitCode::FAILURE;
557 | }
558 | }
559 | }
560 | }
561 | }
562 |
563 | if let Some(path_to_dump) = path_to_dump {
564 | if gnuplot && !sample_dumps.is_empty() {
565 | generate_plots(&path_to_dump, sample_dumps.as_slice())?;
566 | }
567 | }
568 |
569 | Ok(exit_code)
570 | }
571 |
572 | /// Measure the difference in performance of two functions
573 | ///
574 | /// Provides a way to save a raw dump of measurements into directory
575 | ///
576 | /// The format is as follows
577 | /// ```txt
578 | /// b_1,c_1
579 | /// b_2,c_2
580 | /// ...
581 | /// b_n,c_n
582 | /// ```
583 | /// where `b_1..b_n` are baseline absolute time (in nanoseconds) measurements
584 | /// and `c_1..c_n` are candidate time measurements
585 | ///
586 | /// Returns a statistical results of a test run and path to raw samples of sample dump was requested
587 | fn run_paired_test(
588 | baseline: &mut Spi,
589 | candidate: &mut Spi,
590 | test_name: &str,
591 | settings: MeasurementSettings,
592 | seed: Option,
593 | loop_mode: LoopMode,
594 | samples_dump_path: Option<&PathBuf>,
595 | ) -> Result<(RunResult, Option)> {
596 | const TIME_SLICE_MS: u32 = 10;
597 |
598 | let firewall = settings
599 | .cache_firewall
600 | .map(|s| s * 1024)
601 | .map(CacheFirewall::new);
602 | let baseline_func = baseline.lookup(test_name).ok_or(Error::InvalidTestName)?;
603 | let candidate_func = candidate.lookup(test_name).ok_or(Error::InvalidTestName)?;
604 |
605 | let mut baseline = TestedFunction::new(baseline, baseline_func.idx);
606 | let mut candidate = TestedFunction::new(candidate, candidate_func.idx);
607 |
608 | let mut a_func = &mut baseline;
609 | let mut b_func = &mut candidate;
610 |
611 | let seed = seed.unwrap_or_else(rand::random);
612 |
613 | a_func.prepare_state(seed);
614 | let a_estimate = (a_func.estimate_iterations(TIME_SLICE_MS) / 2).max(1);
615 |
616 | b_func.prepare_state(seed);
617 | let b_estimate = (b_func.estimate_iterations(TIME_SLICE_MS) / 2).max(1);
618 |
619 | let mut iterations_per_sample = a_estimate.min(b_estimate);
620 | let mut sampler = create_sampler(&settings, seed);
621 |
622 | let mut rng = SmallRng::seed_from_u64(seed);
623 | let stack_offset_distr = settings
624 | .randomize_stack
625 | .map(|offset| distributions::Uniform::new(0, offset));
626 |
627 | let mut i = 0;
628 | let mut switch_counter = 0;
629 |
630 | let mut sample_iterations = vec![];
631 |
632 | if let LoopMode::Samples(samples) = loop_mode {
633 | sample_iterations.reserve(samples);
634 | a_func.samples.reserve(samples);
635 | b_func.samples.reserve(samples);
636 | }
637 |
638 | let mut loop_time = Duration::from_secs(0);
639 | let mut loop_iterations = 0;
640 | while loop_mode.should_continue(i, loop_time) {
641 | if loop_time > Duration::from_millis(100) {
642 | // correcting time slice estimates
643 | iterations_per_sample =
644 | loop_iterations * TIME_SLICE_MS as usize / loop_time.as_millis() as usize;
645 | }
646 | let iterations = sampler.next_sample_iterations(i, iterations_per_sample);
647 | loop_iterations += iterations;
648 | let warmup_iterations = settings.warmup_enabled.then(|| (iterations / 10).max(1));
649 |
650 | // !!! IMPORTANT !!!
651 | // Algorithms should be called in different order on each new iteration.
652 | // This equalize the probability of facing unfortunate circumstances like cache misses or page faults
653 | // for both functions. Although both algorithms are from distinct shared objects and therefore
654 | // must be fully self-contained in terms of virtual address space (each shared object has its own
655 | // generator instances, static variables, memory mappings, etc.) it might be the case that
656 | // on the level of physical memory both of them rely on the same memory-mapped test data, for example.
657 | // In that case first function will experience the larger amount of major page faults.
658 | {
659 | mem::swap(&mut a_func, &mut b_func);
660 | switch_counter += 1;
661 | }
662 |
663 | if settings.yield_before_sample {
664 | thread::yield_now();
665 | }
666 |
667 | let prepare_state_seed = (i % settings.samples_per_haystack == 0).then_some(seed);
668 | let mut sample_time = 0;
669 |
670 | prepare_func(
671 | prepare_state_seed,
672 | a_func,
673 | warmup_iterations,
674 | firewall.as_ref(),
675 | );
676 | prepare_func(
677 | prepare_state_seed,
678 | b_func,
679 | warmup_iterations,
680 | firewall.as_ref(),
681 | );
682 |
683 | // Allocate a custom stack frame during runtime, to try to offset alignment of the stack.
684 | if let Some(distr) = stack_offset_distr {
685 | with_alloca(rng.sample(distr), |_| {
686 | a_func.measure(iterations);
687 | b_func.measure(iterations);
688 | });
689 | } else {
690 | a_func.measure(iterations);
691 | b_func.measure(iterations);
692 | }
693 |
694 | let a_sample_time = a_func.read_sample();
695 | let b_sample_time = b_func.read_sample();
696 | sample_time += a_sample_time.max(b_sample_time);
697 |
698 | loop_time += Duration::from_nanos(sample_time);
699 | sample_iterations.push(iterations);
700 | i += 1;
701 | }
702 |
703 | // If we switched functions odd number of times then we need to swap them back so that
704 | // the first function is always the baseline.
705 | if switch_counter % 2 != 0 {
706 | mem::swap(&mut a_func, &mut b_func);
707 | }
708 |
709 | let run_result = calculate_run_result(
710 | test_name,
711 | &a_func.samples,
712 | &b_func.samples,
713 | &sample_iterations,
714 | settings.filter_outliers,
715 | )
716 | .ok_or(Error::NoMeasurements)?;
717 |
718 | let samples_path = if let Some(path) = samples_dump_path {
719 | let file_path = write_samples(path, test_name, a_func, b_func, sample_iterations)?;
720 | Some(file_path)
721 | } else {
722 | None
723 | };
724 |
725 | Ok((run_result, samples_path))
726 | }
727 |
728 | fn write_samples(
729 | path: &Path,
730 | test_name: &str,
731 | a_func: &TestedFunction,
732 | b_func: &TestedFunction,
733 | iterations: Vec,
734 | ) -> Result {
735 | let file_name = format!("{}.csv", test_name.replace('/', "-"));
736 | let file_path = path.join(file_name);
737 | let s_samples = a_func.samples.iter().copied();
738 | let b_samples = b_func.samples.iter().copied();
739 | let values = s_samples
740 | .zip(b_samples)
741 | .zip(iterations.iter().copied())
742 | .map(|((a, b), c)| (a, b, c));
743 | write_csv(&file_path, values).context("Unable to write raw measurements")?;
744 | Ok(file_path)
745 | }
746 |
747 | fn write_csv(
748 | path: impl AsRef,
749 | values: impl IntoIterator- ,
750 | ) -> io::Result<()> {
751 | let mut file = BufWriter::new(File::create(path)?);
752 | for (a, b, c) in values {
753 | writeln!(&mut file, "{},{},{}", a, b, c)?;
754 | }
755 | Ok(())
756 | }
757 |
    /// Renders an SVG plot next to each CSV sample dump by invoking the
    /// external `gnuplot` binary with the bundled `plot.gnuplot` script.
    ///
    /// Fails if `gnuplot` is not installed or exits with a non-zero status.
    /// NOTE(review): the `path` parameter is currently unused — the SVG is
    /// placed next to each input file instead; confirm whether that is intended.
    fn generate_plots(path: &Path, sample_dumps: &[PathBuf]) -> Result<()> {
        // The script is embedded in the binary; materialize it into a temp
        // file (auto-deleted on drop) so gnuplot can read it.
        let gnuplot_file = AutoDelete(temp_dir().join("tango-plot.gnuplot"));
        fs::write(&*gnuplot_file, include_bytes!("plot.gnuplot"))?;
        let gnuplot_file_str = gnuplot_file.to_str().unwrap();

        for input in sample_dumps {
            let csv_input = input.to_str().unwrap();
            // SVG output sits next to the CSV it was generated from.
            let svg_path = input.with_extension("svg");
            let cmd = Command::new("gnuplot")
                .args([
                    "-c",
                    gnuplot_file_str,
                    csv_input,
                    svg_path.to_str().unwrap(),
                ])
                .stdin(Stdio::null())
                .stdout(Stdio::inherit())
                .stderr(Stdio::inherit())
                .status()
                .context("Failed to execute gnuplot")?;

            if !cmd.success() {
                bail!("gnuplot command failed");
            }
        }
        Ok(())
    }
785 | }
786 |
787 | mod reporting {
788 | use crate::cli::{colorize, HumanTime};
789 | use crate::{RunResult, Summary};
790 | use colorz::{mode::Stream, Colorize};
791 |
    /// Prints a multi-line table for one paired run: mean/min/max/std.dev of
    /// baseline and candidate plus their difference.
    ///
    /// Colored deltas are only painted when the difference is statistically
    /// significant; a trailing `*` marks significance on the mean row.
    pub(super) fn verbose_reporter(results: &RunResult) {
        let base = results.baseline;
        let candidate = results.candidate;

        let significant = results.diff_estimate.significant;

        // Header: test name, number of samples and filtered outliers.
        println!(
            "{} (n: {}, outliers: {})",
            results.name.bold().stream(Stream::Stdout),
            results.diff.n,
            results.outliers
        );

        println!(
            "  {:12} {:>15} {:>15} {:>15}",
            "",
            "baseline".bold().stream(Stream::Stdout),
            "candidate".bold().stream(Stream::Stdout),
            "∆".bold().stream(Stream::Stdout),
        );
        println!(
            "  {:12} ╭────────────────────────────────────────────────",
            ""
        );
        // Mean row: green when the candidate is faster (negative diff).
        println!(
            "  {:12} │ {:>15} {:>15} {:>15} {:+4.2}{}{}",
            "mean",
            HumanTime(base.mean),
            HumanTime(candidate.mean),
            colorize(
                HumanTime(results.diff.mean),
                significant,
                results.diff.mean < 0.
            ),
            colorize(
                results.diff_estimate.pct,
                significant,
                results.diff.mean < 0.
            ),
            colorize("%", significant, results.diff.mean < 0.),
            if significant { "*" } else { "" },
        );
        println!(
            "  {:12} │ {:>15} {:>15} {:>15}",
            "min",
            HumanTime(base.min),
            HumanTime(candidate.min),
            HumanTime(candidate.min - base.min)
        );
        println!(
            "  {:12} │ {:>15} {:>15} {:>15}",
            "max",
            HumanTime(base.max),
            HumanTime(candidate.max),
            HumanTime(candidate.max - base.max),
        );
        println!(
            "  {:12} │ {:>15} {:>15} {:>15}",
            "std. dev.",
            HumanTime(base.variance.sqrt()),
            HumanTime(candidate.variance.sqrt()),
            HumanTime(results.diff.variance.sqrt()),
        );
        println!();
    }
857 |
858 | pub(super) fn default_reporter(results: &RunResult) {
859 | let base = results.baseline;
860 | let candidate = results.candidate;
861 | let diff = results.diff;
862 |
863 | let significant = results.diff_estimate.significant;
864 |
865 | let speedup = results.diff_estimate.pct;
866 | let candidate_faster = diff.mean < 0.;
867 | println!(
868 | "{:50} [ {:>8} ... {:>8} ] {:>+7.2}{}{}",
869 | colorize(&results.name, significant, candidate_faster),
870 | HumanTime(base.mean),
871 | colorize(HumanTime(candidate.mean), significant, candidate_faster),
872 | colorize(speedup, significant, candidate_faster),
873 | colorize("%", significant, candidate_faster),
874 | if significant { "*" } else { "" },
875 | )
876 | }
877 |
878 | pub(super) fn default_reporter_solo(name: &str, results: &Summary) {
879 | println!(
880 | "{:50} [ {:>8} ... {:>8} ... {:>8} ] stddev: {:>8}",
881 | name,
882 | HumanTime(results.min),
883 | HumanTime(results.mean),
884 | HumanTime(results.max),
885 | HumanTime(results.variance.sqrt()),
886 | )
887 | }
888 | }
889 |
890 | struct TestedFunction<'a> {
891 | pub(crate) spi: &'a mut Spi,
892 | pub(crate) samples: Vec,
893 | }
894 |
impl<'a> TestedFunction<'a> {
    /// Selects `func` in the given SPI and starts with an empty sample log.
    pub(crate) fn new(spi: &'a mut Spi, func: FunctionIdx) -> Self {
        spi.select(func);
        TestedFunction {
            spi,
            samples: Vec::new(),
        }
    }

    /// Starts a measurement of `iterations` iterations. The result is
    /// fetched later with [`Self::read_sample`] (which may block when the
    /// SPI runs in asynchronous mode).
    pub(crate) fn measure(&mut self, iterations: usize) {
        self.spi.measure(iterations);
    }

    /// Fetches the last measurement (nanoseconds), records it in `samples`
    /// and returns it.
    pub(crate) fn read_sample(&mut self) -> u64 {
        let sample = self.spi.read_sample();
        self.samples.push(sample);
        sample
    }

    /// Runs the function without recording a sample (used for warm-up).
    pub(crate) fn run(&mut self, iterations: usize) -> u64 {
        self.spi.run(iterations)
    }

    /// (Re)creates the benchmark state from the given seed.
    pub(crate) fn prepare_state(&mut self, seed: u64) {
        self.spi.prepare_state(seed);
    }

    /// Estimates how many iterations fit into `time_ms` milliseconds.
    pub(crate) fn estimate_iterations(&mut self, time_ms: u32) -> usize {
        self.spi.estimate_iterations(time_ms)
    }
}
926 |
927 | fn prepare_func(
928 | prepare_state_seed: Option,
929 | f: &mut TestedFunction,
930 | warmup_iterations: Option,
931 | firewall: Option<&CacheFirewall>,
932 | ) {
933 | if let Some(seed) = prepare_state_seed {
934 | f.prepare_state(seed);
935 | if let Some(firewall) = firewall {
936 | firewall.issue_read();
937 | }
938 | }
939 | if let Some(warmup_iterations) = warmup_iterations {
940 | f.run(warmup_iterations);
941 | }
942 | }
943 |
944 | fn create_sampler(settings: &MeasurementSettings, seed: u64) -> Box {
945 | match settings.sampler_type {
946 | SampleLengthKind::Flat => Box::new(FlatSampleLength::new(settings)),
947 | SampleLengthKind::Linear => Box::new(LinearSampleLength::new(settings)),
948 | SampleLengthKind::Random => Box::new(RandomSampleLength::new(settings, seed)),
949 | }
950 | }
951 |
952 | fn colorize(value: T, do_paint: bool, is_improved: bool) -> impl Display {
953 | use colorz::{ansi, mode::Stream::Stdout, Colorize, Style};
954 |
955 | const RED: Style = Style::new().fg(ansi::Red).const_into_runtime_style();
956 | const GREEN: Style = Style::new().fg(ansi::Green).const_into_runtime_style();
957 | const DEFAULT: Style = Style::new().const_into_runtime_style();
958 |
959 | if do_paint {
960 | if is_improved {
961 | value.into_style_with(GREEN).stream(Stdout)
962 | } else {
963 | value.into_style_with(RED).stream(Stdout)
964 | }
965 | } else {
966 | value.into_style_with(DEFAULT).stream(Stdout)
967 | }
968 | }
969 |
/// A nanosecond quantity rendered with a human-friendly unit (ns/us/ms/s).
/// Honors width/alignment flags via `Formatter::pad`.
struct HumanTime(f64);

impl fmt::Display for HumanTime {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        const USEC: f64 = 1_000.;
        const MSEC: f64 = USEC * 1_000.;
        const SEC: f64 = MSEC * 1_000.;

        let ns = self.0;
        // Pick the unit from the magnitude so negative values format the same.
        let rendered = if ns.abs() > SEC {
            format!("{:.1} s", ns / SEC)
        } else if ns.abs() > MSEC {
            format!("{:.1} ms", ns / MSEC)
        } else if ns.abs() > USEC {
            format!("{:.1} us", ns / USEC)
        } else if ns == 0. {
            String::from("0 ns")
        } else {
            format!("{:.1} ns", ns)
        };
        // `pad` applies the caller's width/alignment to the whole string.
        f.pad(&rendered)
    }
}
991 |
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the `HumanTime` unit thresholds, zero special-case, padding and
    /// negative-value formatting.
    #[test]
    fn check_human_time() {
        assert_eq!(format!("{}", HumanTime(0.1)), "0.1 ns");
        assert_eq!(format!("{:>5}", HumanTime(0.)), " 0 ns");

        assert_eq!(format!("{}", HumanTime(120.)), "120.0 ns");

        assert_eq!(format!("{}", HumanTime(1200.)), "1.2 us");

        assert_eq!(format!("{}", HumanTime(1200000.)), "1.2 ms");

        assert_eq!(format!("{}", HumanTime(1200000000.)), "1.2 s");

        assert_eq!(format!("{}", HumanTime(-1200000.)), "-1.2 ms");
    }

    // Sanity checking some simple patterns
    #[test]
    fn check_glob() {
        let patterns = vec!["a/*/*", "a/**", "*/32/*", "**/b", "a/{32,64}/*"];
        let input = "a/32/b";
        for pattern in patterns {
            assert!(
                glob_match(pattern, input),
                "failed to match {} against {}",
                pattern,
                input
            );
        }
    }
}
1027 |
--------------------------------------------------------------------------------
/tango-bench/src/dylib.rs:
--------------------------------------------------------------------------------
1 | //! Loading and resolving symbols from .dylib/.so libraries
2 |
3 | use self::ffi::{VTable, SELF_VTABLE};
4 | use crate::{Benchmark, ErasedSampler, Error};
5 | use anyhow::Context;
6 | use libloading::{Library, Symbol};
7 | use std::{
8 | ffi::{c_char, c_ulonglong},
9 | path::Path,
10 | ptr::{addr_of, null},
11 | slice, str,
12 | sync::mpsc::{channel, Receiver, Sender},
13 | thread::{self, JoinHandle},
14 | };
15 |
/// Index of a benchmark function within the FFI API of a single binary.
pub type FunctionIdx = usize;

#[derive(Debug, Clone)]
pub struct NamedFunction {
    // Name reported by the binary via `tango_get_test_name`.
    pub name: String,

    /// Function index in FFI API
    pub idx: FunctionIdx,
}
25 |
26 | pub(crate) struct Spi {
27 | tests: Vec,
28 | selected_function: Option,
29 | mode: SpiMode,
30 | }
31 |
#[derive(PartialEq, Eq, Clone, Copy)]
pub enum SpiModeKind {
    /// Benchmarks are executed synchronously when calling SPI
    ///
    /// Dispatcher switches between baseline and candidate after each sample
    Synchronous,

    /// Benchmarks are executed in different threads
    ///
    /// Dispatcher creates a separate thread for baseline and candidate, but synchronizes them after each benchmark
    Asynchronous,
}
44 |
45 | enum SpiMode {
46 | Synchronous {
47 | vt: Box,
48 | last_measurement: u64,
49 | },
50 | Asynchronous {
51 | worker: Option>,
52 | tx: Sender,
53 | rx: Receiver,
54 | },
55 | }
56 |
57 | impl Spi {
58 | pub(crate) fn for_library(path: impl AsRef, mode: SpiModeKind) -> Spi {
59 | let lib = unsafe { Library::new(path.as_ref()) }
60 | .with_context(|| format!("Unable to open library: {}", path.as_ref().display()))
61 | .unwrap();
62 | spi_handle_for_vtable(ffi::LibraryVTable::new(lib).unwrap(), mode)
63 | }
64 |
65 | pub(crate) fn for_self(mode: SpiModeKind) -> Option {
66 | unsafe { SELF_VTABLE.take() }.map(|vt| spi_handle_for_vtable(vt, mode))
67 | }
68 |
69 | pub(crate) fn tests(&self) -> &[NamedFunction] {
70 | &self.tests
71 | }
72 |
73 | pub(crate) fn lookup(&self, name: &str) -> Option<&NamedFunction> {
74 | self.tests.iter().find(|f| f.name == name)
75 | }
76 |
77 | pub(crate) fn run(&mut self, iterations: usize) -> u64 {
78 | match &self.mode {
79 | SpiMode::Synchronous { vt, .. } => vt.run(iterations as c_ulonglong),
80 | SpiMode::Asynchronous { worker: _, tx, rx } => {
81 | tx.send(SpiRequest::Run { iterations }).unwrap();
82 | match rx.recv().unwrap() {
83 | SpiReply::Run(time) => time,
84 | r => panic!("Unexpected response: {:?}", r),
85 | }
86 | }
87 | }
88 | }
89 |
90 | pub(crate) fn measure(&mut self, iterations: usize) {
91 | match &mut self.mode {
92 | SpiMode::Synchronous {
93 | vt,
94 | last_measurement,
95 | } => {
96 | *last_measurement = vt.run(iterations as c_ulonglong);
97 | }
98 | SpiMode::Asynchronous { tx, .. } => {
99 | tx.send(SpiRequest::Measure { iterations }).unwrap();
100 | }
101 | }
102 | }
103 |
104 | pub(crate) fn read_sample(&mut self) -> u64 {
105 | match &self.mode {
106 | SpiMode::Synchronous {
107 | last_measurement, ..
108 | } => *last_measurement,
109 | SpiMode::Asynchronous { rx, .. } => match rx.recv().unwrap() {
110 | SpiReply::Measure(time) => time,
111 | r => panic!("Unexpected response: {:?}", r),
112 | },
113 | }
114 | }
115 |
116 | pub(crate) fn estimate_iterations(&mut self, time_ms: u32) -> usize {
117 | match &self.mode {
118 | SpiMode::Synchronous { vt, .. } => vt.estimate_iterations(time_ms) as usize,
119 | SpiMode::Asynchronous { tx, rx, .. } => {
120 | tx.send(SpiRequest::EstimateIterations { time_ms }).unwrap();
121 | match rx.recv().unwrap() {
122 | SpiReply::EstimateIterations(iters) => iters,
123 | r => panic!("Unexpected response: {:?}", r),
124 | }
125 | }
126 | }
127 | }
128 |
129 | pub(crate) fn prepare_state(&mut self, seed: u64) {
130 | match &self.mode {
131 | SpiMode::Synchronous { vt, .. } => vt.prepare_state(seed),
132 | SpiMode::Asynchronous { tx, rx, .. } => {
133 | tx.send(SpiRequest::PrepareState { seed }).unwrap();
134 | match rx.recv().unwrap() {
135 | SpiReply::PrepareState => {}
136 | r => panic!("Unexpected response: {:?}", r),
137 | }
138 | }
139 | }
140 | }
141 |
142 | pub(crate) fn select(&mut self, idx: usize) {
143 | match &self.mode {
144 | SpiMode::Synchronous { vt, .. } => vt.select(idx as c_ulonglong),
145 | SpiMode::Asynchronous { tx, rx, .. } => {
146 | tx.send(SpiRequest::Select { idx }).unwrap();
147 | match rx.recv().unwrap() {
148 | SpiReply::Select => self.selected_function = Some(idx),
149 | r => panic!("Unexpected response: {:?}", r),
150 | }
151 | }
152 | }
153 | }
154 | }
155 |
impl Drop for Spi {
    /// Shuts the asynchronous worker thread down gracefully: sends
    /// `Shutdown` and joins. Synchronous mode needs no cleanup.
    fn drop(&mut self) {
        if let SpiMode::Asynchronous { worker, tx, .. } = &mut self.mode {
            // `take()` guards against a double join.
            if let Some(worker) = worker.take() {
                tx.send(SpiRequest::Shutdown).unwrap();
                worker.join().unwrap();
            }
        }
    }
}
166 |
167 | fn spi_worker(vt: &dyn VTable, rx: Receiver, tx: Sender) {
168 | use SpiReply as Rp;
169 | use SpiRequest as Rq;
170 |
171 | while let Ok(req) = rx.recv() {
172 | let reply = match req {
173 | Rq::EstimateIterations { time_ms } => {
174 | Rp::EstimateIterations(vt.estimate_iterations(time_ms) as usize)
175 | }
176 | Rq::PrepareState { seed } => {
177 | vt.prepare_state(seed);
178 | Rp::PrepareState
179 | }
180 | Rq::Select { idx } => {
181 | vt.select(idx as c_ulonglong);
182 | Rp::Select
183 | }
184 | Rq::Run { iterations } => Rp::Run(vt.run(iterations as c_ulonglong)),
185 | Rq::Measure { iterations } => Rp::Measure(vt.run(iterations as c_ulonglong)),
186 | Rq::Shutdown => break,
187 | };
188 | tx.send(reply).unwrap();
189 | }
190 | }
191 |
/// Builds an [`Spi`] over the given vtable for the requested mode.
///
/// Always calls `init()` and enumerates the test list eagerly. In
/// asynchronous mode the vtable is moved into a dedicated worker thread and
/// all further calls are proxied through a pair of channels.
fn spi_handle_for_vtable(vtable: impl VTable + Send + 'static, mode: SpiModeKind) -> Spi {
    vtable.init();
    let tests = enumerate_tests(&vtable).unwrap();

    match mode {
        SpiModeKind::Asynchronous => {
            // Request channel (main -> worker) and reply channel (worker -> main).
            let (request_tx, request_rx) = channel();
            let (reply_tx, reply_rx) = channel();
            let worker = thread::spawn(move || {
                spi_worker(&vtable, request_rx, reply_tx);
            });

            Spi {
                tests,
                selected_function: None,
                mode: SpiMode::Asynchronous {
                    worker: Some(worker),
                    tx: request_tx,
                    rx: reply_rx,
                },
            }
        }
        SpiModeKind::Synchronous => Spi {
            tests,
            selected_function: None,
            mode: SpiMode::Synchronous {
                vt: Box::new(vtable),
                last_measurement: 0,
            },
        },
    }
}
224 |
225 | fn enumerate_tests(vt: &dyn VTable) -> Result, Error> {
226 | let mut tests = vec![];
227 | for idx in 0..vt.count() {
228 | vt.select(idx);
229 |
230 | let mut length = 0;
231 | let name_ptr: *const c_char = null();
232 | vt.get_test_name(addr_of!(name_ptr) as _, &mut length);
233 | if length == 0 {
234 | continue;
235 | }
236 | let slice = unsafe { slice::from_raw_parts(name_ptr as *const u8, length as usize) };
237 | let name = str::from_utf8(slice)
238 | .map_err(Error::InvalidFFIString)?
239 | .to_string();
240 | let idx = idx as usize;
241 | tests.push(NamedFunction { name, idx });
242 | }
243 | Ok(tests)
244 | }
245 |
/// Requests sent from the dispatcher to the asynchronous SPI worker.
/// Each variant (except `Shutdown`) has a matching [`SpiReply`].
enum SpiRequest {
    EstimateIterations { time_ms: u32 },
    PrepareState { seed: u64 },
    Select { idx: usize },
    Run { iterations: usize },
    Measure { iterations: usize },
    Shutdown,
}
254 |
/// Replies from the asynchronous SPI worker, one per [`SpiRequest`].
/// `Run`/`Measure` carry the elapsed time in nanoseconds.
#[derive(Debug)]
enum SpiReply {
    EstimateIterations(usize),
    PrepareState,
    Select,
    Run(u64),
    Measure(u64),
}
263 |
264 | /// State which holds the information about list of benchmarks and which one is selected.
265 | /// Used in FFI API (`tango_*` functions).
266 | struct State {
267 | benchmarks: Vec,
268 | selected_function: Option<(usize, Option>)>,
269 | }
270 |
271 | impl State {
272 | fn selected(&self) -> &Benchmark {
273 | &self.benchmarks[self.ensure_selected()]
274 | }
275 |
276 | fn ensure_selected(&self) -> usize {
277 | self.selected_function
278 | .as_ref()
279 | .map(|(idx, _)| *idx)
280 | .expect("No function was selected. Call tango_select() first")
281 | }
282 |
283 | fn selected_state_mut(&mut self) -> Option<&mut Box> {
284 | self.selected_function
285 | .as_mut()
286 | .and_then(|(_, state)| state.as_mut())
287 | }
288 | }
289 |
290 | /// Global state of the benchmarking library
291 | static mut STATE: Option = None;
292 |
293 | /// `tango_init()` implementation
294 | ///
295 | /// This function is not exported from the library, but is used by the `tango_init()` functions
296 | /// generated by the `tango_benchmark!()` macro.
297 | pub fn __tango_init(benchmarks: Vec) {
298 | unsafe {
299 | if STATE.is_none() {
300 | STATE = Some(State {
301 | benchmarks,
302 | selected_function: None,
303 | });
304 | }
305 | }
306 | }
307 |
308 | /// Defines all the foundation types and exported symbols for the FFI communication API between two
309 | /// executables.
310 | ///
311 | /// Tango execution model implies simultaneous execution of the code from two binaries. To achieve that
312 | /// Tango benchmark is compiled in a way that executable is also a shared library (.dll, .so, .dylib). This
313 | /// way two executables can coexist in the single process at the same time.
314 | pub mod ffi {
315 | use super::*;
316 | use std::{
317 | ffi::{c_uint, c_ulonglong},
318 | mem,
319 | os::raw::c_char,
320 | ptr::null,
321 | };
322 |
    /// Signature types of all FFI API functions
    pub type InitFn = unsafe extern "C" fn();
    type CountFn = unsafe extern "C" fn() -> c_ulonglong;
    // Out-params: pointer to the name bytes and the name length.
    type GetTestNameFn = unsafe extern "C" fn(*mut *const c_char, *mut c_ulonglong);
    type SelectFn = unsafe extern "C" fn(c_ulonglong);
    type RunFn = unsafe extern "C" fn(c_ulonglong) -> u64;
    type EstimateIterationsFn = unsafe extern "C" fn(c_uint) -> c_ulonglong;
    type PrepareStateFn = unsafe extern "C" fn(c_ulonglong);
    type FreeFn = unsafe extern "C" fn();
332 |
    /// This block of constants is checking that all exported tango functions are of valid type according to the API.
    /// Those constants are not meant to be used at runtime in any way
    #[allow(unused)]
    mod type_check {
        use super::*;

        // A mismatch between any `tango_*` symbol and its declared FFI
        // signature becomes a compile error here.
        const TANGO_COUNT: CountFn = tango_count;
        const TANGO_SELECT: SelectFn = tango_select;
        const TANGO_GET_TEST_NAME: GetTestNameFn = tango_get_test_name;
        const TANGO_RUN: RunFn = tango_run;
        const TANGO_ESTIMATE_ITERATIONS: EstimateIterationsFn = tango_estimate_iterations;
        const TANGO_FREE: FreeFn = tango_free;
    }
346 |
    /// `tango_count()` — number of registered benchmarks.
    /// Returns 0 when the library was not initialized.
    #[no_mangle]
    unsafe extern "C" fn tango_count() -> c_ulonglong {
        STATE
            .as_ref()
            .map(|s| s.benchmarks.len() as c_ulonglong)
            .unwrap_or(0)
    }
354 |
    /// `tango_select()` — selects the benchmark at `idx`.
    ///
    /// Panics (via `assert!`) on an out-of-range index. No-op when the
    /// library was not initialized.
    #[no_mangle]
    unsafe extern "C" fn tango_select(idx: c_ulonglong) {
        if let Some(s) = STATE.as_mut() {
            let idx = idx as usize;
            assert!(idx < s.benchmarks.len());

            s.selected_function = Some(match s.selected_function.take() {
                // Preserving state if the same function is selected
                Some((selected, state)) if selected == idx => (selected, state),
                _ => (idx, None),
            });
        }
    }
368 |
    /// `tango_get_test_name()` — writes a pointer to the selected benchmark's
    /// name bytes and its length into the out-params.
    ///
    /// Writes null/0 when the library was not initialized. Panics if no
    /// function was selected.
    #[no_mangle]
    unsafe extern "C" fn tango_get_test_name(name: *mut *const c_char, length: *mut c_ulonglong) {
        if let Some(s) = STATE.as_ref() {
            let n = s.selected().name();
            *name = n.as_ptr() as _;
            *length = n.len() as c_ulonglong;
        } else {
            *name = null();
            *length = 0;
        }
    }
380 |
381 | #[no_mangle]
382 | unsafe extern "C" fn tango_run(iterations: c_ulonglong) -> u64 {
383 | if let Some(s) = STATE.as_mut() {
384 | s.selected_state_mut()
385 | .expect("no tango_prepare_state() was called")
386 | .measure(iterations as usize)
387 | } else {
388 | 0
389 | }
390 | }
391 |
392 | #[no_mangle]
393 | unsafe extern "C" fn tango_estimate_iterations(time_ms: c_uint) -> c_ulonglong {
394 | if let Some(s) = STATE.as_mut() {
395 | s.selected_state_mut()
396 | .expect("no tango_prepare_state() was called")
397 | .as_mut()
398 | .estimate_iterations(time_ms) as c_ulonglong
399 | } else {
400 | 0
401 | }
402 | }
403 |
404 | #[no_mangle]
405 | unsafe extern "C" fn tango_prepare_state(seed: c_ulonglong) {
406 | if let Some(s) = STATE.as_mut() {
407 | let Some((idx, state)) = &mut s.selected_function else {
408 | panic!("No tango_select() was called")
409 | };
410 | *state = Some(s.benchmarks[*idx].prepare_state(seed));
411 | }
412 | }
413 |
414 | #[no_mangle]
415 | unsafe extern "C" fn tango_free() {
416 | STATE.take();
417 | }
418 |
/// Unified interface over the `tango_*` FFI functions, whether they live in the
/// current executable ([`SelfVTable`]) or in a dynamically loaded library
/// ([`LibraryVTable`]).
pub(super) trait VTable {
    /// Initializes the harness on the other side of the FFI boundary.
    fn init(&self);
    /// Number of registered benchmarks.
    fn count(&self) -> c_ulonglong;
    /// Selects the benchmark with the given index as current.
    fn select(&self, func_idx: c_ulonglong);
    /// Writes the selected test's name pointer and byte length into the out-parameters.
    fn get_test_name(&self, ptr: *mut *const c_char, len: *mut c_ulonglong);
    /// Runs the selected benchmark for `iterations` iterations; returns elapsed time.
    fn run(&self, iterations: c_ulonglong) -> c_ulonglong;
    /// Estimates how many iterations fit into `time_ms` milliseconds.
    fn estimate_iterations(&self, time_ms: c_uint) -> c_ulonglong;
    /// Creates fresh benchmark state from the given seed.
    fn prepare_state(&self, seed: c_ulonglong);
}
428 |
429 | pub(super) static mut SELF_VTABLE: Option = Some(SelfVTable);
430 |
/// FFI implementation for the current executable.
///
/// Used to communicate with FFI API of the executable bypassing dynamic linking.
/// # Safety
/// Instances of this type should not be created directly. The single instance [`SELF_VTABLE`] should be used instead
pub(super) struct SelfVTable;
437 |
438 | impl VTable for SelfVTable {
439 | fn init(&self) {
440 | // In executable mode `tango_init` is already called by the main function
441 | }
442 |
443 | fn count(&self) -> c_ulonglong {
444 | unsafe { tango_count() }
445 | }
446 |
447 | fn select(&self, func_idx: c_ulonglong) {
448 | unsafe { tango_select(func_idx) }
449 | }
450 |
451 | fn get_test_name(&self, ptr: *mut *const c_char, len: *mut c_ulonglong) {
452 | unsafe { tango_get_test_name(ptr, len) }
453 | }
454 |
455 | fn run(&self, iterations: c_ulonglong) -> u64 {
456 | unsafe { tango_run(iterations) }
457 | }
458 |
459 | fn estimate_iterations(&self, time_ms: c_uint) -> c_ulonglong {
460 | unsafe { tango_estimate_iterations(time_ms) }
461 | }
462 |
463 | fn prepare_state(&self, seed: u64) {
464 | unsafe { tango_prepare_state(seed) }
465 | }
466 | }
467 |
468 | impl Drop for SelfVTable {
469 | fn drop(&mut self) {
470 | unsafe {
471 | tango_free();
472 | }
473 | }
474 | }
475 |
476 | pub(super) struct LibraryVTable {
477 | /// SAFETY: using static here is sound because
478 | /// (1) this struct is private and field can not be accessed outside
479 | /// (2) rust has drop order guarantee (fields are dropped in declaration order)
480 | init_fn: Symbol<'static, InitFn>,
481 | count_fn: Symbol<'static, CountFn>,
482 | select_fn: Symbol<'static, SelectFn>,
483 | get_test_name_fn: Symbol<'static, GetTestNameFn>,
484 | run_fn: Symbol<'static, RunFn>,
485 | estimate_iterations_fn: Symbol<'static, EstimateIterationsFn>,
486 | prepare_state_fn: Symbol<'static, PrepareStateFn>,
487 | free_fn: Symbol<'static, FreeFn>,
488 |
489 | /// SAFETY: This field should be last because it should be dropped last
490 | _library: Box,
491 | }
492 |
493 | impl LibraryVTable {
494 | pub(super) fn new(library: Library) -> Result {
495 | // SAFETY: library is boxed and not moved here, therefore we can safley construct self-referential
496 | // struct here
497 | let library = Box::new(library);
498 | let init_fn = lookup_symbol::(&library, "tango_init")?;
499 | let count_fn = lookup_symbol::(&library, "tango_count")?;
500 | let select_fn = lookup_symbol::(&library, "tango_select")?;
501 | let get_test_name_fn = lookup_symbol::(&library, "tango_get_test_name")?;
502 | let run_fn = lookup_symbol::(&library, "tango_run")?;
503 | let estimate_iterations_fn =
504 | lookup_symbol::(&library, "tango_estimate_iterations")?;
505 | let prepare_state_fn =
506 | lookup_symbol::(&library, "tango_prepare_state")?;
507 | let free_fn = lookup_symbol::(&library, "tango_free")?;
508 | Ok(Self {
509 | _library: library,
510 | init_fn,
511 | count_fn,
512 | select_fn,
513 | get_test_name_fn,
514 | run_fn,
515 | estimate_iterations_fn,
516 | prepare_state_fn,
517 | free_fn,
518 | })
519 | }
520 | }
521 |
522 | impl VTable for LibraryVTable {
523 | fn init(&self) {
524 | unsafe { (self.init_fn)() }
525 | }
526 |
527 | fn count(&self) -> c_ulonglong {
528 | unsafe { (self.count_fn)() }
529 | }
530 |
531 | fn select(&self, func_idx: c_ulonglong) {
532 | unsafe { (self.select_fn)(func_idx) }
533 | }
534 |
535 | fn get_test_name(&self, ptr: *mut *const c_char, len: *mut c_ulonglong) {
536 | unsafe { (self.get_test_name_fn)(ptr, len) }
537 | }
538 |
539 | fn run(&self, iterations: c_ulonglong) -> u64 {
540 | unsafe { (self.run_fn)(iterations) }
541 | }
542 |
543 | fn estimate_iterations(&self, time_ms: c_uint) -> c_ulonglong {
544 | unsafe { (self.estimate_iterations_fn)(time_ms) }
545 | }
546 |
547 | fn prepare_state(&self, seed: c_ulonglong) {
548 | unsafe { (self.prepare_state_fn)(seed) }
549 | }
550 | }
551 |
impl Drop for LibraryVTable {
    fn drop(&mut self) {
        // Let the library tear down its harness state while it is still loaded;
        // the `Symbol`s and the library itself drop afterwards (declaration order).
        unsafe { (self.free_fn)() }
    }
}
557 |
558 | fn lookup_symbol<'l, T>(
559 | library: &'l Library,
560 | name: &'static str,
561 | ) -> Result, Error> {
562 | unsafe {
563 | let symbol = library
564 | .get(name.as_bytes())
565 | .map_err(Error::UnableToLoadSymbol)?;
566 | Ok(mem::transmute::, Symbol<'static, T>>(symbol))
567 | }
568 | }
569 | }
570 |
--------------------------------------------------------------------------------
/tango-bench/src/lib.rs:
--------------------------------------------------------------------------------
1 | #[cfg(feature = "async")]
2 | pub use asynchronous::async_benchmark_fn;
3 | use core::ptr;
4 | use num_traits::ToPrimitive;
5 | use rand::{rngs::SmallRng, Rng, SeedableRng};
6 | use std::{
7 | cmp::Ordering,
8 | hint::black_box,
9 | io, mem,
10 | ops::{Deref, RangeInclusive},
11 | str::Utf8Error,
12 | time::Duration,
13 | };
14 | use thiserror::Error;
15 | use timer::{ActiveTimer, Timer};
16 |
17 | pub mod cli;
18 | pub mod dylib;
19 | #[cfg(target_os = "linux")]
20 | pub mod linux;
21 |
/// Errors produced by the benchmarking harness.
#[derive(Debug, Error)]
pub enum Error {
    /// Produced when a statistical summary is requested for an empty measurement set.
    #[error("No measurements given")]
    NoMeasurements,

    /// A test-name string received over FFI was not valid UTF-8.
    #[error("Invalid string pointer from FFI")]
    InvalidFFIString(Utf8Error),

    #[error("Spi::self() was already called")]
    SpiSelfWasMoved,

    /// Wraps the underlying dynamic-loading error.
    #[error("Unable to load library symbol")]
    UnableToLoadSymbol(#[source] libloading::Error),

    #[error("Unknown sampler type. Available options are: flat and linear")]
    UnknownSamplerType,

    #[error("Invalid test name given")]
    InvalidTestName,

    #[error("IO Error")]
    IOError(#[from] io::Error),
}
45 |
/// Registers benchmark in the system
///
/// The macro accepts a list of expressions producing any [`IntoBenchmarks`] type. All of the benchmarks
/// created by those expressions are registered in the harness.
///
/// ## Example
/// ```rust
/// use std::time::Instant;
/// use tango_bench::{benchmark_fn, IntoBenchmarks, tango_benchmarks};
///
/// fn time_benchmarks() -> impl IntoBenchmarks {
///     [benchmark_fn("current_time", |b| b.iter(|| Instant::now()))]
/// }
///
/// tango_benchmarks!(time_benchmarks());
/// ```
#[macro_export]
macro_rules! tango_benchmarks {
    ($($func_expr:expr),+) => {
        /// Type checking tango_init() function
        const TANGO_INIT: $crate::dylib::ffi::InitFn = tango_init;

        /// Exported function for initializing the benchmark harness
        #[no_mangle]
        unsafe extern "C" fn tango_init() {
            // Collect benchmarks from every given expression into a single registry.
            let mut benchmarks = vec![];
            $(benchmarks.extend($crate::IntoBenchmarks::into_benchmarks($func_expr));)*
            $crate::dylib::__tango_init(benchmarks)
        }

    };
}
78 |
/// Main entrypoint for benchmarks
///
/// This macro generates the `main()` function for the benchmark harness. Can be used in a form with providing
/// measurement settings:
/// ```rust
/// use tango_bench::{tango_main, tango_benchmarks, MeasurementSettings};
///
/// // Register benchmarks
/// tango_benchmarks!([]);
///
/// tango_main!(MeasurementSettings {
///     samples_per_haystack: 1000,
///     min_iterations_per_sample: 10,
///     max_iterations_per_sample: 10_000,
///     ..Default::default()
/// });
/// ```
#[macro_export]
macro_rules! tango_main {
    ($settings:expr) => {
        fn main() -> $crate::cli::Result {
            // Initialize Tango for SelfVTable usage
            unsafe { tango_init() };
            $crate::cli::run($settings)
        }
    };
    // No-argument form falls back to the default measurement settings.
    () => {
        tango_main! {$crate::MeasurementSettings::default()}
    };
}
109 |
/// Parameters handed to a sampler factory when new benchmark state is created.
pub struct BenchmarkParams {
    // Seed the benchmark can use to generate reproducible input data.
    pub seed: u64,
}

/// Handle passed to a benchmark closure; used to register the measured function
/// via [`Bencher::iter`].
pub struct Bencher {
    params: BenchmarkParams,
}

/// Lets benchmark bodies read [`BenchmarkParams`] fields (e.g. `b.seed`) directly.
impl Deref for Bencher {
    type Target = BenchmarkParams;

    fn deref(&self) -> &Self::Target {
        &self.params
    }
}
125 |
126 | impl Bencher {
127 | pub fn iter O + 'static>(self, func: F) -> Box {
128 | Box::new(Sampler(func))
129 | }
130 | }
131 |
132 | struct Sampler(F);
133 |
pub trait ErasedSampler {
    /// Measures the performance of the function
    ///
    /// Returns the cumulative execution time (all iterations) with nanoseconds precision,
    /// but not necessarily accuracy. Usually this time is obtained by `clock_gettime()` call or some other
    /// platform-specific call.
    ///
    /// This method should use the same arguments for measuring the test function unless [`prepare_state()`]
    /// method is called. Only then new set of input arguments should be generated. It is NOT allowed
    /// to call this method without first calling [`prepare_state()`].
    ///
    /// [`prepare_state()`]: Benchmark::prepare_state()
    fn measure(&mut self, iterations: usize) -> u64;

    /// Estimates the number of iterations achievable within given time.
    ///
    /// Time span is given in milliseconds (`time_ms`). Estimate can be an approximation and it is important
    /// for implementation to be fast (in the order of 10 ms).
    /// If possible the same input arguments should be used when building the estimate.
    /// If the single call of a function is longer than provided timespan the implementation should return 0.
    fn estimate_iterations(&mut self, time_ms: u32) -> usize {
        // Start with a single iteration and grow the estimate geometrically,
        // refining at most 5 times.
        let mut iters = 1;
        let time_ns = Duration::from_millis(time_ms as u64).as_nanos() as u64;

        for _ in 0..5 {
            // Never believe short measurements because they are very unreliable. Pretending that
            // measurement at least took 1us guarantees that we won't end up with an unreasonably large number
            // of iterations
            let time = self.measure(iters).max(1_000);
            let time_per_iteration = (time / iters as u64).max(1);
            let new_iters = (time_ns / time_per_iteration) as usize;

            // Do early stop if new estimate has the same order of magnitude. It is good enough.
            if new_iters < 2 * iters {
                return new_iters;
            }

            iters = new_iters;
        }

        iters
    }
}
177 |
178 | impl O> ErasedSampler for Sampler {
179 | fn measure(&mut self, iterations: usize) -> u64 {
180 | let start = ActiveTimer::start();
181 | for _ in 0..iterations {
182 | black_box((self.0)());
183 | }
184 | ActiveTimer::stop(start)
185 | }
186 | }
187 |
188 | pub struct Benchmark {
189 | name: String,
190 | sampler_factory: Box,
191 | }
192 |
193 | pub fn benchmark_fn Box + 'static>(
194 | name: impl Into,
195 | sampler_factory: F,
196 | ) -> Benchmark {
197 | let name = name.into();
198 | assert!(!name.is_empty());
199 | Benchmark {
200 | name,
201 | sampler_factory: Box::new(SyncSampleFactory(sampler_factory)),
202 | }
203 | }
204 |
205 | pub trait SamplerFactory {
206 | fn create_sampler(&mut self, params: BenchmarkParams) -> Box;
207 | }
208 |
209 | struct SyncSampleFactory(F);
210 |
211 | impl Box> SamplerFactory for SyncSampleFactory {
212 | fn create_sampler(&mut self, params: BenchmarkParams) -> Box {
213 | (self.0)(Bencher { params })
214 | }
215 | }
216 |
217 | impl Benchmark {
218 | /// Generates next haystack for the measurement
219 | ///
220 | /// Calling this method should update internal haystack used for measurement.
221 | /// Returns `true` if update happens, `false` if implementation doesn't support haystack generation.
222 | /// Haystack/Needle distinction is described in [`Generator`] trait.
223 | pub fn prepare_state(&mut self, seed: u64) -> Box {
224 | self.sampler_factory
225 | .create_sampler(BenchmarkParams { seed })
226 | }
227 |
228 | /// Name of the benchmark
229 | pub fn name(&self) -> &str {
230 | self.name.as_str()
231 | }
232 | }
233 |
234 | /// Converts the implementing type into a vector of [`Benchmark`].
235 | pub trait IntoBenchmarks {
236 | fn into_benchmarks(self) -> Vec;
237 | }
238 |
239 | impl IntoBenchmarks for [Benchmark; N] {
240 | fn into_benchmarks(self) -> Vec {
241 | self.into_iter().collect()
242 | }
243 | }
244 |
245 | impl IntoBenchmarks for Vec {
246 | fn into_benchmarks(self) -> Vec {
247 | self
248 | }
249 | }
250 |
251 | /// Describes basic settings for the benchmarking process
252 | ///
253 | /// This structure is passed to [`cli::run()`].
254 | ///
255 | /// Should be created only with overriding needed properties, like so:
256 | /// ```rust
257 | /// use tango_bench::MeasurementSettings;
258 | ///
259 | /// let settings = MeasurementSettings {
260 | /// min_iterations_per_sample: 1000,
261 | /// ..Default::default()
262 | /// };
263 | /// ```
264 | #[derive(Clone, Copy, Debug)]
265 | pub struct MeasurementSettings {
266 | pub filter_outliers: bool,
267 |
268 | /// The number of samples per one generated haystack
269 | pub samples_per_haystack: usize,
270 |
271 | /// Minimum number of iterations in a sample for each of 2 tested functions
272 | pub min_iterations_per_sample: usize,
273 |
274 | /// The number of iterations in a sample for each of 2 tested functions
275 | pub max_iterations_per_sample: usize,
276 |
277 | pub sampler_type: SampleLengthKind,
278 |
279 | /// If true scheduler performs warmup iterations before measuring function
280 | pub warmup_enabled: bool,
281 |
282 | /// Size of a CPU cache firewall in KBytes
283 | ///
284 | /// If set, the scheduler will perform a dummy data read between samples generation to spoil the CPU cache
285 | ///
286 | /// Cache firewall is a way to reduce the impact of the CPU cache on the benchmarking process. It tries
287 | /// to minimize discrepancies in performance between two algorithms due to the CPU cache state.
288 | pub cache_firewall: Option,
289 |
290 | /// If true, scheduler will perform a yield of control back to the OS before taking each sample
291 | ///
292 | /// Yielding control to the OS is a way to reduce the impact of OS scheduler on the benchmarking process.
293 | pub yield_before_sample: bool,
294 |
295 | /// If set, use alloca to allocate a random offset for the stack each sample.
296 | /// This to reduce memory alignment effects on the benchmarking process.
297 | ///
298 | /// May cause UB if the allocation is larger then the thread stack size.
299 | pub randomize_stack: Option,
300 | }
301 |
/// Strategy for choosing the number of iterations in each sample
/// (see the corresponding `SampleLength` implementations below).
#[derive(Clone, Copy, Debug)]
pub enum SampleLengthKind {
    // Same (clamped) iteration count for every sample.
    Flat,
    // Iteration count cycles from 1 up to the clamped estimate.
    Linear,
    // Iteration count drawn uniformly at random from 1..=estimate.
    Random,
}
308 |
/// Performs dummy reads from memory to spoil a given amount of CPU cache
///
/// Uses cache aligned data arrays to perform minimum amount of reads possible to spoil the cache
struct CacheFirewall {
    cache_lines: Vec<CacheLine>,
}

impl CacheFirewall {
    /// Allocates enough 64-byte cache lines to cover `bytes` bytes (rounded down).
    fn new(bytes: usize) -> Self {
        let n = bytes / mem::size_of::<CacheLine>();
        let cache_lines = vec![CacheLine::default(); n];
        Self { cache_lines }
    }

    /// Touches every allocated line once, evicting corresponding cache contents.
    fn issue_read(&self) {
        for line in &self.cache_lines {
            // Because CacheLine is aligned on 64 bytes it is enough to read single element from the array
            // to spoil the whole cache line
            unsafe { ptr::read_volatile(&line.0[0]) };
        }
    }
}

/// Exactly one 64-byte-aligned cache line of data (32 × u16 = 64 bytes).
#[repr(C)]
#[repr(align(64))]
#[derive(Default, Clone, Copy)]
struct CacheLine([u16; 32]);
336 |
/// Default measurement settings; see [`MeasurementSettings`] for the meaning of each field.
pub const DEFAULT_SETTINGS: MeasurementSettings = MeasurementSettings {
    filter_outliers: false,
    samples_per_haystack: 1,
    min_iterations_per_sample: 1,
    max_iterations_per_sample: 5000,
    sampler_type: SampleLengthKind::Random,
    cache_firewall: None,
    yield_before_sample: false,
    warmup_enabled: true,
    randomize_stack: None,
};

/// `Default` simply forwards to [`DEFAULT_SETTINGS`].
impl Default for MeasurementSettings {
    fn default() -> Self {
        DEFAULT_SETTINGS
    }
}
354 |
/// Responsible for determining the number of iterations to run for each sample
///
/// Different sampler strategies can influence the results heavily. For example, if function is dependent heavily
/// on a memory subsystem, then it should be tested with different number of iterations to be representative
/// for different memory access patterns and cache states.
trait SampleLength {
    /// Returns the number of iterations to run for the next sample
    ///
    /// `iteration_no` is the index of the sample being taken, starting from 0;
    /// `estimate` is the suggested iteration count derived from timing estimates.
    fn next_sample_iterations(&mut self, iteration_no: usize, estimate: usize) -> usize;
}
366 |
367 | /// Runs the same number of iterations for each sample
368 | ///
369 | /// Estimates the number of iterations based on the number of iterations achieved in 10 ms and uses
370 | /// this number as a base for the number of iterations for each sample.
371 | struct FlatSampleLength {
372 | min: usize,
373 | max: usize,
374 | }
375 |
376 | impl FlatSampleLength {
377 | fn new(settings: &MeasurementSettings) -> Self {
378 | FlatSampleLength {
379 | min: settings.min_iterations_per_sample.max(1),
380 | max: settings.max_iterations_per_sample,
381 | }
382 | }
383 | }
384 |
385 | impl SampleLength for FlatSampleLength {
386 | fn next_sample_iterations(&mut self, _iteration_no: usize, estimate: usize) -> usize {
387 | estimate.clamp(self.min, self.max)
388 | }
389 | }
390 |
391 | struct LinearSampleLength {
392 | min: usize,
393 | max: usize,
394 | }
395 |
396 | impl LinearSampleLength {
397 | fn new(settings: &MeasurementSettings) -> Self {
398 | Self {
399 | min: settings.min_iterations_per_sample.max(1),
400 | max: settings.max_iterations_per_sample,
401 | }
402 | }
403 | }
404 |
405 | impl SampleLength for LinearSampleLength {
406 | fn next_sample_iterations(&mut self, iteration_no: usize, estimate: usize) -> usize {
407 | let estimate = estimate.clamp(self.min, self.max);
408 | (iteration_no % estimate) + 1
409 | }
410 | }
411 |
412 | /// Sampler that randomly determines the number of iterations to run for each sample
413 | ///
414 | /// This sampler uses a random number generator to decide the number of iterations for each sample.
415 | struct RandomSampleLength {
416 | rng: SmallRng,
417 | min: usize,
418 | max: usize,
419 | }
420 |
421 | impl RandomSampleLength {
422 | pub fn new(settings: &MeasurementSettings, seed: u64) -> Self {
423 | Self {
424 | rng: SmallRng::seed_from_u64(seed),
425 | min: settings.min_iterations_per_sample.max(1),
426 | max: settings.max_iterations_per_sample,
427 | }
428 | }
429 | }
430 |
431 | impl SampleLength for RandomSampleLength {
432 | fn next_sample_iterations(&mut self, _iteration_no: usize, estimate: usize) -> usize {
433 | let estimate = estimate.clamp(self.min, self.max);
434 | self.rng.gen_range(1..=estimate)
435 | }
436 | }
437 |
438 | /// Calculates the result of the benchmarking run
439 | ///
440 | /// Return None if no measurements were made
441 | pub(crate) fn calculate_run_result>(
442 | name: N,
443 | baseline: &[u64],
444 | candidate: &[u64],
445 | iterations_per_sample: &[usize],
446 | filter_outliers: bool,
447 | ) -> Option {
448 | assert!(baseline.len() == candidate.len());
449 | assert!(baseline.len() == iterations_per_sample.len());
450 |
451 | let mut iterations_per_sample = iterations_per_sample.to_vec();
452 |
453 | let mut diff = candidate
454 | .iter()
455 | .zip(baseline.iter())
456 | // Calculating difference between candidate and baseline
457 | .map(|(&c, &b)| (c as f64 - b as f64))
458 | .zip(iterations_per_sample.iter())
459 | // Normalizing difference to iterations count
460 | .map(|(diff, &iters)| diff / iters as f64)
461 | .collect::>();
462 |
463 | // need to save number of original samples to calculate number of outliers correctly
464 | let n = diff.len();
465 |
466 | // Normalizing measurements to iterations count
467 | let mut baseline = baseline
468 | .iter()
469 | .zip(iterations_per_sample.iter())
470 | .map(|(&v, &iters)| (v as f64) / (iters as f64))
471 | .collect::>();
472 | let mut candidate = candidate
473 | .iter()
474 | .zip(iterations_per_sample.iter())
475 | .map(|(&v, &iters)| (v as f64) / (iters as f64))
476 | .collect::>();
477 |
478 | // Calculating measurements range. All measurements outside this interval considered outliers
479 | let range = if filter_outliers {
480 | iqr_variance_thresholds(diff.to_vec())
481 | } else {
482 | None
483 | };
484 |
485 | // Cleaning measurements from outliers if needed
486 | if let Some(range) = range {
487 | // We filtering outliers to build statistical Summary and the order of elements in arrays
488 | // doesn't matter, therefore swap_remove() is used. But we need to make sure that all arrays
489 | // has the same length
490 | assert_eq!(diff.len(), baseline.len());
491 | assert_eq!(diff.len(), candidate.len());
492 |
493 | let mut i = 0;
494 | while i < diff.len() {
495 | if range.contains(&diff[i]) {
496 | i += 1;
497 | } else {
498 | diff.swap_remove(i);
499 | iterations_per_sample.swap_remove(i);
500 | baseline.swap_remove(i);
501 | candidate.swap_remove(i);
502 | }
503 | }
504 | };
505 |
506 | let diff_summary = Summary::from(&diff)?;
507 | let baseline_summary = Summary::from(&baseline)?;
508 | let candidate_summary = Summary::from(&candidate)?;
509 |
510 | let diff_estimate = DiffEstimate::build(&baseline_summary, &diff_summary);
511 |
512 | Some(RunResult {
513 | baseline: baseline_summary,
514 | candidate: candidate_summary,
515 | diff: diff_summary,
516 | name: name.into(),
517 | diff_estimate,
518 | outliers: n - diff_summary.n,
519 | })
520 | }
521 |
/// Contains the estimation of how much faster or slower is candidate function compared to baseline
pub(crate) struct DiffEstimate {
    // Percentage of difference between candidate and baseline
    //
    // Negative value means that candidate is faster than baseline, positive - slower.
    pct: f64,

    // Is the difference statistically significant (see `DiffEstimate::build`)
    significant: bool,
}
532 |
533 | impl DiffEstimate {
534 | /// Builds [`DiffEstimate`] from flat sampling
535 | ///
536 | /// Flat sampling is a sampling where each measurement is normalized by the number of iterations.
537 | /// This is needed to make measurements comparable between each other. Linear sampling is more
538 | /// robust to outliers, but it is requiring more iterations.
539 | ///
540 | /// It is assumed that baseline and candidate are already normalized by iterations count.
541 | fn build(baseline: &Summary, diff: &Summary) -> Self {
542 | let std_dev = diff.variance.sqrt();
543 | let std_err = std_dev / (diff.n as f64).sqrt();
544 | let z_score = diff.mean / std_err;
545 |
546 | // significant result is far away from 0 and have more than 0.5% base/candidate difference
547 | // z_score = 2.6 corresponds to 99% significance level
548 | let significant = z_score.abs() >= 2.6
549 | && (diff.mean / baseline.mean).abs() > 0.005
550 | && diff.mean.abs() >= ActiveTimer::precision() as f64;
551 | let pct = diff.mean / baseline.mean * 100.0;
552 |
553 | Self { pct, significant }
554 | }
555 | }
556 |
557 | /// Describes the results of a single benchmark run
558 | pub(crate) struct RunResult {
559 | /// name of a test
560 | name: String,
561 |
562 | /// statistical summary of baseline function measurements
563 | baseline: Summary,
564 |
565 | /// statistical summary of candidate function measurements
566 | candidate: Summary,
567 |
568 | /// individual measurements of a benchmark (candidate - baseline)
569 | diff: Summary,
570 |
571 | diff_estimate: DiffEstimate,
572 |
573 | /// Numbers of detected and filtered outliers
574 | outliers: usize,
575 | }
576 |
577 | /// Statistical summary for a given iterator of numbers.
578 | ///
579 | /// Calculates all the information using single pass over the data. Mean and variance are calculated using
580 | /// streaming algorithm described in _Art of Computer Programming, Vol 2, page 232_.
581 | #[derive(Clone, Copy)]
582 | pub struct Summary {
583 | pub n: usize,
584 | pub min: T,
585 | pub max: T,
586 | pub mean: f64,
587 | pub variance: f64,
588 | }
589 |
590 | impl Summary {
591 | pub fn from<'a, C>(values: C) -> Option
592 | where
593 | C: IntoIterator
- ,
594 | T: ToPrimitive + Copy + Default + 'a,
595 | {
596 | Self::running(values.into_iter().copied()).last()
597 | }
598 |
599 | pub fn running(iter: I) -> impl Iterator
- >
600 | where
601 | T: ToPrimitive + Copy + Default,
602 | I: Iterator
- ,
603 | {
604 | RunningSummary {
605 | iter,
606 | n: 0,
607 | min: T::default(),
608 | max: T::default(),
609 | mean: 0.,
610 | s: 0.,
611 | }
612 | }
613 | }
614 |
/// State of the streaming mean/variance computation over iterator `I` of values `T`.
struct RunningSummary<T, I> {
    iter: I,
    n: usize,
    min: T,
    max: T,
    mean: f64,
    // Sum of squared deviations from the running mean (Welford's `M2`).
    s: f64,
}
623 |
624 | impl Iterator for RunningSummary
625 | where
626 | T: Copy + PartialOrd,
627 | I: Iterator
- ,
628 | T: ToPrimitive,
629 | {
630 | type Item = Summary;
631 |
632 | fn next(&mut self) -> Option {
633 | let value = self.iter.next()?;
634 | let fvalue = value.to_f64().expect("f64 overflow detected");
635 |
636 | if self.n == 0 {
637 | self.min = value;
638 | self.max = value;
639 | }
640 |
641 | if let Some(Ordering::Less) = value.partial_cmp(&self.min) {
642 | self.min = value;
643 | }
644 | if let Some(Ordering::Greater) = value.partial_cmp(&self.max) {
645 | self.max = value;
646 | }
647 |
648 | self.n += 1;
649 | let mean_p = self.mean;
650 | self.mean += (fvalue - self.mean) / self.n as f64;
651 | self.s += (fvalue - mean_p) * (fvalue - self.mean);
652 | let variance = if self.n > 1 {
653 | self.s / (self.n - 1) as f64
654 | } else {
655 | 0.
656 | };
657 |
658 | Some(Summary {
659 | n: self.n,
660 | min: self.min,
661 | max: self.max,
662 | mean: self.mean,
663 | variance,
664 | })
665 | }
666 | }
667 |
/// Outlier detection algorithm based on interquartile range
///
/// Observations that are 1.5 IQR away from the corresponding quartile are considered outliers
/// as described in original Tukey's paper. Returns the inclusive range of values to keep,
/// or `None` if no symmetric trimming is possible.
pub fn iqr_variance_thresholds(mut input: Vec<f64>) -> Option<RangeInclusive<f64>> {
    const MINIMUM_IQR: f64 = 1.;

    // Guard: with fewer than 2 observations `input.len() * 3 / 4 - 1` underflows.
    if input.len() < 2 {
        return None;
    }

    input.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
    let (q1, q3) = (input.len() / 4, input.len() * 3 / 4 - 1);
    if q1 >= q3 || q3 >= input.len() {
        return None;
    }
    // In case q1 and q3 are equal, we need to make sure that IQR is not 0
    // In the future it would be nice to measure system timer precision empirically.
    let iqr = (input[q3] - input[q1]).max(MINIMUM_IQR);

    let low_threshold = input[q1] - iqr * 1.5;
    let high_threshold = input[q3] + iqr * 1.5;

    // Calculating the indices of the thresholds in the dataset
    let low_threshold_idx =
        match input[0..q1].binary_search_by(|probe| probe.total_cmp(&low_threshold)) {
            Ok(idx) => idx,
            Err(idx) => idx,
        };

    // BUGFIX: the search runs over the `input[q3..]` subslice, so its result is relative
    // to `q3`; translate it back to the full-array index space before comparing with
    // `input.len()` and computing the high-side outlier count below.
    let high_threshold_idx =
        match input[q3..].binary_search_by(|probe| probe.total_cmp(&high_threshold)) {
            Ok(idx) => idx + q3,
            Err(idx) => idx + q3,
        };

    if low_threshold_idx == 0 || high_threshold_idx >= input.len() {
        return None;
    }

    // Calculating the equal number of observations which should be removed from each "side" of observations
    let outliers_cnt = low_threshold_idx.min(input.len() - high_threshold_idx);

    Some(input[outliers_cnt]..=(input[input.len() - outliers_cnt - 1]))
}
709 |
mod timer {
    use std::time::Instant;

    #[cfg(all(feature = "hw-timer", target_arch = "x86_64"))]
    pub(super) type ActiveTimer = x86::RdtscpTimer;

    #[cfg(not(all(feature = "hw-timer", target_arch = "x86_64")))]
    pub(super) type ActiveTimer = PlatformTimer;

    /// Abstraction over a time source; `T` is the opaque token returned by `start()`
    /// and consumed by `stop()`.
    pub(super) trait Timer<T> {
        fn start() -> T;
        /// Returns the time elapsed since `start_time`, in nanoseconds (or timer ticks
        /// for hardware counters).
        fn stop(start_time: T) -> u64;

        /// Timer precision in nanoseconds
        ///
        /// The results less than the precision of a timer are considered not significant
        fn precision() -> u64 {
            1
        }
    }

    /// OS-provided monotonic clock backed by `std::time::Instant`.
    pub(super) struct PlatformTimer;

    impl Timer<Instant> for PlatformTimer {
        #[inline]
        fn start() -> Instant {
            Instant::now()
        }

        #[inline]
        fn stop(start_time: Instant) -> u64 {
            start_time.elapsed().as_nanos() as u64
        }
    }

    #[cfg(all(feature = "hw-timer", target_arch = "x86_64"))]
    pub(super) mod x86 {
        use super::Timer;
        use std::arch::x86_64::{__rdtscp, _mm_mfence};

        /// Hardware timestamp-counter timer; measures in TSC ticks, not nanoseconds.
        pub struct RdtscpTimer;

        impl Timer<u64> for RdtscpTimer {
            #[inline]
            fn start() -> u64 {
                unsafe {
                    // Fence prevents earlier instructions from being reordered past the read.
                    _mm_mfence();
                    __rdtscp(&mut 0)
                }
            }

            #[inline]
            fn stop(start: u64) -> u64 {
                unsafe {
                    let end = __rdtscp(&mut 0);
                    _mm_mfence();
                    end - start
                }
            }
        }
    }
}
772 |
773 | #[cfg(feature = "async")]
774 | pub mod asynchronous {
775 | use super::{Benchmark, BenchmarkParams, ErasedSampler, Sampler, SamplerFactory};
776 | use std::{future::Future, ops::Deref};
777 |
778 | pub fn async_benchmark_fn(
779 | name: impl Into,
780 | runtime: R,
781 | sampler_factory: F,
782 | ) -> Benchmark
783 | where
784 | R: AsyncRuntime + 'static,
785 | F: FnMut(AsyncBencher) -> Box + 'static,
786 | {
787 | let name = name.into();
788 | assert!(!name.is_empty());
789 | Benchmark {
790 | name,
791 | sampler_factory: Box::new(AsyncSampleFactory(sampler_factory, runtime)),
792 | }
793 | }
794 |
795 | pub struct AsyncSampleFactory(pub F, pub R);
796 |
797 | impl) -> Box> SamplerFactory
798 | for AsyncSampleFactory
799 | {
800 | fn create_sampler(&mut self, params: BenchmarkParams) -> Box {
801 | (self.0)(AsyncBencher {
802 | params,
803 | runtime: self.1,
804 | })
805 | }
806 | }
807 |
808 | pub struct AsyncBencher {
809 | params: BenchmarkParams,
810 | runtime: R,
811 | }
812 |
813 | impl AsyncBencher {
814 | pub fn iter(self, func: F) -> Box
815 | where
816 | O: 'static,
817 | Fut: Future