├── README.md
└── p1bench.c
/README.md:
--------------------------------------------------------------------------------
p1bench
=======

Perturbation benchmark.

This is intended to be used before other benchmark tests, to investigate CPU and memory variation. It helps you better interpret the results of any microbenchmark by characterizing the variance that will sway results.

Let's say you wanted to do a 500 ms CPU benchmark of gzip performance: p1bench can be run with a 500 ms interval to show what baseline variation you may see, based on a simple spin loop, before running a more complex gzip microbenchmark.

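For example, for a hypothetical 500 ms gzip microbenchmark (your-gzip-benchmark is a placeholder, not a real tool):

```
$ ./p1bench 500 100        # baseline: 500 ms spin loop, 100 runs
$ your-gzip-benchmark      # then the real 500 ms microbenchmark
```
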
p1bench can also be run in a mode (-m) to test memory variation: it reads through a working set of the given size at a 64 byte stride.

## Operating Systems

Tested on Linux and macOS. It should work anywhere with a C compiler and libpthread.

## Compile

```
gcc -O0 -pthread -o p1bench p1bench.c
```

Note the -O0: the benchmark loops do no real work, and a higher optimization level may elide them entirely.

## Screenshots

CPU test for a 500 ms duration:

```
$ ./p1bench 500
Calibrating for 500 ms... (target iteration count: 220661127)
Run 100/100, Ctrl-C to stop (-0.30% diff)

Perturbation percent by count for 500 ms runs:
  Slower%  Count  Count% Histogram
     0.0%:      5   5.00% *********
     0.1%:      5   5.00% *********
     0.2%:      2   2.00% ****
     0.3%:      2   2.00% ****
     0.4%:     11  11.00% *******************
     0.5%:      4   4.00% *******
     0.6%:      3   3.00% *****
     0.7%:      4   4.00% *******
     0.8%:      5   5.00% *********
     0.9%:      1   1.00% **
     1.0%:     30  30.00% **************************************************
     2.0%:     27  27.00% *********************************************
     3.0%:      1   1.00% **

Percentiles: 50th: 1.356%, 90th: 2.439%, 99th: 2.954%, 100th: 3.035%
Fastest: 500.715 ms, 50th: 507.504 ms, mean: 507.359 ms, slowest: 515.911 ms
Fastest rate: 440692064/s, 50th: 434796823/s, mean: 434921085/s, slowest: 427711614/s
```

Many numbers are printed to characterize variance, and the histogram shows it visually. Just from the histogram, I'd expect a variance of up to 2% (fastest to slowest) for a CPU microbenchmark of the same duration (500 ms).

This is a custom histogram, where the bin size varies:

- variation 0 - 1%: 0.1% bin size
- variation 1 - 20%: 1% bin size
- variation 20+%: 10% bin size
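
For example, here is a minimal sketch of the binning (it mirrors hist_idx()/hist_val() in p1bench.c; the bin_floor name and the sample values are invented for illustration):

```
#include <stdio.h>

/* Map a "percent slower" value to the floor of its histogram bin. */
double bin_floor(double pct)
{
    if (pct < 1.0)
        return (int)(pct * 10) / 10.0;               /* 0.1% bins below 1% */
    if (pct < 20.0)
        return (double)(int)pct;                     /* 1% bins from 1% to 20% */
    return 20.0 + 10.0 * (int)((pct - 20.0) / 10.0); /* 10% bins above 20% */
}

int main(void)
{
    double examples[] = { 0.37, 2.37, 38.1 };
    int i;

    for (i = 0; i < 3; i++)
        printf("%.2f%% slower -> %.1f%% bin\n", examples[i], bin_floor(examples[i]));
    return 0; /* prints the 0.3, 2.0, and 30.0 bins */
}
```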

Here's a much noisier system:

```
$ ./p1bench 500
Calibrating for 500 ms... (target iteration count: 206071738)
Run 100/100, Ctrl-C to stop (0.85% diff)

Perturbation percent by count for 500 ms runs:
  Slower%  Count  Count% Histogram
     0.0%:      1   1.00% ****
     0.1%:      0   0.00%
     0.2%:      0   0.00%
     0.3%:      0   0.00%
     0.4%:      1   1.00% ****
     0.5%:      0   0.00%
     0.6%:      0   0.00%
     0.7%:      0   0.00%
     0.8%:      1   1.00% ****
     0.9%:      0   0.00%
     1.0%:      1   1.00% ****
     2.0%:     10  10.00% ********************************
     3.0%:      7   7.00% **********************
     4.0%:     12  12.00% **************************************
     5.0%:     14  14.00% ********************************************
     6.0%:     10  10.00% ********************************
     7.0%:     16  16.00% **************************************************
     8.0%:      5   5.00% ****************
     9.0%:     10  10.00% ********************************
    10.0%:      5   5.00% ****************
    11.0%:      1   1.00% ****
    12.0%:      2   2.00% *******
    13.0%:      1   1.00% ****
    14.0%:      1   1.00% ****
    15.0%:      1   1.00% ****
    16.0%:      0   0.00%
    17.0%:      0   0.00%
    18.0%:      0   0.00%
    19.0%:      0   0.00%
    20.0%:      0   0.00%
    30.0%:      1   1.00% ****

Percentiles: 50th: 6.258%, 90th: 10.085%, 99th: 15.431%, 100th: 38.078%
Fastest: 485.364 ms, 50th: 515.739 ms, mean: 518.336 ms, slowest: 670.182 ms
Fastest rate: 424571533/s, 50th: 399565939/s, mean: 397564008/s, slowest: 307486232/s
```

If I wanted to do a 500 ms CPU microbenchmark on this system, well, I'd find another system.

## Usage

```
USAGE: p1bench [-hv] [-m Mbytes] [time(ms) [count]]
                 -v              # verbose: per run details
                 -m Mbytes       # memory test working set
   eg,
       p1bench                 # 100ms (default) CPU spin loop
       p1bench 300             # 300ms CPU spin loop
       p1bench 300 100         # 300ms CPU spin loop, 100 times
       p1bench -m 1024         # 1GB memory read loop
```

--------------------------------------------------------------------------------
/p1bench.c:
--------------------------------------------------------------------------------
/*
 * p1bench - perturbation benchmark. Tests simple CPU or memory loops.
 *
 * This is intended to be used before other benchmark tests, to investigate
 * CPU and memory variation. This helps you better interpret the results of any
 * microbenchmark test, by characterizing variance that will sway results.
 *
 * Let's say you wanted to do a 500 ms CPU benchmark of gzip performance:
 * p1bench can be run with a 500 ms interval to show what baseline variation
 * you may see, based on a simple spin loop, before running the more complex
 * gzip microbenchmark.
 *
 * p1bench can also be run in a mode (-m) to test memory variation.
 *
 * gcc -O0 -pthread -o p1bench p1bench.c
 *
 * USAGE: see -h for usage.
 *
 * Copyright 2018 Netflix, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License")
 *
 * 03-Jan-2018	Brendan Gregg	Created this.
 */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>
#include <errno.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>

void usage()
{
	printf("USAGE: p1bench [-hv] [-m Mbytes] [time(ms) [count]]\n"
	    "                 -v              # verbose: per run details\n"
	    "                 -m Mbytes       # memory test working set\n"
	    "   eg,\n"
	    "       p1bench                 # 100ms (default) CPU spin loop\n"
	    "       p1bench 300             # 300ms CPU spin loop\n"
	    "       p1bench 300 100         # 300ms CPU spin loop, 100 times\n"
	    "       p1bench -m 1024         # 1GB memory read loop\n");
}

/*
 * These functions aren't just for code cleanliness: they show up in profilers
 * when doing active benchmarking to debug the benchmark.
 */

int g_testrun = 1;
void teststop(int dummy) {
	g_testrun = 0;
}

void *spintest(void *arg)
{
	signal(SIGUSR1, teststop);
	unsigned long long *count = (unsigned long long *)arg;
	for (;g_testrun;) { (*count)++; }
	return NULL;
}

unsigned long long spinrun(unsigned long long count)
{
	unsigned long long i;
	// check compiler doesn't elide this (-O0)
	for (i = 0; i < count; i++) {;}
	return i;
}

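/*
 * Each workload comes as a pair: a time-bounded loop (spintest/memtest) used
 * to calibrate an iteration count, and a count-bounded loop (spinrun/memrun)
 * that replays that count for each timed run.
 */
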
// memory parameters
char *g_mem;
unsigned long long g_memsize;
unsigned long long g_stride;

void *memtest(void *arg)
{
	unsigned long long *count = (unsigned long long *)arg;
	char *memp;
	int junk = 0;

	signal(SIGUSR1, teststop);
	memp = g_mem;
	for (;g_testrun;) {
		junk += memp[0];	// the read we're here for
		memp += g_stride;
		if (memp >= (g_mem + g_memsize))	// >=: don't read past the buffer
			memp = g_mem;
		(*count)++;
	}

	return NULL;
}

unsigned long long memrun(unsigned long long count)
{
	char *memp;
	unsigned long long i;
	int junk = 0;

	signal(SIGUSR1, teststop);
	memp = g_mem;
	for (i = 0; i < count; i++) {
		junk += memp[0];
		memp += g_stride;
		if (memp >= (g_mem + g_memsize))
			memp = g_mem;
	}
	return i;
}

/*
 * Runs the loop function for the target_us while incrementing count.
 * This gives us a ballpark figure of the target count.
 */
void test_run(unsigned long long target_us, unsigned long long *count,
    void *(*loop)(void *))
{
	pthread_t thread;
	int err;

	if (!target_us)
		return;

	g_testrun = 1;
	(*count) = 0;
	if (pthread_create(&thread, NULL, loop, count) != 0) {
		perror("Thread create failed");
		exit(1);
	}
	usleep(target_us);
	if ((err = pthread_kill(thread, SIGUSR1))) {
		// pthread_kill() returns the error rather than setting errno
		fprintf(stderr, "Couldn't terminate worker thread normally: %s\n",
		    strerror(err));
		exit(1);
	}
	pthread_join(thread, NULL);
}

/*
 * Finds a ballpark target count, then runs the real run function with that
 * count several times (test_runs) to fine tune the target count.
 */
unsigned long long find_count(unsigned long long target_us,
    int test_us, int test_runs,
    void *(*test)(void *),
    unsigned long long (*run)(unsigned long long))
{
	unsigned long long time_us;
	unsigned long long fastest_time_us = ~0ULL;
	unsigned long long iter_count = 0;
	static struct timeval ts[2];
	int i;

	test_run(test_us, &iter_count, test);
	for (i = 0; i < test_runs; i++) {
		gettimeofday(&ts[0], NULL);
		(void) run(iter_count);
		gettimeofday(&ts[1], NULL);
		time_us = 1000000 * (ts[1].tv_sec - ts[0].tv_sec) +
		    (ts[1].tv_usec - ts[0].tv_usec);
		if (time_us < fastest_time_us)
			fastest_time_us = time_us;
	}
	return iter_count * target_us / fastest_time_us;
}

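/*
 * Example of the scaling above (illustrative numbers): if the ballpark probe
 * counts 44,000,000 iterations in its 100 ms window, and the fastest timed
 * run of those 44,000,000 iterations takes 100,000 us, then a 500 ms
 * (500,000 us) target scales to 44,000,000 * 500,000 / 100,000 =
 * 220,000,000 iterations.
 */
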
/*
 * Value to histogram index.
 * This is a custom histogram with the following ranges:
 * value 0-1: 0.1 step size, idx 0-9
 * value 1-20: 1 step size, idx 10-28
 * value 20+: 10 step size, idx 29+
 */
static int hist_idx(double value, int buckets)
{
	int idx;

	if (value < 1)
		idx = (int)(10 * value);
	else if (value < 20)
		// idx = previous_steps + (value - min_range_value)
		idx = 10 + (int)(value - 1);
	else
		// idx = previous_steps + (value - min_range_value)/10
		idx = 29 + (int)((value - 20) / 10);

	if (idx > buckets - 1)
		idx = buckets - 1;
	return idx;
}

// histogram index to minimum value
double hist_val(int idx)
{
	if (idx < 10)
		return (double)idx / 10;
	else if (idx < 29)
		// 10 -> 1.0
		// 11 -> 2
		// value = idx - previous_steps + min_range_value
		return (double)idx - 10 + 1;
	else
		// value = (idx - previous_steps) * 10 + min_range_value
		return (double)(idx - 29) * 10 + 20;
}

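/*
 * Round-trip examples for hist_idx()/hist_val() (illustrative): a run 0.37%
 * slower maps to idx 3, whose bucket floor is 0.3%; 2.37% maps to idx 11,
 * floor 2.0%; 38.1% maps to idx 30, floor 30.0%.
 */
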
static int ullcmp(const void *p1, const void *p2)
{
	unsigned long long a = *(unsigned long long *)p1;
	unsigned long long b = *(unsigned long long *)p2;
	// compare rather than subtract: a - b would overflow the int return
	return (a > b) - (a < b);
}

// not worth -lm for this
int myceil(double x)
{
	if (x > (int)x)
		return (int)x + 1;
	return (int)x;
}

int g_mainrun = 1;
void mainstop(int dummy) {
	g_mainrun = 0;
	printf("stopping...\n");
}

// histogram bucket count; caps the scale at hist_val(199) == 1720% slower
#define BUCKETS 200

int main(int argc, char *argv[])
{
	unsigned long long iter_count, time_us, time_usr_us,
	    time_sys_us, ivcs, last_us, total_time_us, fastest_time_us,
	    slowest_time_us;
	unsigned long long target_us = 100 * 1000;	// default: 100 ms, as us
	double diff_pct;
	static struct timeval ts[2];
	struct rusage u[2];
	int test_us = 100 * 1000;
	int test_runs = 5;	// calibration
	int max_runs = 100;
	int verbose = 0;
	int hist[BUCKETS] = {0};
	int bar_width = 50;
	int c, i, j, runs, idx, max_idx;
	unsigned long long *runs_us;
	unsigned long long pagesize;
	char *memp;
	unsigned long long (*run)(unsigned long long) = spinrun;
	void *(*test)(void *) = spintest;

	// defaults
	g_stride = 64;		// bytes; a common cache line size
	g_memsize = 0;

	// options
	while ((c = getopt(argc, argv, "hm:v")) != -1) {
		switch (c) {
		case 'm':
			// atoll, cast before scaling: avoids int overflow
			g_memsize = (unsigned long long)atoll(optarg) *
			    1024 * 1024;
			if (!g_memsize) {
				printf("-m Mbytes must be non-zero\n");
				usage();
				return 0;
			}
			run = memrun;
			test = memtest;
			break;
		case 'v':
			verbose = 1;
			break;
		case 'h':
			usage();
			return 0;
		default:
			usage();
			return 0;
		}
	}
	argc -= optind;
	if (argc > 2) {
		usage();
		return 0;
	}
	if (argc)
		target_us = atoll(argv[optind]) * 1000;
	if (argc > 1)
		max_runs = atoll(argv[optind + 1]);
	if (!target_us) {
		printf("ERROR: target ms must be > 0\n");
		usage();
		return 1;
	}

	// per-run statistics
	if ((runs_us = malloc(max_runs * sizeof (time_us))) == NULL) {
		printf("ERROR: can't allocate memory for %d runs\n", max_runs);
		return 1;
	}

	/*
	 * populate working set
	 */
	if (g_memsize) {
		printf("Allocating %llu Mbytes...\n",
		    g_memsize / (1024 * 1024));
		if ((g_mem = malloc(g_memsize)) == NULL) {
			printf("ERROR allocating -m memory. Exiting.\n");
			return 1;
		}
		pagesize = getpagesize();
		// touch one byte per page to fault in the whole mapping
		for (memp = g_mem; memp < (g_mem + g_memsize);
		    memp += pagesize) {
			memp[0] = 'A';
		}
	}

	/*
	 * determine target run count
	 */
	printf("Calibrating for %llu ms...", target_us / 1000);
	fflush(stdout);
	iter_count = find_count(target_us, test_us, test_runs, test, run);
	printf(" (target iteration count: %llu)\n", iter_count);

	signal(SIGINT, mainstop);
	time_us = 0;
	diff_pct = 0;

	// run loop
	fastest_time_us = ~0ULL;
	slowest_time_us = 0;
	for (i = 0; g_mainrun && i < max_runs; i++) {
		last_us = time_us;
		/*
		 * time the run
		 */
		getrusage(RUSAGE_SELF, &u[0]);
		gettimeofday(&ts[0], NULL);
		(void) run(iter_count);
		gettimeofday(&ts[1], NULL);
		getrusage(RUSAGE_SELF, &u[1]);

		/*
		 * calculate times
		 */
		time_us = 1000000 * (ts[1].tv_sec - ts[0].tv_sec) +
		    (ts[1].tv_usec - ts[0].tv_usec);
		if (time_us < fastest_time_us)
			fastest_time_us = time_us;
		if (time_us > slowest_time_us)
			slowest_time_us = time_us;
		runs_us[i] = time_us;
		if (last_us)
			// diff is this run vs. the previous run
			diff_pct = 100 * (((double)time_us / last_us) - 1);

		// status output
		if (!verbose) {
			printf("\rRun %d/%d, Ctrl-C to stop (%.2f%% diff) ",
			    i + 1, max_runs, diff_pct);
			fflush(stdout);
			continue;
		}

		// debug stats
		time_usr_us = 1000000 *
		    (u[1].ru_utime.tv_sec - u[0].ru_utime.tv_sec) +
		    (u[1].ru_utime.tv_usec - u[0].ru_utime.tv_usec);
		time_sys_us = 1000000 *
		    (u[1].ru_stime.tv_sec - u[0].ru_stime.tv_sec) +
		    (u[1].ru_stime.tv_usec - u[0].ru_stime.tv_usec);
		ivcs = u[1].ru_nivcsw - u[0].ru_nivcsw;

		// verbose output
		if (i == 0) {
			printf("%s %s %s %s %s %s\n", "run", "time(ms)",
			    "usr_time(ms)", "sys_time(ms)",
			    "involuntary_csw", "diff%");
			printf("%d %.2f %.1f %.1f %llu -\n", i + 1,
			    (double)time_us / 1000,
			    (double)time_usr_us / 1000,
			    (double)time_sys_us / 1000, ivcs);
		} else {
			printf("%d %.2f %.1f %.1f %llu %.1f\n", i + 1,
			    (double)time_us / 1000,
			    (double)time_usr_us / 1000,
			    (double)time_sys_us / 1000, ivcs, diff_pct);
		}
	}
	runs = i;
	if (runs < 2) {
		// not enough samples for the histogram and percentiles
		printf("\nToo few completed runs to analyze.\n");
		return 1;
	}

	/*
	 * post-process: histogram and percentiles
	 */
	total_time_us = 0;
	max_idx = 0;
	for (i = 0; i < runs; i++) {
		idx = hist_idx(100 *
		    (((double)runs_us[i] / fastest_time_us) - 1), BUCKETS);
		if (idx < 0) {
			// shouldn't happen
			printf("ERROR: negative hist idx; fix program.\n");
			return 1;
		}
		hist[idx]++;
		if (idx > max_idx)
			max_idx = idx;
		total_time_us += runs_us[i];
	}
	int max_bucket_count = 0;
	for (i = 0; i <= max_idx; i++) {
		if (hist[i] > max_bucket_count)
			max_bucket_count = hist[i];
	}

	/*
	 * print histogram and stats
	 */
	if (!verbose)
		printf("\n");
	printf("\nPerturbation percent by count for %llu ms runs:\n",
	    target_us / 1000);
	printf("%9s %6s %7s %s\n", "Slower%", "Count", "Count%", "Histogram");
	int bar;
	double min;
	for (i = 0; i <= max_idx; i++) {
		min = hist_val(i);
		printf("%8.1f%%%s %6d %6.2f%% ", min,
		    i == BUCKETS - 1 ? "+" : ":", hist[i],
		    (double)100 * hist[i] / runs);
		bar = myceil((double)bar_width * hist[i] / max_bucket_count);
		for (j = 0; j < bar; j++)
			printf("*");
		printf("\n");
	}

	qsort(runs_us, runs, sizeof (time_us), ullcmp);
	printf("\nPercentiles:");
	if (runs >= 3) {
		printf(" 50th: %.3f%%", (double)100 *
		    (runs_us[runs * 50 / 100 - 1] - fastest_time_us) /
		    fastest_time_us);
	}
	if (runs >= 10) {
		printf(", 90th: %.3f%%", (double)100 *
		    (runs_us[runs * 90 / 100 - 1] - fastest_time_us) /
		    fastest_time_us);
	}
	if (runs >= 100) {
		printf(", 99th: %.3f%%", (double)100 *
		    (runs_us[runs * 99 / 100 - 1] - fastest_time_us) /
		    fastest_time_us);
	}
	if (runs >= 3)
		printf(",");
	printf(" 100th: %.3f%%\n", (double)100 *
	    (runs_us[runs - 1] - fastest_time_us) / fastest_time_us);

	printf("Fastest: %.3f ms, 50th: %.3f ms, mean: %.3f ms, "
	    "slowest: %.3f ms\n",
	    (double)fastest_time_us / 1000,
	    (double)runs_us[runs * 50 / 100 - 1] / 1000,
	    (double)total_time_us / (runs * 1000),
	    (double)slowest_time_us / 1000);
	printf("Fastest rate: %llu/s, 50th: %llu/s, mean: %llu/s, "
	    "slowest: %llu/s\n",
	    iter_count * 1000000 / runs_us[0],
	    iter_count * 1000000 / runs_us[runs * 50 / 100 - 1],
	    iter_count * 1000000 / (total_time_us / runs),
	    iter_count * 1000000 / runs_us[runs - 1]);

	return (0);
}
--------------------------------------------------------------------------------