├── README.md
└── p1bench.c

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
p1bench
=======

Perturbation benchmark.

This is intended to be used before other benchmark tests, to investigate CPU and memory variation. It helps you better interpret the results of any microbenchmark by characterizing the variance that will sway them.

Let's say you wanted to do a 500 ms CPU benchmark of gzip performance: p1bench can be run with a 500 ms interval to show the baseline variation you may see, based on a simple spin loop, before running the more complex gzip microbenchmark.

p1bench can also be run in a mode (-m) to test memory variation.

## Operating Systems

Tested on Linux and OSX. Should work anywhere with a C compiler and libpthread.

## Compile

```
gcc -O0 -pthread -o p1bench p1bench.c
```

-O0 matters: the benchmark loops do no real work, and higher optimization levels would elide them.

## Screenshots

CPU test for a 500 ms duration:
```
$ ./p1bench 500
Calibrating for 500 ms... (target iteration count: 220661127)
Run 100/100, Ctrl-C to stop (-0.30% diff)

Perturbation percent by count for 500 ms runs:
  Slower%   Count  Count% Histogram
     0.0%:      5   5.00% *********
     0.1%:      5   5.00% *********
     0.2%:      2   2.00% ****
     0.3%:      2   2.00% ****
     0.4%:     11  11.00% *******************
     0.5%:      4   4.00% *******
     0.6%:      3   3.00% *****
     0.7%:      4   4.00% *******
     0.8%:      5   5.00% *********
     0.9%:      1   1.00% **
     1.0%:     30  30.00% **************************************************
     2.0%:     27  27.00% *********************************************
     3.0%:      1   1.00% **

Percentiles: 50th: 1.356%, 90th: 2.439%, 99th: 2.954%, 100th: 3.035%
Fastest: 500.715 ms, 50th: 507.504 ms, mean: 507.359 ms, slowest: 515.911 ms
Fastest rate: 440692064/s, 50th: 434796823/s, mean: 434921085/s, slowest: 427711614/s
```

Many numbers are printed to characterize variance, and the histogram shows it visually. Just from the histogram, I'd expect a variance of up to about 3% (fastest to slowest) for a CPU microbenchmark of the same duration (500 ms).

This is a custom histogram, where the bin size varies:

- variation 0 - 1%: 0.1% bin size
- variation 1 - 20%: 1% bin size
- variation 20+%: 10% bin size

For example, a run 0.35% slower than the fastest falls in the 0.3% bin, and one 5.2% slower falls in the 5.0% bin; see the sketch below for the full mapping.
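This mapping is what hist_idx() in p1bench.c implements. Here's a condensed, standalone sketch of the same bucketing (bin_idx is a renamed illustration, not part of p1bench):

```
#include <stdio.h>

/* condensed version of p1bench's hist_idx() bucketing */
int bin_idx(double pct)
{
	if (pct < 1)
		return (int)(10 * pct);		/* 0.1% bins: idx 0-9 */
	if (pct < 20)
		return 10 + (int)(pct - 1);	/* 1% bins: idx 10-28 */
	return 29 + (int)((pct - 20) / 10);	/* 10% bins: idx 29+ */
}

int main()
{
	/* 0.35 -> 3 (0.3% bin), 5.2 -> 14 (5.0% bin), 27 -> 29 (20% bin) */
	printf("%d %d %d\n", bin_idx(0.35), bin_idx(5.2), bin_idx(27.0));
	return 0;
}
```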
Here's a much noisier system:

```
$ ./p1bench 500
Calibrating for 500 ms... (target iteration count: 206071738)
Run 100/100, Ctrl-C to stop (0.85% diff)

Perturbation percent by count for 500 ms runs:
  Slower%   Count  Count% Histogram
     0.0%:      1   1.00% ****
     0.1%:      0   0.00%
     0.2%:      0   0.00%
     0.3%:      0   0.00%
     0.4%:      1   1.00% ****
     0.5%:      0   0.00%
     0.6%:      0   0.00%
     0.7%:      0   0.00%
     0.8%:      1   1.00% ****
     0.9%:      0   0.00%
     1.0%:      1   1.00% ****
     2.0%:     10  10.00% ********************************
     3.0%:      7   7.00% **********************
     4.0%:     12  12.00% **************************************
     5.0%:     14  14.00% ********************************************
     6.0%:     10  10.00% ********************************
     7.0%:     16  16.00% **************************************************
     8.0%:      5   5.00% ****************
     9.0%:     10  10.00% ********************************
    10.0%:      5   5.00% ****************
    11.0%:      1   1.00% ****
    12.0%:      2   2.00% *******
    13.0%:      1   1.00% ****
    14.0%:      1   1.00% ****
    15.0%:      1   1.00% ****
    16.0%:      0   0.00%
    17.0%:      0   0.00%
    18.0%:      0   0.00%
    19.0%:      0   0.00%
    20.0%:      0   0.00%
    30.0%:      1   1.00% ****

Percentiles: 50th: 6.258%, 90th: 10.085%, 99th: 15.431%, 100th: 38.078%
Fastest: 485.364 ms, 50th: 515.739 ms, mean: 518.336 ms, slowest: 670.182 ms
Fastest rate: 424571533/s, 50th: 399565939/s, mean: 397564008/s, slowest: 307486232/s
```

If I wanted to do a 500 ms CPU microbenchmark on this system, well, I'd find another system.

USAGE:
```
USAGE: p1bench [-hv] [-m Mbytes] [time(ms) [count]]
                   -v         # verbose: per run details
                   -m Mbytes  # memory test working set
   eg,
       p1bench          # 100ms (default) CPU spin loop
       p1bench 300      # 300ms CPU spin loop
       p1bench 300 100  # 300ms CPU spin loop, 100 times
       p1bench -m 1024  # 1GB memory read loop
```
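As a concrete workflow, tying this to the gzip scenario from the introduction (the file name is hypothetical): baseline first, then run the real microbenchmark at the same duration:

```
$ ./p1bench 500 100        # characterize 500 ms baseline variation first
$ time gzip -9 data0.bin   # then the real ~500 ms gzip microbenchmark
```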
--------------------------------------------------------------------------------
/p1bench.c:
--------------------------------------------------------------------------------
/*
 * p1bench - perturbation benchmark. Tests simple CPU or memory loops.
 *
 * This is intended to be used before other benchmark tests, to investigate
 * CPU and memory variation. This helps you better interpret the results of any
 * microbenchmark test, by characterizing variance that will sway results.
 *
 * Let's say you wanted to do a 500 ms CPU benchmark of gzip performance:
 * p1bench can be run with a 500 ms interval to show what baseline variation
 * you may see, based on a simple spin loop, before running the more complex
 * gzip microbenchmark.
 *
 * p1bench can also be run in a mode (-m) to test memory variation.
 *
 * gcc -O0 -pthread -o p1bench p1bench.c
 *
 * USAGE: see -h for usage.
 *
 * Copyright 2018 Netflix, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License")
 *
 * 03-Jan-2018	Brendan Gregg	Created this.
 */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <errno.h>
#include <pthread.h>
#include <sys/time.h>
#include <sys/resource.h>

void usage()
{
	printf("USAGE: p1bench [-hv] [-m Mbytes] [time(ms) [count]]\n"
	    "                   -v         # verbose: per run details\n"
	    "                   -m Mbytes  # memory test working set\n"
	    "   eg,\n"
	    "       p1bench          # 100ms (default) CPU spin loop\n"
	    "       p1bench 300      # 300ms CPU spin loop\n"
	    "       p1bench 300 100  # 300ms CPU spin loop, 100 times\n"
	    "       p1bench -m 1024  # 1GB memory read loop\n");
}

/*
 * These functions aren't just for code cleanliness: they show up in profilers
 * when doing active benchmarking to debug the benchmark.
 */

int g_testrun = 1;
void teststop(int dummy) {
	g_testrun = 0;
}

// calibration loop: count iterations until signaled to stop
void *spintest(void *arg)
{
	signal(SIGUSR1, teststop);
	unsigned long long *count = (unsigned long long *)arg;
	for (;g_testrun;) { (*count)++; }
	return NULL;
}

// timed loop: spin for a fixed iteration count
unsigned long long spinrun(unsigned long long count)
{
	unsigned long long i;
	// check compiler doesn't elide this (-O0)
	for (i = 0; i < count; i++) {;}
	return i;
}

// memory parameters
char *g_mem;
unsigned long long g_memsize;
unsigned long long g_stride;

// calibration loop: strided reads over the working set until signaled
void *memtest(void *arg)
{
	unsigned long long *count = (unsigned long long *)arg;
	char *memp;
	int junk = 0;	// sink, so the load isn't dead code

	signal(SIGUSR1, teststop);
	memp = g_mem;
	for (;g_testrun;) {
		junk += memp[0];
		memp += g_stride;
		if (memp >= (g_mem + g_memsize))	// wrap before passing the end
			memp = g_mem;
		(*count)++;
	}

	return NULL;
}

// timed loop: strided reads for a fixed iteration count
unsigned long long memrun(unsigned long long count)
{
	char *memp;
	unsigned long long i;
	int junk = 0;

	memp = g_mem;
	for (i = 0; i < count; i++) {
		junk += memp[0];
		memp += g_stride;
		if (memp >= (g_mem + g_memsize))
			memp = g_mem;
	}
	return i;
}

/*
 * Runs the loop function for the target_us while incrementing count.
 * This gives us a ballpark figure of the target count.
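 *
 * For example (hypothetical numbers): if the loop completes 44,000,000
 * iterations in a 100 ms test run, and the fastest fixed-count timing in
 * find_count() below takes 99,800 us, then a 500 ms target scales to
 * 44000000 * 500000 / 99800 ~= 220 million iterations: the "target
 * iteration count" printed at startup.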
 */
void test_run(unsigned long long target_us, unsigned long long *count,
    void *(*loop)(void *))
{
	pthread_t thread;
	int err;

	if (!target_us)
		return;

	g_testrun = 1;
	(*count) = 0;
	if ((err = pthread_create(&thread, NULL, loop, count)) != 0) {
		errno = err;	// pthread calls return the error, not errno
		perror("Thread create failed");
		exit(1);
	}
	usleep(target_us);
	if ((err = pthread_kill(thread, SIGUSR1)) != 0) {
		errno = err;
		perror("Couldn't terminate worker thread normally");
		exit(1);
	}
	pthread_join(thread, NULL);
}

/*
 * Finds a ballpark target count, then runs the real run function with that
 * count several times (test_runs) to fine tune the target count.
 */
unsigned long long find_count(unsigned long long target_us,
    int test_us, int test_runs,
    void *(*test)(void *),
    unsigned long long (*run)(unsigned long long))
{
	unsigned long long time_us;
	unsigned long long fastest_time_us = ~0ULL;
	unsigned long long iter_count = 0;
	static struct timeval ts[2];
	int i;

	// ballpark: count iterations completed in test_us
	test_run(test_us, &iter_count, test);
	// refine: time fixed-count runs, keep the fastest
	for (i = 0; i < test_runs; i++) {
		gettimeofday(&ts[0], NULL);
		(void) run(iter_count);
		gettimeofday(&ts[1], NULL);
		time_us = 1000000 * (ts[1].tv_sec - ts[0].tv_sec) +
		    (ts[1].tv_usec - ts[0].tv_usec);
		if (time_us < fastest_time_us)
			fastest_time_us = time_us;
	}
	// scale the count to the target duration
	return iter_count * target_us / fastest_time_us;
}

/*
 * Value to histogram index.
 * This is a custom histogram with the following ranges:
 * value 0-1: 0.1 step size, idx 0-9
 * value 1-20: 1 step size, idx 10-28
 * value 20+: 10 step size, idx 29+
 */
static int hist_idx(double value, int buckets)
{
	int idx;

	if (value < 1)
		idx = (int)(10 * value);
	else if (value < 20)
		// idx = previous_steps + (value - min_range_value)
		idx = 10 + (int)(value - 1);
	else
		// idx = previous_steps + (value - min_range_value)/10
		idx = 29 + (int)((value - 20) / 10);

	if (idx > buckets - 1)
		idx = buckets - 1;
	return idx;
}

// histogram index to minimum value
double hist_val(int idx)
{
	if (idx < 10)
		return (double)idx / 10;
	else if (idx < 29)
		// 10 -> 1.0
		// 11 -> 2
		// value = idx - previous_steps + min_range_value
		return (double)idx - 10 + 1;
	else
		// value = (idx - previous_steps) * 10 + min_range_value
		return (double)(idx - 29) * 10 + 20;
}

static int ullcmp(const void *p1, const void *p2)
{
	unsigned long long a = *(unsigned long long *)p1;
	unsigned long long b = *(unsigned long long *)p2;
	// compare without truncating an unsigned difference to int
	return (a > b) - (a < b);
}

// not worth -lm for this
int myceil(double x)
{
	if (x > (int)x)
		return (int)x + 1;
	return (int)x;
}

int g_mainrun = 1;
void mainstop(int dummy) {
	g_mainrun = 0;
	printf("stopping...\n");
}

// histogram bucket count
#define BUCKETS 200

int main(int argc, char *argv[])
{
	unsigned long long iter_count, time_us, time_usr_us,
	    time_sys_us, ivcs, last_us, total_time_us, fastest_time_us,
	    slowest_time_us;
	unsigned long long target_us = 100 * 1000;	// default target: 100 ms
	double diff_pct;
	static struct timeval ts[2];
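	// ts[]: wall-clock stamps around each run; u[]: rusage before/after,
	// for the verbose per-run statistics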
	struct rusage u[2];
	int test_us = 100 * 1000;
	int test_runs = 5;	// calibration
	int max_runs = 100;
	int verbose = 0;
	int hist[BUCKETS] = {0};
	int bar_width = 50;
	int c, i, j, runs, idx, max_idx;
	unsigned long long *runs_us;
	unsigned long long pagesize;
	char *memp;
	unsigned long long (*run)(unsigned long long) = spinrun;
	void *(*test)(void *) = spintest;

	// defaults
	g_stride = 64;	// one cache line on most x86 CPUs
	g_memsize = 0;

	// options
	while ((c = getopt(argc, argv, "hm:v")) != -1) {
		switch (c) {
		case 'm':
			// widen before scaling to avoid int overflow
			g_memsize = (unsigned long long)atoll(optarg) *
			    1024 * 1024;
			if (!g_memsize) {
				printf("-m Mbytes must be non-zero\n");
				usage();
				return 0;
			}
			run = memrun;
			test = memtest;
			break;
		case 'v':
			verbose = 1;
			break;
		case 'h':
			usage();
			return 0;
		default:
			usage();
			return 0;
		}
	}
	argc -= optind;
	if (argc > 2) {
		usage();
		return 0;
	}
	if (argc)
		target_us = atoll(argv[optind]) * 1000;
	if (argc > 1)
		max_runs = atoll(argv[optind + 1]);
	if (!target_us || max_runs < 1) {
		printf("ERROR: target ms and count must be > 0\n");
		usage();
		return 1;
	}

	// per-run statistics
	if ((runs_us = malloc(max_runs * sizeof (time_us))) == NULL) {
		printf("ERROR: can't allocate memory for %d runs\n", max_runs);
		return 1;
	}

	/*
	 * populate working set
	 */
	if (g_memsize) {
		printf("Allocating %llu Mbytes...\n",
		    g_memsize / (1024 * 1024));
		if ((g_mem = malloc(g_memsize)) == NULL) {
			printf("ERROR allocating -m memory. Exiting.\n");
			return 1;
		}
		// touch each page so it is mapped before the test
		pagesize = getpagesize();
		for (memp = g_mem; memp < (g_mem + g_memsize);
		    memp += pagesize) {
			memp[0] = 'A';
		}
	}

	/*
	 * determine target run count
	 */
	printf("Calibrating for %llu ms...", target_us / 1000);
	fflush(stdout);
	iter_count = find_count(target_us, test_us, test_runs, test, run);
	printf(" (target iteration count: %llu)\n", iter_count);

	signal(SIGINT, mainstop);
	time_us = 0;
	diff_pct = 0;

	// run loop
	fastest_time_us = ~0ULL;
	slowest_time_us = 0;
	for (i = 0; g_mainrun && i < max_runs; i++) {
		last_us = time_us;
		/*
		 * spin time, with timeout
		 */
		getrusage(RUSAGE_SELF, &u[0]);
		gettimeofday(&ts[0], NULL);
		(void) run(iter_count);
		gettimeofday(&ts[1], NULL);
		getrusage(RUSAGE_SELF, &u[1]);

		/*
		 * calculate times
		 */
		time_us = 1000000 * (ts[1].tv_sec - ts[0].tv_sec) +
		    (ts[1].tv_usec - ts[0].tv_usec);
		if (time_us < fastest_time_us)
			fastest_time_us = time_us;
		if (time_us > slowest_time_us)
			slowest_time_us = time_us;
		runs_us[i] = time_us;
		if (last_us)
			diff_pct = 100 * (((double)time_us / last_us) - 1);

		// status output
		if (!verbose) {
			printf("\rRun %d/%d, Ctrl-C to stop (%.2f%% diff)  ",
			    i + 1, max_runs, diff_pct);
			fflush(stdout);
			continue;
		}

		// debug stats
		time_usr_us = 1000000 *
		    (u[1].ru_utime.tv_sec - u[0].ru_utime.tv_sec) +
		    (u[1].ru_utime.tv_usec - u[0].ru_utime.tv_usec);
		time_sys_us = 1000000 *
		    (u[1].ru_stime.tv_sec - u[0].ru_stime.tv_sec) +
		    (u[1].ru_stime.tv_usec - u[0].ru_stime.tv_usec);
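		// involuntary context switches during the run: a direct
		// indicator of preemption perturbing the benchmark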
		ivcs = u[1].ru_nivcsw - u[0].ru_nivcsw;

		// verbose output
		if (i == 0) {
			printf("%s %s %s %s %s %s\n", "run", "time(ms)",
			    "usr_time(ms)", "sys_time(ms)",
			    "involuntary_csw", "diff%");
			printf("%d %.2f %.1f %.1f %llu -\n", i + 1,
			    (double)time_us / 1000,
			    (double)time_usr_us / 1000,
			    (double)time_sys_us / 1000, ivcs);
		} else {
			printf("%d %.2f %.1f %.1f %llu %.1f\n", i + 1,
			    (double)time_us / 1000,
			    (double)time_usr_us / 1000,
			    (double)time_sys_us / 1000, ivcs, diff_pct);
		}
	}
	runs = i;
	if (runs < 2) {
		// e.g., Ctrl-C during the first run
		printf("\nToo few runs to summarize.\n");
		return 1;
	}

	/*
	 * post-process: histogram and percentiles.
	 * Each run becomes a perturbation percentage relative to the fastest
	 * run: 100 * (run_us / fastest_us - 1). E.g., a 507504 us run against
	 * a 500715 us fastest is ~1.36% slower: the 1.0% histogram bin.
	 */
	total_time_us = 0;
	max_idx = 0;
	for (i = 0; i < runs; i++) {
		idx = hist_idx(100 *
		    (((double)runs_us[i] / fastest_time_us) - 1), BUCKETS);
		if (idx < 0) {
			// shouldn't happen
			printf("ERROR: negative hist idx; fix program.\n");
			return 1;
		}
		hist[idx]++;
		if (idx > max_idx)
			max_idx = idx;
		total_time_us += runs_us[i];
	}
	int max_bucket_count = 0;
	for (i = 0; i <= max_idx; i++) {
		if (hist[i] > max_bucket_count)
			max_bucket_count = hist[i];
	}

	/*
	 * print histogram and stats
	 */
	if (!verbose)
		printf("\n");
	printf("\nPerturbation percent by count for %llu ms runs:\n",
	    target_us / 1000);
	printf("%9s %6s %7s %s\n", "Slower%", "Count", "Count%", "Histogram");
	int bar;
	double min;
	for (i = 0; i <= max_idx; i++) {
		min = hist_val(i);
		printf("%8.1f%%%s %6d %6.2f%% ", min,
		    i == BUCKETS - 1 ? "+" : ":", hist[i],
		    (double)100 * hist[i] / runs);
		bar = myceil((double)bar_width * hist[i] / max_bucket_count);
		for (j = 0; j < bar; j++)
			printf("*");
		printf("\n");
	}

	qsort(runs_us, runs, sizeof (time_us), ullcmp);
	printf("\nPercentiles:");
	if (runs >= 3) {
		printf(" 50th: %.3f%%", (double)100 *
		    (runs_us[runs * 50 / 100 - 1] - fastest_time_us) /
		    fastest_time_us);
	}
	if (runs >= 10) {
		printf(", 90th: %.3f%%", (double)100 *
		    (runs_us[runs * 90 / 100 - 1] - fastest_time_us) /
		    fastest_time_us);
	}
	if (runs >= 100) {
		printf(", 99th: %.3f%%", (double)100 *
		    (runs_us[runs * 99 / 100 - 1] - fastest_time_us) /
		    fastest_time_us);
	}
	if (runs >= 3)
		printf(",");
	printf(" 100th: %.3f%%\n", (double)100 *
	    (runs_us[runs - 1] - fastest_time_us) / fastest_time_us);

	printf("Fastest: %.3f ms, 50th: %.3f ms, mean: %.3f ms, "
	    "slowest: %.3f ms\n",
	    (double)fastest_time_us / 1000,
	    (double)runs_us[runs * 50 / 100 - 1] / 1000,
	    (double)total_time_us / (runs * 1000),
	    (double)slowest_time_us / 1000);
	printf("Fastest rate: %llu/s, 50th: %llu/s, mean: %llu/s, "
	    "slowest: %llu/s\n",
	    iter_count * 1000000 / runs_us[0],
	    iter_count * 1000000 / runs_us[runs * 50 / 100 - 1],
	    iter_count * 1000000 / (total_time_us / runs),
	    iter_count * 1000000 / runs_us[runs - 1]);

	return (0);
}
--------------------------------------------------------------------------------