├── tuned
    ├── gs-isol-cpus
    │   ├── vars.conf
    │   └── tuned.conf
    ├── gs-isol-cpus-hz
    │   ├── vars.conf
    │   └── tuned.conf
    ├── gs-isol-cpus-half-hz
    │   ├── vars.conf
    │   └── tuned.conf
    └── gs-latency
    │   └── tuned.conf
├── .gitmodules
├── CMakeLists.txt
├── makefile
├── ptp-clock-future.h
├── util.h
├── attic
    └── tsc.h
├── doc
    ├── osjitter-atom-C3758-hz.txt
    ├── osjitter-atom-C3758-half-hz.txt
    ├── pingpong-atom-C3758-write-early-enabled.txt
    ├── pingpong-atom-C3758-half-hz.txt
    └── pingpong-atom-C3758.txt
├── helper
    ├── bench2tidy.py
    └── bench_playbook.py
├── tsc.h
├── bench_syscalls.cc
├── util.c
├── ptp-clock-offset.c
├── README.md
├── osjitter.c
├── pingpong.c
└── LICENSE


/tuned/gs-isol-cpus/vars.conf:
--------------------------------------------------------------------------------
1 | isolated_cores=5-7
2 | 


--------------------------------------------------------------------------------
/tuned/gs-isol-cpus-hz/vars.conf:
--------------------------------------------------------------------------------
1 | isolated_cores=5-7
2 | 


--------------------------------------------------------------------------------
/tuned/gs-isol-cpus-half-hz/vars.conf:
--------------------------------------------------------------------------------
1 | isolated_cores=5-7
2 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "benchmark"]
2 | 	path = benchmark
3 | 	url = https://github.com/google/benchmark.git
4 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.1)
 2 | project(bench_syscalls CXX)
 3 | 
 4 | # Google Benchmark is built from the git submodule in ./benchmark
 5 | add_subdirectory(benchmark)
 6 | 
 7 | # Micro-benchmark executable, linked against the Google Benchmark library
 8 | add_executable(bench_syscalls bench_syscalls.cc)
 9 | # PRIVATE: an executable has no consumers to propagate the dependency to
10 | target_link_libraries(bench_syscalls PRIVATE benchmark::benchmark)
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
 1 | # Builds osjitter and pingpong by default; ptp-clock-offset only on request.
 2 | CFLAGSW_GCC = -Wall -Wextra -Wno-missing-field-initializers \
 3 |     -Wno-parentheses -Wno-missing-braces \
 4 |     -Wmissing-prototypes -Wfloat-equal \
 5 |     -Wwrite-strings -Wpointer-arith -Wcast-align \
 6 |     -Wnull-dereference \
 7 |     -Werror=multichar -Werror=sizeof-pointer-memaccess -Werror=return-type \
 8 |     -fstrict-aliasing
 9 | # threading + debug info (CFLAGS0) and optimization level (CFLAGS1)
10 | CFLAGS0 = -pthread -g
11 | CFLAGS1 = -O3
12 | 
13 | CFLAGS = $(CFLAGSW_GCC) $(CFLAGS0) $(CFLAGS1)
14 | 
15 | .PHONY: all
16 | all: osjitter pingpong
17 | # Programs have no recipes (make's implicit rules apply); they just also need util.o
18 | osjitter: util.o
19 | 
20 | pingpong: util.o
21 | 
22 | ptp-clock-offset: util.o
23 | 
24 | .PHONY: clean
25 | clean:
26 | 	rm -f osjitter osjitter.o util.o pingpong pingpong.o ptp-clock-offset
27 | 


--------------------------------------------------------------------------------
/ptp-clock-future.h:
--------------------------------------------------------------------------------
 1 | #ifndef PTP_CLOCK_FUTURE_H
 2 | #define PTP_CLOCK_FUTURE_H
 3 | 
 4 | #include <linux/ptp_clock.h>
 5 | 
 6 | 
 7 | // Note that PTP_SYS_OFFSET_EXTENDED is missing on some RHEL 7 versions, even
 8 | // though PTP_SYS_OFFSET_PRECISE is available there.
 9 | 
10 | 
11 | // imported from https://sourceforge.net/p/linuxptp/code/ci/61c6a708980217119e829e4b41ea2504e673e4fb/
12 | #ifndef PTP_SYS_OFFSET_EXTENDED
13 | // Fallback definitions, only compiled when <linux/ptp_clock.h> lacks them.
14 | #define PTP_SYS_OFFSET_EXTENDED \
15 | 	_IOWR(PTP_CLK_MAGIC, 9, struct ptp_sys_offset_extended)
16 | 
17 | struct ptp_sys_offset_extended {
18 | 	unsigned int n_samples; /* Desired number of measurements. */
19 | 	unsigned int rsv[3];    /* Reserved for future use. */
20 | 	/*
21 | 	 * Array of [system, phc, system] time stamps. The kernel will provide
22 | 	 * 3*n_samples time stamps.
23 | 	 */
24 | 	struct ptp_clock_time ts[PTP_MAX_SAMPLES][3];
25 | };
26 | 
27 | #endif /* PTP_SYS_OFFSET_EXTENDED */
28 | 
29 | 
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/util.h:
--------------------------------------------------------------------------------
 1 | 
 2 | // 2019, Georg Sauthoff <mail@gms.tf>
 3 | //
 4 | // SPDX-License-Identifier: GPL-3.0-or-later
 5 | 
 6 | #ifndef OSJITTER_UTIL_H
 7 | #define OSJITTER_UTIL_H
 8 | 
 9 | #include <stdint.h>
10 | #include <stddef.h>
11 | 
12 | // Three-way comparison of two uint32_t values with a qsort()-compatible
13 | // signature: yields -1, 0 or 1 when *a is less than, equal to or
14 | // greater than *b.
15 | static inline int cmp_u32(const void *a, const void *b)
16 | {
17 |     const uint32_t lhs = *(const uint32_t *)a;
18 |     const uint32_t rhs = *(const uint32_t *)b;
19 | 
20 |     // difference of two 0/1 comparison results: always -1, 0 or 1
21 |     return (lhs > rhs) - (lhs < rhs);
22 | }
23 | 
24 | // Computes (cyc * mult) >> shift using a 128 bit intermediate product;
25 | // the Linux kernel has a function of the same name.
26 | static inline uint64_t mul_u64_u32_shr(uint64_t cyc, uint32_t mult, uint32_t shift)
27 | {
28 |     const __uint128_t product = (__uint128_t)cyc * mult;
29 | 
30 |     return (uint64_t)(product >> shift);
31 | }
32 | 
33 | void perror_e(int r, const char *msg);
34 | 
35 | uint32_t percentile_u32(const uint32_t *x, size_t n, size_t a, size_t b);
36 | uint32_t mad_u32(const uint32_t *x, uint32_t *y, size_t n);
37 | 
38 | int get_tsc_khz(uint32_t *tsc_khz);
39 | 
40 | void clocks_calc_mult_shift(
41 |         uint32_t *mult, uint32_t *shift, uint32_t from, uint32_t to,
42 |         uint32_t maxsec);
43 | 
44 | int get_tsc_perf(uint32_t *mult, uint32_t *shift);
45 | 
46 | #endif
47 | 


--------------------------------------------------------------------------------
/attic/tsc.h:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>    // uint64_t
 2 | #include <x86intrin.h> // __rdtsc(), _mm_lfence(), ...
 3 | 
 4 | // Read the TSC, fenced against both earlier and later instructions.
 5 | extern __inline uint64_t __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 6 | double_fenced_rdtsc(void)
 7 | {
 8 |     // https://www.felixcloutier.com/x86/rdtsc
 9 |     // If software requires RDTSC to be executed only after all previous
10 |     // instructions have executed and all previous loads and stores are
11 |     // globally visible, it can execute the sequence MFENCE;LFENCE immediately
12 |     // before RDTSC.
13 |     // If software requires RDTSC to be executed prior to execution of any
14 |     // subsequent instruction (including any memory accesses), it can execute
15 |     // the sequence LFENCE immediately after RDTSC.
16 |     _mm_mfence();
17 |     _mm_lfence();
18 |     uint64_t r = __rdtsc();
19 |     _mm_lfence();
20 |     return r;
21 | }
22 | 
23 | extern __inline uint64_t __attribute__((__gnu_inline__, __always_inline__, __artificial__))
24 | far_fenced_rdtsc(void)
25 | {
26 |     // https://www.felixcloutier.com/x86/rdtsc (only the trailing fence is used):
27 |     // 'If software requires RDTSC to be executed prior to execution of any
28 |     // subsequent instruction (including any memory accesses), it can execute
29 |     // the sequence LFENCE immediately after RDTSC.'
30 |     const uint64_t tsc = __rdtsc();
31 |     _mm_lfence();
32 |     return tsc;
33 | }
34 | 
35 | 


--------------------------------------------------------------------------------
/doc/osjitter-atom-C3758-hz.txt:
--------------------------------------------------------------------------------
 1 | $ ./osjitter  -t 60
 2 |  CPU  TSC_khz  #intr  #delta  ovfl_ns  invol_ctx  sum_intr_ns  iratio  rt_s  loop_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns   max_ns  mad_ns
 3 |    0  2200000  71112   71112        0      30615   1307556496   0.022    60       22       5938    4131    7280   71342  122061    197748 32007581    1554
 4 |    1  2200000  69271   69271        0      31437    734304556   0.012    60       22       5454    2960    6852   12580   92996    114692 14820361    2171
 5 |    2  2200000  66890   66890        0       1130    868183998   0.014    60       22       2745    2383    3327    4860   18203   1733105 22469364     328
 6 |    3  2200000  70697   70697        0      14009    552432760   0.009    60       22       2847    2490    6605   12346   69640    190076  2848408     419
 7 |    4  2200000  67158   67158        0       1837    343931516   0.006    60       22       2820    2491    3488    5195   50712    440682  2845008     311
 8 |    5  2200000  60062   60062        0          1    166677345   0.003    60       22       2725    2428    3069    3299    4287      5413    12129     251
 9 |    6  2200000  60062   60062        0          1    161701916   0.003    60       22       2653    2360    2975    3191    4100      5133    13279     244
10 |    7  2200000  60062   60062        0          1    161126550   0.003    60       21       2644    2351    2964    3183    4066      5123    13364     241
11 | 


--------------------------------------------------------------------------------
/doc/osjitter-atom-C3758-half-hz.txt:
--------------------------------------------------------------------------------
 1 | $ ./osjitter  -t 60
 2 |  CPU  TSC_khz  #intr  #delta  ovfl_ns  invol_ctx  sum_intr_ns  iratio  rt_s  loop_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns   max_ns  mad_ns
 3 |    0  2200000  60273   60273        0        122    229869307   0.004    60       20       3482    3322    3740    3940    4648     36907  3929568     145
 4 |    1  2200000  60249   60249        0        617    170593054   0.003    60       20       2685    2339    2970    3101    7809     14140  2288695     249
 5 |    2  2200000  60307   60307        0        216    219753975   0.004    60       21       2581    2298    2872    3009    3909    453060  1941053     230
 6 |    3  2200000  60206   60206        0      25792    310482349   0.005    60       20       3426    2748    7840    7960    8382     13177  2528739     876
 7 |    4  2200000  60244   60244        0        450    176518457   0.003    60       20       2642    2310    2957    3093    4880     11532  4537616     256
 8 |    5  2200000  60061   60061        0          1    156952003   0.003    60       20       2606    2299    2907    3044    3396      4388    15060     242
 9 |    6  2200000  60061   60061        0          1    154872338   0.003    60       20       2566    2270    2870    3010    3333      4434    14995     241
10 |    7  2200000  60061   60061        0          1    154459207   0.003    60       20       2560    2260    2865    3000    3332      4420    12925     241
11 | 


--------------------------------------------------------------------------------
/helper/bench2tidy.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # SPDX-License-Identifier: GPL-3.0-or-later
 4 | # SPDX-FileCopyrightText: © 2021 Georg Sauthoff <mail@gms.tf>
 5 | 
 6 | import argparse
 7 | 
 8 | def is_not_a_point(line):
 9 |     for k in ('_mean"', '_median"', '_stddev"'):
10 |         if k in line:
11 |             return True
12 |     return False
13 | 
14 | def dump_csv(filename, host, o):
15 |     with open(filename) as f:
16 |         state = 0
17 |         for line in f:
18 |             if state == 0:
19 |                 if line.startswith('name,iterations,real_time,cpu_time,time_unit'):
20 |                     state = 1
21 |             elif state == 1:
22 |                 if is_not_a_point(line):
23 |                     continue
24 |                 i = line.rindex(',ns,')
25 |                 o.write(f'{host},{line[:i]}\n')
26 | 
27 | 
28 | def main(filenames, ofilename):
29 |     with open(ofilename, 'w') as f:
30 |         f.write('host,name,iterations,real_ns,cpu_ns\n')
31 |         for fn in filenames:
32 |             host = fn[fn.rindex('-')+1:-4]
33 |             dump_csv(fn, host, f)
34 | 
35 | def parse_args():
36 |     p = argparse.ArgumentParser()
37 |     p.add_argument('filenames', metavar='CSV_FILENAME', nargs='+',
38 |             help='hosts under test')
39 |     p.add_argument('--out', '-o', default='all.csv',
40 |             help='resulting CSV filename (default: %(default)s)')
41 |     return p.parse_args()
42 | 
43 | if __name__ == '__main__':
44 |     args = parse_args()
45 |     main(args.filenames, args.out)
46 | 
47 | 


--------------------------------------------------------------------------------
/doc/pingpong-atom-C3758-write-early-enabled.txt:
--------------------------------------------------------------------------------
 1 | $ ./pingpong --pin 0 6 --pin 1 5 --spin
 2 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
 3 |      0  2200000  500000     171     538        178     178     179     179     184       201       0 
 4 |      1  2200000  500000     169     531        175     174     175     175     175       186       0 
 5 | $ ./pingpong --pin 0 6 --pin 1 5 --spin
 6 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
 7 |      0  2200000  500000     176     551        178     178     179     179     185       251       0 
 8 |      1  2200000  500000     169     568        175     174     175     175     175       198       0 
 9 | $ ./pingpong --pin 0 6 --pin 1 5 --spin
10 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
11 |      0  2200000  500000     170     538        178     178     179     179     184       201       0 
12 |      1  2200000  500000     169     525        175     174     175     175     175       187       0 
13 | $ ./pingpong --pin 0 6 --pin 1 5 --spin-pause
14 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
15 |      0  2200000  500000     169     540        172     170     176     176     182       219       2 
16 |      1  2200000  500000     164     530        170     166     172     173     176       196       2 
17 | $ ./pingpong --pin 0 6 --pin 1 5 --spin-pause -p 2
18 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
19 |      0  2200000  500000     169     548        175     171     178     179     241       269       2 
20 |      1  2200000  500000     166     535        201     198     205     205     255       271       2
21 | 


--------------------------------------------------------------------------------
/tuned/gs-latency/tuned.conf:
--------------------------------------------------------------------------------
 1 | #
 2 | # tuned configuration
 3 | #
 4 | 
 5 | [main]
 6 | # based on /usr/lib/tuned/latency-performance/tuned.conf
 7 | summary=Aggressive latency settings
 8 | 
 9 | [variables]
10 | include=/etc/tuned/gs-isol-cpus/vars.conf
11 | not_isolated_cpumask = ${f:cpulist2hex_invert:${isolated_cores}}
12 | 
13 | [cpu]
14 | #force_latency=1
15 | # Should be equivalent to idle=poll processor.max_cstate=0 intel_idle.max_cstate=0
16 | force_latency=0
17 | governor=performance
18 | energy_perf_bias=performance
19 | min_perf_pct=100
20 | 
21 | [sysctl]
22 | # ktune sysctl settings for rhel6 servers, maximizing i/o throughput
23 | #
24 | # Minimal preemption granularity for CPU-bound tasks:
25 | # (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
26 | kernel.sched_min_granularity_ns=10000000
27 | 
28 | # If a workload mostly uses anonymous memory and it hits this limit, the entire
29 | # working set is buffered for I/O, and any more write buffering would require
30 | # swapping, so it's time to throttle writes until I/O can catch up.  Workloads
31 | # that mostly use file mappings may be able to use even higher values.
32 | #
33 | # The generator of dirty data starts writeback at this percentage (system default
34 | # is 20%)
35 | vm.dirty_ratio=10
36 | 
37 | # Start background writeback (via writeback threads) at this percentage (system
38 | # default is 10%)
39 | vm.dirty_background_ratio=3
40 | 
41 | # The swappiness parameter controls the tendency of the kernel to move
42 | # processes out of physical memory and onto the swap disk.
43 | # 0 tells the kernel to avoid swapping processes out of physical memory
44 | # for as long as possible
45 | # 100 tells the kernel to aggressively swap processes out of physical memory
46 | # and move them to swap cache
47 | vm.swappiness=10
48 | 
49 | # The total time the scheduler will consider a migrated process
50 | # "cache hot" and thus less likely to be re-migrated
51 | # (system default is 500000, i.e. 0.5 ms)
52 | kernel.sched_migration_cost_ns=5000000
53 | 
54 | # ^^^ above sysctl params from /usr/lib/tuned/latency-performance/tuned.conf
55 | vm.stat_interval=60
56 | 
57 | [sysfs]
58 | /sys/bus/workqueue/devices/writeback/cpumask = ${not_isolated_cpumask}
59 | /sys/devices/virtual/workqueue/cpumask = ${not_isolated_cpumask}
60 | # should be equivalent to mce=ignore_ce
61 | /sys/devices/system/machinecheck/machinecheck*/ignore_ce = 1
62 | 
63 | 
64 | 


--------------------------------------------------------------------------------
/doc/pingpong-atom-C3758-half-hz.txt:
--------------------------------------------------------------------------------
 1 | $ ./pingpong --pin 0 6 --pin 1 5 --sem
 2 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
 3 |      0  2200000  500000    4130   10298       4471    4370    4708    4801    5152      8041     123 
 4 |      1  2200000  500000    4058   10873       4484    4377    4738    4829    5201      8121     126 
 5 | $ ./pingpong --pin 0 6 --pin 1 5 --futex
 6 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
 7 |      0  2200000  500000    4003    8964       4270    4200    4527    4608    4966      7720      73 
 8 |      1  2200000  500000    2631    9746       4263    4186    4533    4692    5024      7660      91 
 9 | $ ./pingpong --pin 0 6 --pin 1 5 --pipe
10 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
11 |      0  2200000  500000    5290   12497       5862    5660    6405    6560    7006      9944     210 
12 |      1  2200000  500000    5246   12132       5730    5533    6209    6531    6985      9837     211 
13 | $ ./pingpong --pin 0 6 --pin 1 5 --cv
14 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
15 |      0  2200000  500000    4498    9666       4845    4776    5055    5181    5549      8419      79 
16 |      1  2200000  500000    4390   13333       4765    4669    5026    5174    5521      8239     107 
17 | $ ./pingpong --pin 0 6 --pin 1 5 --spin
18 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
19 |      0  2200000  500000     173    3540        179     178     179     180     186       259       0 
20 |      1  2200000  500000     170    3455        175     174     176     176     181       186       0 
21 | $ ./pingpong --pin 0 6 --pin 1 5 --spin-pause
22 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
23 |      0  2200000  500000     168    3667        176     170     177     180     202       276       3 
24 |      1  2200000  500000     166    3376        170     166     173     177     201       209       3
25 | $ ./pingpong --pin 0 6 --pin 1 5 --spin-pause -p 2
26 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
27 |      0  2200000  500000     169    3965        191     178     241     255     271       455      12 
28 |      1  2200000  500000     166    3719        198     181     245     261     271       444      16
29 | 


--------------------------------------------------------------------------------
/doc/pingpong-atom-C3758.txt:
--------------------------------------------------------------------------------
 1 | $ ./pingpong --pin 0 6 --pin 1 5  --sem                                         
 2 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
 3 |      0  2200000  500000    4704   12329       5123    4916    5660    6095    6548     10087     216 
 4 |      1  2200000  500000    4654   16630       5105    4906    5690    6093    6562     10080     210 
 5 | $ ./pingpong --pin 0 6 --pin 1 5  --futex
 6 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
 7 |      0  2200000  500000    4552   12166       5003    4870    5660    5936    6350     10100     160 
 8 |      1  2200000  500000    4482   11956       4863    4721    5542    5897    6349     10038     164 
 9 | $ ./pingpong --pin 0 6 --pin 1 5  --pipe 
10 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
11 |      0  2200000  500000    6047  831490       6654    6394    7539    7928    8394     12050     272 
12 |      1  2200000  500000    5900   14528       6551    6264    7387    7913    8324     11926     300 
13 | $ ./pingpong --pin 0 6 --pin 1 5  --cv  
14 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
15 |      0  2200000  500000    5290   13729       5700    5528    6548    6771    7224     10766     192 
16 |      1  2200000  500000    5088   15968       5541    5369    6338    6559    6959     10613     193 
17 | $ ./pingpong --pin 0 6 --pin 1 5  --spin
18 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
19 |      0  2200000  500000     176     538        178     178     179     179     185       268       0 
20 |      1  2200000  500000     172     535        175     174     175     175     181       205       0 
21 | $ ./pingpong --pin 0 6 --pin 1 5  --spin-pause
22 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
23 |      0  2200000  500000     171     612        180     179     180     180     186       206       0 
24 |      1  2200000  500000     167     616        176     176     176     176     176       200       0 
25 | $ ./pingpong --pin 0 6 --pin 1 5  --spin-pause -p 2
26 | Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns
27 |      0  2200000  500000     169     549        175     171     178     179     244       269       2 
28 |      1  2200000  500000     166     535        201     198     205     205     255       271       2 
29 | 


--------------------------------------------------------------------------------
/tsc.h:
--------------------------------------------------------------------------------
 1 | // 2019, Georg Sauthoff <mail@gms.tf>
 2 | //
 3 | // SPDX-License-Identifier: GPL-3.0-or-later
 4 | 
 5 | 
 6 | // Read Time-Stamp Counter
 7 | extern __inline uint64_t __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 8 | fenced_rdtsc(void)
 9 | {
10 |     uint64_t x;
11 |     asm volatile (
12 |         ".intel_syntax noprefix  \n\t" // switch to prettier syntax
13 |         // 'If software requires RDTSC to be executed only after all previous
14 |         // instructions have executed and all previous loads and stores are
15 |         // globally visible, it can execute the sequence MFENCE;LFENCE
16 |         // immediately before RDTSC.'
17 |         // https://www.felixcloutier.com/x86/rdtsc
18 |         "mfence                  \n\t"
19 |         "lfence                  \n\t"
20 |         // similar effect, execute CPUID before RDTSC
21 |         // cf. https://www.intel.de/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
22 |         //"cpuid                   \n\t" // writes to EAX, EBX, ECX, EDX
23 |         "rdtsc                   \n\t" // counter into EDX:EAX
24 |         "shl     rdx, 0x20       \n\t" // shift higher-half left
25 |         "or      rax, rdx        \n\t" // combine them
26 |         ".att_syntax prefix      \n\t" // switch back to the default syntax
27 | 
28 |         : "=a" (x)       // output operands,
29 |                          // i.e. overwrites (=)  R'a'X which is mapped to x
30 |         :                // input operands
31 |         : "rdx");        // additional clobbers (with cpuid also: rbx, rcx)
32 |     return x;
33 | }
34 | // Read Time-Stamp Counter and Processor ID (only the counter is returned)
35 | // 'The RDTSCP instruction is not a serializing instruction, but it does wait
36 | // until all previous instructions have executed and all previous loads are
37 | // globally visible.'
38 | // https://www.felixcloutier.com/x86/rdtscp
39 | extern __inline uint64_t __attribute__((__gnu_inline__, __always_inline__, __artificial__))
40 | fenced_rdtscp(void)
41 | {
42 |     uint64_t x;
43 |     asm volatile (
44 |         ".intel_syntax noprefix  \n\t" // switch to Intel syntax for this block
45 |         "rdtscp                  \n\t" // counter into EDX:EAX, id into ECX
46 |         // 'If software requires RDTSCP to be executed prior to execution of
47 |         // any subsequent instruction (including any memory accesses), it can
48 |         // execute LFENCE immediately after RDTSCP.'
49 |         // https://www.felixcloutier.com/x86/rdtscp
50 |         "lfence                  \n\t" // better than CPUID
51 |         // alternatively call CPUID (clobbers more registers, though)
52 |         // cf. https://www.intel.de/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
53 |         "shl     rdx, 0x20       \n\t" // shift higher-half left
54 |         "or      rax, rdx        \n\t" // combine them
55 |         ".att_syntax prefix      \n\t" // switch back to the default syntax
56 | 
57 |         : "=a" (x)       // output operands,
58 |                          // i.e. overwrites (=)  R'a'X which is mapped to x
59 |         :                // input operands
60 |         : "rdx", "rcx"); // additional clobbers (upper counter half, processor ID)
61 |     return x;
62 | }
63 | 


--------------------------------------------------------------------------------
/tuned/gs-isol-cpus-half-hz/tuned.conf:
--------------------------------------------------------------------------------
 1 | #
 2 | # tuned configuration
 3 | #
 4 | 
 5 | [main]
 6 | # based on /usr/lib/tuned/throughput-performance/tuned.conf
 7 | summary=Isolate CPUs as much as possible, i.e. as adaptive ticks CPUs
 8 | 
 9 | [variables]
10 | include=${i:PROFILE_DIR}/vars.conf
11 | 
12 | [cpu]
13 | governor=performance
14 | energy_perf_bias=performance
15 | min_perf_pct=100
16 | 
17 | [disk]
18 | # The default unit for readahead is KiB.  This can be adjusted to sectors
19 | # by specifying the relevant suffix, eg. (readahead => 8192 s). There must
20 | # be at least one space between the number and suffix (if suffix is specified).
21 | readahead=>4096
22 | 
23 | [sysctl]
24 | # ktune sysctl settings for rhel6 servers, maximizing i/o throughput
25 | #
26 | # Minimal preemption granularity for CPU-bound tasks:
27 | # (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
28 | kernel.sched_min_granularity_ns = 10000000
29 | 
30 | # SCHED_OTHER wake-up granularity.
31 | # (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
32 | #
33 | # This option delays the preemption effects of decoupled workloads
34 | # and reduces their over-scheduling. Synchronous workloads will still
35 | # have immediate wakeup/sleep latencies.
36 | kernel.sched_wakeup_granularity_ns = 15000000
37 | 
38 | # If a workload mostly uses anonymous memory and it hits this limit, the entire
39 | # working set is buffered for I/O, and any more write buffering would require
40 | # swapping, so it's time to throttle writes until I/O can catch up.  Workloads
41 | # that mostly use file mappings may be able to use even higher values.
42 | #
43 | # The generator of dirty data starts writeback at this percentage (system default
44 | # is 20%)
45 | vm.dirty_ratio = 40
46 | 
47 | # Start background writeback (via writeback threads) at this percentage (system
48 | # default is 10%)
49 | vm.dirty_background_ratio = 10
50 | 
51 | # PID allocation wrap value.  When the kernel's next PID value
52 | # reaches this value, it wraps back to a minimum PID value.
53 | # PIDs of value pid_max or larger are not allocated.
54 | #
55 | # A suggested value for pid_max is 1024 * <# of cpu cores/threads in system>
56 | # e.g., a box with 32 cpus, the default of 32768 is reasonable, for 64 cpus,
57 | # 65536, for 4096 cpus, 4194304 (which is the upper limit possible).
58 | #kernel.pid_max = 65536
59 | 
60 | # The swappiness parameter controls the tendency of the kernel to move
61 | # processes out of physical memory and onto the swap disk.
62 | # 0 tells the kernel to avoid swapping processes out of physical memory
63 | # for as long as possible
64 | # 100 tells the kernel to aggressively swap processes out of physical memory
65 | # and move them to swap cache
66 | vm.swappiness=10
67 | 
68 | # nohz=on - just to be explicit, it is already the default
69 | # rcu_nocbs= - implied by nohz_full=
70 | # tsc=reliable - avoid timer interruptions where the TSCs of the different cores are compared
71 | #                cf. clocksource_watchdog calls in ftrace traces
72 | [bootloader]
73 | cmdline=isolcpus=${isolated_cores} nohz=on rcu_nocbs=${isolated_cores} rcu_nocb_poll nowatchdog mce=ignore_ce acpi_irq_nobalance pcie_aspm=off tsc=reliable
74 | 
75 | 


--------------------------------------------------------------------------------
/tuned/gs-isol-cpus-hz/tuned.conf:
--------------------------------------------------------------------------------
 1 | #
 2 | # tuned configuration
 3 | #
 4 | 
 5 | [main]
 6 | # based on /usr/lib/tuned/throughput-performance/tuned.conf
 7 | summary=Isolate CPUs as much as possible, i.e. as adaptive ticks CPUs
 8 | 
 9 | [variables]
10 | include=${i:PROFILE_DIR}/vars.conf
11 | 
12 | [cpu]
13 | governor=performance
14 | energy_perf_bias=performance
15 | min_perf_pct=100
16 | 
17 | [disk]
18 | # The default unit for readahead is KiB.  This can be adjusted to sectors
19 | # by specifying the relevant suffix, eg. (readahead => 8192 s). There must
20 | # be at least one space between the number and suffix (if suffix is specified).
21 | readahead=>4096
22 | 
23 | [sysctl]
24 | # ktune sysctl settings for rhel6 servers, maximizing i/o throughput
25 | #
26 | # Minimal preemption granularity for CPU-bound tasks:
27 | # (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
28 | kernel.sched_min_granularity_ns = 10000000
29 | 
30 | # SCHED_OTHER wake-up granularity.
31 | # (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
32 | #
33 | # This option delays the preemption effects of decoupled workloads
34 | # and reduces their over-scheduling. Synchronous workloads will still
35 | # have immediate wakeup/sleep latencies.
36 | kernel.sched_wakeup_granularity_ns = 15000000
37 | 
38 | # If a workload mostly uses anonymous memory and it hits this limit, the entire
39 | # working set is buffered for I/O, and any more write buffering would require
40 | # swapping, so it's time to throttle writes until I/O can catch up.  Workloads
41 | # that mostly use file mappings may be able to use even higher values.
42 | #
43 | # The generator of dirty data starts writeback at this percentage (system default
44 | # is 20%)
45 | vm.dirty_ratio = 40
46 | 
47 | # Start background writeback (via writeback threads) at this percentage (system
48 | # default is 10%)
49 | vm.dirty_background_ratio = 10
50 | 
51 | # PID allocation wrap value.  When the kernel's next PID value
52 | # reaches this value, it wraps back to a minimum PID value.
53 | # PIDs of value pid_max or larger are not allocated.
54 | #
55 | # A suggested value for pid_max is 1024 * <# of cpu cores/threads in system>
56 | # e.g., a box with 32 cpus, the default of 32768 is reasonable, for 64 cpus,
57 | # 65536, for 4096 cpus, 4194304 (which is the upper limit possible).
58 | #kernel.pid_max = 65536
59 | 
60 | # The swappiness parameter controls the tendency of the kernel to move
61 | # processes out of physical memory and onto the swap disk.
62 | # 0 tells the kernel to avoid swapping processes out of physical memory
63 | # for as long as possible
64 | # 100 tells the kernel to aggressively swap processes out of physical memory
65 | # and move them to swap cache
66 | vm.swappiness=10
67 | 
68 | # nohz=off   - old-school scheduler behavior, i.e. disable dyntick-idle mode
69 | # rcu_nocbs= - implied by nohz_full=
70 | # tsc=reliable - avoid timer interruptions where the TSCs of the different cores are compared
71 | #                cf. clocksource_watchdog calls in ftrace traces
72 | [bootloader]
73 | cmdline=isolcpus=${isolated_cores} nohz=off rcu_nocbs=${isolated_cores} rcu_nocb_poll nowatchdog mce=ignore_ce acpi_irq_nobalance pcie_aspm=off tsc=reliable
74 | 
75 | 


--------------------------------------------------------------------------------
/tuned/gs-isol-cpus/tuned.conf:
--------------------------------------------------------------------------------
 1 | #
 2 | # tuned configuration
 3 | #
 4 | 
 5 | [main]
 6 | # based on /usr/lib/tuned/throughput-performance/tuned.conf
 7 | summary=Isolate CPUs as much as possible, i.e. as adaptive ticks CPUs
 8 | 
 9 | [variables]
10 | include=${i:PROFILE_DIR}/vars.conf
11 | 
12 | [cpu]
13 | governor=performance
14 | energy_perf_bias=performance
15 | min_perf_pct=100
16 | 
17 | [disk]
18 | # The default unit for readahead is KiB.  This can be adjusted to sectors
19 | # by specifying the relevant suffix, eg. (readahead => 8192 s). There must
20 | # be at least one space between the number and suffix (if suffix is specified).
21 | readahead=>4096
22 | 
23 | [sysctl]
24 | # ktune sysctl settings for rhel6 servers, maximizing i/o throughput
25 | #
26 | # Minimal preemption granularity for CPU-bound tasks:
27 | # (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
28 | kernel.sched_min_granularity_ns = 10000000
29 | 
30 | # SCHED_OTHER wake-up granularity.
31 | # (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
32 | #
33 | # This option delays the preemption effects of decoupled workloads
34 | # and reduces their over-scheduling. Synchronous workloads will still
35 | # have immediate wakeup/sleep latencies.
36 | kernel.sched_wakeup_granularity_ns = 15000000
37 | 
38 | # If a workload mostly uses anonymous memory and it hits this limit, the entire
39 | # working set is buffered for I/O, and any more write buffering would require
40 | # swapping, so it's time to throttle writes until I/O can catch up.  Workloads
41 | # that mostly use file mappings may be able to use even higher values.
42 | #
43 | # The generator of dirty data starts writeback at this percentage (system default
44 | # is 20%)
45 | vm.dirty_ratio = 40
46 | 
47 | # Start background writeback (via writeback threads) at this percentage (system
48 | # default is 10%)
49 | vm.dirty_background_ratio = 10
50 | 
51 | # PID allocation wrap value.  When the kernel's next PID value
52 | # reaches this value, it wraps back to a minimum PID value.
53 | # PIDs of value pid_max or larger are not allocated.
54 | #
55 | # A suggested value for pid_max is 1024 * <# of cpu cores/threads in system>
56 | # e.g., a box with 32 cpus, the default of 32768 is reasonable, for 64 cpus,
57 | # 65536, for 4096 cpus, 4194304 (which is the upper limit possible).
58 | #kernel.pid_max = 65536
59 | 
60 | # The swappiness parameter controls the tendency of the kernel to move
61 | # processes out of physical memory and onto the swap disk.
62 | # 0 tells the kernel to avoid swapping processes out of physical memory
63 | # for as long as possible
64 | # 100 tells the kernel to aggressively swap processes out of physical memory
65 | # and move them to swap cache
66 | vm.swappiness=10
67 | 
68 | # cf. https://unix.stackexchange.com/a/539266/1131
69 | # nohz=on - just to be explicit, it is already the default
70 | # rcu_nocbs= - implied by nohz_full=
71 | # tsc=reliable - avoid timer interruptions where the TSCs of the different cores are compared
72 | #                cf. clocksource_watchdog calls in ftrace traces
73 | [bootloader]
74 | cmdline=isolcpus=${isolated_cores} nohz=on nohz_full=${isolated_cores} rcu_nocbs=${isolated_cores} rcu_nocb_poll nowatchdog mce=ignore_ce acpi_irq_nobalance pcie_aspm=off tsc=reliable
75 | 
76 | 


--------------------------------------------------------------------------------
/helper/bench_playbook.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Distribute and run a benchmark on a bunch of hosts.
 4 | #
 5 | # SPDX-License-Identifier: GPL-3.0-or-later
 6 | # SPDX-FileCopyrightText: © 2021 Georg Sauthoff <mail@gms.tf>
 7 | 
 8 | import mitogen
 9 | import mitogen.select
10 | import mitogen.utils
11 | 
12 | import argparse
13 | import logging
14 | import os
15 | import platform
16 | import subprocess
17 | import tempfile
18 | 
19 | 
20 | log = logging.getLogger(__name__)
21 | 
22 | def bench(exe, bcmd):
23 |     with tempfile.TemporaryDirectory() as d:
24 |         exe_path = f'{d}/bench'
25 |         with open(exe_path, 'wb') as f:
26 |             f.write(exe)
27 |         os.chmod(exe_path, 0o755)
28 |         core = min(int(os.cpu_count()/2*1.5), os.cpu_count()-1)
29 |         ts = [ 'taskset', '-c', str(core) ]
30 |         subprocess.check_output(ts + bcmd, cwd=d)
31 |         hostname = platform.node().split('.', 1)[0]
32 |         with open(f'{d}/out.csv') as f:
33 |             csv = f.read()
34 |         with open('/proc/cmdline') as f:
35 |             cmdline = f.read().strip()
36 |         try:
37 |             tuned = subprocess.check_output(['/usr/sbin/tuned-adm', 'active'], universal_newlines=True)
38 |             tuned = tuned.split()[-1]
39 |         except:
40 |             tuned = ''
41 |         with open('/proc/cpuinfo') as f:
42 |             cpuinfo = f.read().splitlines()
43 |             cpuinfo = [ l.split(': ')[-1] for l in cpuinfo if l.startswith('model name') ][0]
44 |         return hostname, cpuinfo, cmdline, tuned, csv
45 | 
46 | 
47 | def main(router, hosts, exe_path, bcmd, out_dir):
48 |     with open(exe_path, 'rb') as f:
49 |         exe = f.read()
50 | 
51 |     cns = [ (router.ssh(hostname=h, python_path='/usr/bin/python3'), h) for h in hosts ]
52 | 
53 |     fs = []
54 |     for c, host in cns:
55 |         log.info(f'Starting bench on {host} ...')
56 |         fs.append(c.call_async(bench, exe, bcmd))
57 | 
58 |     with open(f'{out_dir}/hosts.csv', 'w') as g:
59 |         g.write('hostname,cpuinfo,cmdline,tuned\n')
60 |         for i, res in enumerate(mitogen.select.Select(fs)):
61 |             log.info(f'Receiving from {res.router._stream_by_id[res.src_id].conn.options.hostname} ...')
62 |             r = res.unpickle()
63 |             g.write(f'{r[0]},{r[1]},"{r[2]}",{r[3]}\n')
64 |             with open(f'{out_dir}/bench-{r[0]}.csv', 'w') as f:
65 |                 f.write(r[4])
66 | 
67 | def parse_args():
68 |     p = argparse.ArgumentParser()
69 |     p.add_argument('hosts', metavar='HOST', nargs='+',
70 |             help='hosts under test')
71 |     p.add_argument('--out', '-o', default='out',
72 |             help='local directory for storing collected benchmark results (default: %(default)s)')
73 |     p.add_argument('--exe', '-e', default='bench_syscalls',
74 |             help='executable to transfer and execute remotely (default: %(default)s)')
75 |     p.add_argument('-n', type=int, default=3,
76 |             help='benchmark repetitions (default: %(default)s)')
77 |     p.add_argument('--log', default='pb.log',
78 |             help='logfile (is more verbose than the console log) (default: %(default)s)')
79 |     args = p.parse_args()
80 |     return args
81 | 
82 | if __name__ == '__main__':
83 |     args = parse_args()
84 |     bcmd = [ './bench', '--benchmark_out_format=csv', '--benchmark_out=out.csv',
85 |              f'--benchmark_repetitions={args.n}' ]
86 |     os.makedirs(args.out, exist_ok=True)
87 |     mitogen.utils.log_to_file(args.log)
88 |     h = logging.StreamHandler()
89 |     h.setFormatter(logging.Formatter(
90 |         '%(asctime)s - %(levelname)-8s - %(message)s [%(name)s]',
91 |         '%Y-%m-%d %H:%M:%S'))
92 |     log.addHandler(h)
93 |     mitogen.utils.run_with_router(main, args.hosts, args.exe, bcmd, args.out)
94 | 
95 | 


--------------------------------------------------------------------------------
/bench_syscalls.cc:
--------------------------------------------------------------------------------
  1 | 
  2 | // SPDX-License-Identifier: GPL-3.0-or-later
  3 | // SPDX-FileCopyrightText: © 2021 Georg Sauthoff <mail@gms.tf>
  4 | 
  5 | #include <benchmark/benchmark.h>
  6 | 
  7 | #include <unistd.h>
  8 | #include <sys/types.h>
  9 | #include <sched.h>
 10 | #include <pthread.h>
 11 | #include <time.h>
 12 | #include <math.h>
 13 | #include <sys/prctl.h>
 14 | 
 15 | #include <assert.h>
 16 | 
 17 | 
 18 | static void bench_getuid(benchmark::State& state) {
 19 |     for (auto _ : state) {
 20 |         getuid();
 21 |     }
 22 | }
 23 | 
 24 | BENCHMARK(bench_getuid);
 25 | 
 26 | static void bench_getpid(benchmark::State& state) {
 27 |     for (auto _ : state) {
 28 |         getpid();
 29 |     }
 30 | }
 31 | 
 32 | BENCHMARK(bench_getpid);
 33 | 
 34 | static void bench_close(benchmark::State& state) {
 35 |     for (auto _ : state) {
 36 |         close(999);
 37 |     }
 38 | }
 39 | 
 40 | BENCHMARK(bench_close);
 41 | 
 42 | static void bench_syscall(benchmark::State& state) {
 43 |     for (auto _ : state) {
 44 |         syscall(423);
 45 |     }
 46 | }
 47 | 
 48 | BENCHMARK(bench_syscall);
 49 | 
 50 | static void bench_sched_yield(benchmark::State& state) {
 51 |     for (auto _ : state) {
 52 |         sched_yield();
 53 |     }
 54 | }
 55 | 
 56 | BENCHMARK(bench_sched_yield);
 57 | 
 58 | static void bench_clock_gettime(benchmark::State& state) {
 59 |     struct timespec ts = {0};
 60 |     for (auto _ : state) {
 61 |         clock_gettime(CLOCK_REALTIME, &ts);
 62 |     }
 63 | }
 64 | 
 65 | BENCHMARK(bench_clock_gettime);
 66 | 
 67 | static void bench_clock_gettime_tai(benchmark::State& state) {
 68 |     struct timespec ts = {0};
 69 |     for (auto _ : state) {
 70 |         clock_gettime(CLOCK_TAI, &ts);
 71 |     }
 72 | }
 73 | 
 74 | BENCHMARK(bench_clock_gettime_tai);
 75 | 
 76 | static void bench_clock_gettime_monotonic(benchmark::State& state) {
 77 |     struct timespec ts = {0};
 78 |     for (auto _ : state) {
 79 |         clock_gettime(CLOCK_MONOTONIC, &ts);
 80 |     }
 81 | }
 82 | 
 83 | BENCHMARK(bench_clock_gettime_monotonic);
 84 | 
 85 | static void bench_clock_gettime_monotonic_raw(benchmark::State& state) {
 86 |     struct timespec ts = {0};
 87 |     for (auto _ : state) {
 88 |         clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
 89 |     }
 90 | }
 91 | 
 92 | BENCHMARK(bench_clock_gettime_monotonic_raw);
 93 | 
 94 | static void bench_nanosleep0(benchmark::State& state) {
 95 |     struct timespec ts = {0};
 96 |     for (auto _ : state) {
 97 |         int r = nanosleep(&ts, 0);
 98 |         assert(!r);
 99 |     }
100 | }
101 | 
102 | BENCHMARK(bench_nanosleep0);
103 | 
104 | static void bench_nanosleep0_slack1(benchmark::State& state) {
105 |     int r = prctl(PR_SET_TIMERSLACK, 1, 0, 0, 0);
106 |     assert(!r);
107 |     struct timespec ts = {0};
108 |     for (auto _ : state) {
109 |         int r = nanosleep(&ts, 0);
110 |         assert(!r);
111 |     }
112 | }
113 | 
114 | BENCHMARK(bench_nanosleep0_slack1);
115 | 
116 | static void bench_nanosleep1_slack1(benchmark::State& state) {
117 |     int r = prctl(PR_SET_TIMERSLACK, 1, 0, 0, 0);
118 |     assert(!r);
119 |     struct timespec ts = { .tv_nsec = 1 };
120 |     for (auto _ : state) {
121 |         int r = nanosleep(&ts, 0);
122 |         assert(!r);
123 |     }
124 | }
125 | 
126 | BENCHMARK(bench_nanosleep1_slack1);
127 | 
128 | static void bench_pthread_cond_signal(benchmark::State& state) {
129 |     pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
130 |     for (auto _ : state) {
131 |         int r = pthread_cond_signal(&cv);
132 |         assert(!r);
133 |     }
134 | }
135 | 
136 | BENCHMARK(bench_pthread_cond_signal);
137 | 
138 | static void bench_assign(benchmark::State& state) {
139 |     double f = 0;
140 |     for (auto _ : state) {
141 |         f = 23;
142 |         benchmark::DoNotOptimize(f);
143 |     }
144 | }
145 | 
146 | BENCHMARK(bench_assign);
147 | 
148 | static void bench_sqrt(benchmark::State& state) {
149 |     double f = 23;
150 |     double g = 0;
151 |     for (auto _ : state) {
152 |         benchmark::DoNotOptimize(f);
153 |         g = sqrt(f);
154 |         benchmark::DoNotOptimize(g);
155 |     }
156 | }
157 | 
158 | BENCHMARK(bench_sqrt);
159 | 
160 | static void bench_sqrtrec(benchmark::State& state) {
161 |     double f = 23;
162 |     for (auto _ : state) {
163 |         f = sqrt(f);
164 |     }
165 | }
166 | 
167 | BENCHMARK(bench_sqrtrec);
168 | 
169 | static void bench_nothing(benchmark::State& state) {
170 |     unsigned i = 0;
171 |     for (auto _ : state) {
172 |         ++i;
173 |     }
174 | }
175 | 
176 | BENCHMARK(bench_nothing);
177 | 
178 | BENCHMARK_MAIN();
179 | 


--------------------------------------------------------------------------------
/util.c:
--------------------------------------------------------------------------------
  1 | 
  2 | // 2019, Georg Sauthoff <mail@gms.tf>
  3 | //
  4 | // SPDX-License-Identifier: GPL-3.0-or-later
  5 | 
  6 | #define _GNU_SOURCE
  7 | 
  8 | #include "util.h"
  9 | 
 10 | #include <stdio.h>
 11 | #include <string.h>
 12 | #include <stdlib.h>
 13 | #include <stdbool.h>
 14 | #include <assert.h>
 15 | #include <sys/types.h>
 16 | #include <sys/stat.h>
 17 | #include <fcntl.h>
 18 | #include <errno.h>
 19 | #include <unistd.h>
 20 | 
 21 | // perf_event_open() etc.
 22 | #include <asm/unistd.h>
 23 | #include <linux/perf_event.h>
 24 | #include <sys/mman.h>
 25 | 
 26 | void perror_e(int r, const char *msg)
 27 | {
 28 |     char buf[1024];
 29 |     fprintf(stderr, "%s: %s\n", msg, strerror_r(r, buf, sizeof buf));
 30 | }
 31 | 
 32 | // Returns true iff xs[0..n) is in non-decreasing order.
 33 | static bool is_sorted(const uint32_t *xs, size_t n)
 34 | {
 35 |     for (size_t i = 1; i < n; ++i) {
 36 |         if (xs[i-1] > xs[i])
 37 |             return false;
 38 |     }
 39 |     return true;
 40 | }
 41 | 
 42 | // Returns the a/b percentile (e.g. a=1, b=2 for the median) of the sorted
 43 | // array x[0..n), or 0 if n == 0. For even n the selected element is averaged
 44 | // with its predecessor.
 45 | uint32_t percentile_u32(const uint32_t *x, size_t n, size_t a, size_t b)
 46 | {
 47 |     assert(is_sorted(x, n));
 48 | 
 49 |     if (!n)
 50 |         return 0;
 51 |     size_t i = n * a / b;
 52 |     if (i >= n)     // e.g. a == b: select the maximum instead of x[n]
 53 |         i = n - 1;
 54 |     if (n % 2 || !i)
 55 |         return x[i];
 56 |     // x is sorted, i.e. x[i] >= x[i-1]; this form cannot wrap around
 57 |     // like x[i] + x[i-1] does for large values
 58 |     return x[i-1] + (x[i] - x[i-1]) / 2;
 59 | }
 60 | 
 61 | // median absolute deviation
 62 | // a measure of dispersion (like the standard deviation)
 63 | uint32_t mad_u32(const uint32_t *x, uint32_t *y, size_t n)
 64 | {
 65 |     if (!n)
 66 |         return 0;
 67 |     uint32_t median = percentile_u32(x, n, 1, 2);
 68 |     for (size_t i = 0; i < n; ++i) {
 69 |         y[i] = labs((long)x[i] - (long)median);
 70 |     }
 71 |     qsort(y, n, sizeof y[0], cmp_u32);
 72 |     uint32_t mad = percentile_u32(y, n, 1, 2);
 73 |     return mad;
 74 | }
 75 | 
 76 | // This function is copied from
 77 | // https://elixir.bootlin.com/linux/v5.2.12/source/kernel/time/clocksource.c#L21
 78 | // File license: GPL-2.0+
 79 | // slightly modified
 80 | /**
 81 |  * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
 82 |  * @mult:	pointer to mult variable
 83 |  * @shift:	pointer to shift variable
 84 |  * @from:	frequency to convert from
 85 |  * @to:		frequency to convert to
 86 |  * @maxsec:	guaranteed runtime conversion range in seconds
 87 |  *
 88 |  * The function evaluates the shift/mult pair for the scaled math
 89 |  * operations of clocksources and clockevents.
 90 |  *
 91 |  * @to and @from are frequency values in HZ. For clock sources @to is
 92 |  * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
 93 |  * event @to is the counter frequency and @from is NSEC_PER_SEC.
 94 |  *
 95 |  * The @maxsec conversion range argument controls the time frame in
 96 |  * seconds which must be covered by the runtime conversion with the
 97 |  * calculated mult and shift factors. This guarantees that no 64bit
 98 |  * overflow happens when the input value of the conversion is
 99 |  * multiplied with the calculated mult factor. Larger ranges may
100 |  * reduce the conversion accuracy by choosing smaller mult and shift
101 |  * factors.
102 |  */
103 | void clocks_calc_mult_shift(
104 |         uint32_t *mult, uint32_t *shift, uint32_t from, uint32_t to,
105 |         uint32_t maxsec)
106 | {
107 | 	uint64_t tmp;
108 | 	uint32_t sft, sftacc= 32;
109 | 
110 | 	/*
111 | 	 * Calculate the shift factor which is limiting the conversion
112 | 	 * range:
113 | 	 */
114 | 	tmp = ((uint64_t)maxsec * from) >> 32;
115 | 	while (tmp) {
116 | 		tmp >>=1;
117 | 		sftacc--;
118 | 	}
119 | 
120 | 	/*
121 | 	 * Find the conversion shift/mult pair which has the best
122 | 	 * accuracy and fits the maxsec conversion range:
123 | 	 */
124 | 	for (sft = 32; sft > 0; sft--) {
125 | 		tmp = (uint64_t) to << sft;
126 | 		tmp += from / 2;
127 | 		// do_div(tmp, from);
128 |                 tmp = tmp / (uint64_t) from;
129 |                 
130 | 		if ((tmp >> sftacc) == 0)
131 | 			break;
132 | 	}
133 | 	*mult = tmp;
134 | 	*shift = sft;
135 | }
136 | 
137 | 
138 | 
139 | // as of Kernel 5.2.7 /sys/devices/system/cpu/cpu0/tsc_freq_khz
140 | // isn't provided by the mainline kernel
141 | // see https://github.com/trailofbits/tsc_freq_khz
142 | // or even better https://github.com/trailofbits/tsc_freq_khz/pull/1
143 | // for a simple kernel module that provides this file
144 | static int get_tsc_khz_proc(uint32_t *tsc_khz)
145 | {
146 |     int fd = open("/sys/devices/system/cpu/cpu0/tsc_freq_khz", O_RDONLY);
147 |     if (fd == -1) {
148 |         if (errno == ENOENT)
149 |             return 1;
150 |         perror("opening /sys/devices/system/cpu/cpu0/tsc_freq_khz");
151 |         return -1;
152 |     }
153 |     char buf[16];
154 |     ssize_t r = read(fd, buf, sizeof buf - 1);
155 |     if (r == -1) {
156 |         perror("reading /sys/devices/system/cpu/cpu0/tsc_freq_khz");
157 |         close(fd);
158 |         return -1;
159 |     }
160 |     buf[r] = 0;
161 |     if (r && buf[r-1] == '\n')
162 |         buf[r-1] = 0;
163 |     *tsc_khz = atoi(buf);
164 |     int t = close(fd);
165 |     if (t == -1) {
166 |         perror("closing /sys/devices/system/cpu/cpu0/tsc_freq_khz");
167 |         return -1;
168 |     }
169 |     return 0;
170 | }
171 | 
172 | // Runs cmd and parses a line like " 2194.843 MHz" from its output into kHz.
173 | // *tsc_khz is left untouched if there is no such line. Returns 0 on success
174 | // (with or without a result), 1 if popen() failed and -1 on other errors.
175 | static int get_tsc_khz_cmd(const char *cmd, uint32_t *tsc_khz)
176 | {
177 |     FILE *f = popen(cmd, "re");
178 |     if (!f) {
179 |         perror("reading TSC khz from journalctl failed");
180 |         return 1;
181 |     }
182 |     char *line = 0;
183 |     size_t n = 0;
184 |     int rc = 0;
185 |     ssize_t l = getline(&line, &n, f);
186 |     if (l == -1 && !feof(f)) {
187 |         perror("journal getline");
188 |         rc = -1;
189 |     } else if (l > 15 + 7) {
190 |         fprintf(stderr, "buffer for TSC khz from journal too small\n");
191 |         rc = -1;
192 |     } else if (l >= 11) {
193 |         // concatenate the digits before and the 3 digits after the '.'
194 |         char buf[16];
195 |         char *t = mempcpy(buf, line+1, l-1-8-1);
196 |         t = mempcpy(t, line+(l-7-1), 3);
197 |         *t = 0;
198 |         *tsc_khz = atoi(buf);
199 |     }
200 |     free(line); // getline() may allocate even if it fails
201 |     if (pclose(f) == -1 && !rc) {
202 |         perror("pclose journal");
203 |         rc = -1;
204 |     }
205 |     return rc;
206 | }
207 | 
208 | static int get_tsc_khz_journal(uint32_t *tsc_khz)
209 | {
210 | 
211 |     const char cmd[] = "journalctl -k 2>/dev/null | grep 'kernel: tsc:' -i "
212 |             "| cut -d' ' -f5- | grep -o ' [0-9]\\+\\.[0-9]\\{3\\} MHz' "
213 |             "| tail -n 1 ";
214 |     return get_tsc_khz_cmd(cmd, tsc_khz);
215 | }
216 | 
217 | // fall-back to dmesg on systems without journald or ones
218 | // where the user doesn't have enough permissions for journalctl --boot.
219 | // pitfall: the message might be already rotated out of the dmesg buffer,
220 | // on a long running system
221 | static int get_tsc_khz_dmesg(uint32_t *tsc_khz)
222 | {
223 |     const char cmd[] = "dmesg  | grep '\\] tsc:' -i"
224 |             "| cut -d' ' -f5- | grep -o ' [0-9]\\+\\.[0-9]\\{3\\} MHz' "
225 |             "| tail -n 1 ";
226 |     return get_tsc_khz_cmd(cmd, tsc_khz);
227 | }
228 | 
229 | // see also https://stackoverflow.com/a/57835630/427158 for
230 | // some ways to get the tick rate of the TSC
231 | int get_tsc_khz(uint32_t *tsc_khz)
232 | {
233 |     *tsc_khz = 0;
234 |     int r = get_tsc_khz_proc(tsc_khz);
235 |     if (r < 0)
236 |         return r;
237 |     if (!*tsc_khz) {
238 |         int r = get_tsc_khz_journal(tsc_khz);
239 |         if (r < 0)
240 |             return r;
241 |     }
242 |     if (!*tsc_khz) {
243 |         int r = get_tsc_khz_dmesg(tsc_khz);
244 |         if (r < 0)
245 |             return r;
246 |     }
247 |     if (!*tsc_khz) {
248 |         fprintf(stderr, "Couldn't determine TSC rate\n");
249 |         return -1;
250 |     }
251 |     return 0;
252 | }
253 | 
254 | 
255 | static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
256 |                    int cpu, int group_fd, unsigned long flags)
257 | {
258 |     return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
259 | }
260 | 
261 | // see also https://stackoverflow.com/a/57835630/427158
262 | //
263 | // Unfortunately, the kernel decreases precision of mult and shift
264 | // due to backwards compatibility:
265 | //
266 | // https://elixir.bootlin.com/linux/v5.19.17/source/arch/x86/kernel/tsc.c#L148
267 | //
268 | // Thus, for short durations, calling clocks_calc_mult_shift() with the true
269 | // TSC rate in user space is more precise.
270 | // Fetches the mult/shift pair the kernel publishes in the perf mmap page.
271 | // Returns 0 on success and -1 (after printing a diagnostic) on error.
272 | int get_tsc_perf(uint32_t *mult, uint32_t *shift)
273 | {
274 |     struct perf_event_attr pe = {
275 |         .type           = PERF_TYPE_HARDWARE,
276 |         .size           = sizeof(struct perf_event_attr),
277 |         .config         = PERF_COUNT_HW_INSTRUCTIONS,
278 |         .disabled       = 1,
279 |         .exclude_kernel = 1,
280 |         .exclude_hv     = 1
281 |     };
282 |     int fd = perf_event_open(&pe, 0, -1, -1, 0);
283 |     if (fd == -1) {
284 |         perror("perf_event_open failed");
285 |         return -1;
286 |     }
287 |     int rc = -1;
288 |     struct perf_event_mmap_page *pc = mmap(NULL, 4*1024, PROT_READ, MAP_SHARED, fd, 0);
289 |     if (pc == MAP_FAILED) { // mmap() reports failure as MAP_FAILED, not NULL
290 |         perror("mmap perf page failed");
291 |     } else if (pc->cap_user_time != 1) {
292 |         fprintf(stderr, "Perf system doesn't support user time\n");
293 |     } else {
294 |         *mult  = pc->time_mult;
295 |         *shift = pc->time_shift;
296 |         rc = 0;
297 |     }
298 |     if (pc != MAP_FAILED && munmap(pc, 4*1024) == -1) {
299 |         perror("munmap perf page");
300 |         rc = -1;
301 |     }
302 |     close(fd); // also released on every error path after the open
303 |     return rc;
304 | }
305 | 
306 | 


--------------------------------------------------------------------------------
/ptp-clock-offset.c:
--------------------------------------------------------------------------------
  1 | // Check what methods are available for PTP offset calculation
  2 | // and how they perform.
  3 | //
  4 | // 2020, Georg Sauthoff <mail@gms.tf>
  5 | //
  6 | // SPDX-License-Identifier: GPL-3.0-or-later
  7 | 
  8 | #define _GNU_SOURCE
  9 | 
 10 | #include <stdio.h>
 11 | #include <stdint.h>
 12 | #include <inttypes.h>
 13 | #include <stdbool.h>
 14 | 
 15 | 
 16 | #include <fcntl.h>
 17 | #include <sys/ioctl.h>
 18 | #include <sys/stat.h>
 19 | #include <sys/types.h>
 20 | #include <time.h>
 21 | 
 22 | #include <linux/ptp_clock.h>
 23 | #include "ptp-clock-future.h"
 24 | 
 25 | 
 26 | // for Solarflare private clock offset ioctl
 27 | #include <linux/sockios.h> // SIOCDEVPRIVATE
 28 | #include <net/if.h>        // ifreq
 29 | #include <sys/types.h>     // socket()
 30 | #include <sys/socket.h>    // socket()
 31 | #include <string.h>        // strcpy()
 32 | #include <unistd.h>        // close()
 33 | 
 34 | 
 35 | #include <linux/ethtool.h> // ethtool_ts_info
 36 | #include <linux/sockios.h> // SIOCETHTOOL
 37 | 
 38 | 
 39 | #include "tsc.h"
 40 | #include "util.h"
 41 | 
 42 | 
 43 | // as of 2020
 44 | static int64_t tai_off_ns = 37000000000l;
 45 | 
 46 | #ifndef PCO_READ_PERF
 47 | static uint32_t tsc_khz;
 48 | #endif
 49 | static uint32_t tsc_mult;
 50 | static uint32_t tsc_shift;
 51 | 
 52 | 
 53 | 
 54 | 
 55 | 
 56 | static int64_t pct2ns(const struct ptp_clock_time *ptc)
 57 | {
 58 |     return (int64_t)(ptc->sec * 1000000000) + (int64_t)ptc->nsec;
 59 | }
 60 | static int64_t pct2ns_tai(const struct ptp_clock_time *ptc)
 61 | {
 62 |     return pct2ns(ptc) + tai_off_ns;
 63 | }
 64 | 
 65 | static int64_t ts2ns(const struct timespec *ts)
 66 | {
 67 |     return (int64_t)(ts->tv_sec * 1000000000) + (int64_t)ts->tv_nsec;
 68 | }
 69 | static int64_t ts2ns_tai(const struct timespec *ts)
 70 | {
 71 |     return ts2ns(ts) + tai_off_ns;
 72 | }
 73 | 
 74 | static uint64_t tsc2ns(uint64_t cyc)
 75 | {
 76 |     return mul_u64_u32_shr(cyc, tsc_mult, tsc_shift);
 77 | }
 78 | 
 79 | // these 2 lines are from linuxptp's missing.h
 80 | #define CLOCKFD 3
 81 | #define FD_TO_CLOCKID(fd)	((clockid_t) ((((unsigned int) ~fd) << 3) | CLOCKFD))
 82 | 
 83 | static int read_clock_offset(int fd)
 84 | {
 85 |     int r[3];
 86 |     struct timespec ts[3];
 87 |     clockid_t clk_id = FD_TO_CLOCKID(fd);
 88 |     for (int i = 0; i < 5; ++i) {
 89 |         r[0] = clock_gettime(CLOCK_REALTIME, ts);
 90 |         r[1] = clock_gettime(clk_id, ts+1);
 91 |         r[2] = clock_gettime(CLOCK_REALTIME, ts+2);
 92 |         if (r[0] == -1) {
 93 |             perror("clock_gettime CLOCK_REALTIME 1");
 94 |             return 1;
 95 |         }
 96 |         if (r[1] == -1) {
 97 |             perror("clock_gettime ptp");
 98 |             return 1;
 99 |         }
100 |         if (r[2] == -1) {
101 |             perror("clock_gettime CLOCK_REALTIME 2");
102 |             return 1;
103 |         }
104 |         int64_t delay = ts2ns_tai(ts + 2) - ts2ns_tai(ts);
105 |         int64_t off  = (ts2ns_tai(ts) + ts2ns_tai(ts + 2)) / 2 - ts2ns(ts + 1);
106 |         printf("clock_gettime no %u: %" PRId64 " ns, delay: %" PRId64 " ns\n",
107 |                 i+1, off, delay);
108 |     }
109 |     return 0;
110 | }
111 | 
112 | static int read_ptp_offset(int fd)
113 | {
114 |     struct ptp_sys_offset pso = { .n_samples =  5};
115 |     uint64_t b = fenced_rdtsc();
116 |     int r = ioctl(fd, PTP_SYS_OFFSET, &pso);
117 |     uint64_t e = fenced_rdtscp();
118 |     if (r) {
119 |         perror("PTP_SYS_OFFSET");
120 |         return 1;
121 |     }
122 |     uint64_t sc_delay = tsc2ns(e - b);
123 |     unsigned k = 1;
124 |     for (unsigned i = 0; i < pso.n_samples * 2; i+=2, ++k) {
125 |         int64_t delay = pct2ns_tai(pso.ts + i+2) - pct2ns_tai(pso.ts + i);
126 |         int64_t off  = (pct2ns_tai(pso.ts + i) + pct2ns_tai(pso.ts + i+2)) / 2 - pct2ns(pso.ts + i+1);
127 |         printf("PTP_SYS_OFFSET no %u: %" PRId64 " ns, delay: %" PRId64 " ns, syscall: %" PRIu64 " ns\n",
128 |                 k, off, delay, sc_delay);
129 |     }
130 |     return 0;
131 | }
132 | 
133 | // Prints offset and delay of 5 PTP_SYS_OFFSET_EXTENDED samples; returns 1 if the ioctl fails, else 0.
134 | static int read_ptp_offset_extended(int fd)
135 | {
136 |     struct ptp_sys_offset_extended psoe = { .n_samples =  5};
137 |     uint64_t b = fenced_rdtsc();
138 |     int r = ioctl(fd, PTP_SYS_OFFSET_EXTENDED, &psoe);
139 |     uint64_t e = fenced_rdtscp();
140 |     if (r) {
141 |         perror("PTP_SYS_OFFSET_EXTENDED");
142 |         return 1;
143 |     }
144 |     uint64_t sc_delay = tsc2ns(e - b);
145 |     for (unsigned i = 0; i < psoe.n_samples; ++i) {
146 |         int64_t sys_b = pct2ns_tai(&psoe.ts[i][0]), sys_e = pct2ns_tai(&psoe.ts[i][2]);
147 |         int64_t off   = (sys_b + sys_e) / 2 - pct2ns(&psoe.ts[i][1]);
148 |         printf("PTP_SYS_OFFSET_EXTENDED no %u: %" PRId64 " ns, delay: %" PRId64 " ns, syscall: %" PRIu64 " ns\n",
149 |                 i+1, off, sys_e - sys_b, sc_delay);
150 |     }
151 |     return 0;
152 | }
153 | 
154 | static int read_ptp_offset_precise(int fd)
155 | {
156 |     struct ptp_sys_offset_precise psop = { 0 };
157 |     uint64_t b = fenced_rdtsc();
158 |     int r = ioctl(fd, PTP_SYS_OFFSET_PRECISE, &psop);
159 |     uint64_t e = fenced_rdtscp();
160 |     if (r) {
161 |         perror("PTP_SYS_OFFSET_PRECISE");
162 |         return 1;
163 |     }
164 |     uint64_t sc_delay = tsc2ns(e - b);
165 |     int64_t off  = pct2ns_tai(&psop.sys_realtime) - pct2ns(&psop.device);
166 |     printf("PTP_SYS_OFFSET_PRECISE: %" PRId64 " ns, delay: 0 ns, syscall: %" PRIu64 " ns\n",
167 |                 off, sc_delay);
168 |     return 0;
169 | }
170 | 
171 | 
172 | static int mk_if_fd()
173 | {
174 |     int fd = socket(AF_INET, SOCK_DGRAM, 0);
175 |     if (fd == -1)
176 |         perror("creating if fd");
177 |     return fd;
178 | }
179 | 
180 | 
181 | // Looks up the PTP device of interface if_name via ethtool ioctls on fd.
182 | // Stores a malloc'ed "/dev/ptpN" string in *dev and sets *is_sfc to true for
183 | // the sfc driver. Returns 0 on success, -1 if no device could be determined
184 | // and 1 if only the driver query failed (*dev is set then).
185 | static int get_ptp_dev(int fd, const char *if_name, const char **dev, bool *is_sfc)
186 | {
187 |     struct ethtool_ts_info tsi = {
188 |         .cmd = ETHTOOL_GET_TS_INFO,
189 |         .phc_index = 23
190 |     };
191 |     struct ifreq ifr = { .ifr_data = (void*) &tsi };
192 |     // if_name comes from the command line, don't overflow ifr_name
193 |     if (strlen(if_name) >= sizeof ifr.ifr_name) {
194 |         fprintf(stderr, "interface name too long: %s\n", if_name);
195 |         return -1;
196 |     }
197 |     strcpy(ifr.ifr_name, if_name);
198 | 
199 |     int r = ioctl(fd, SIOCETHTOOL, &ifr);
200 |     if (r == -1) {
201 |         perror("ioctl SIOCETHTOOL ETHTOOL_GET_TS_INFO");
202 |         return -1;
203 |     }
204 |     if (tsi.phc_index == -1) {
205 |         fprintf(stderr, "%s has no PTP hardware clock device\n", if_name);
206 |         return -1;
207 |     }
208 |     char *s = 0;
209 |     r = asprintf(&s, "/dev/ptp%d", tsi.phc_index);
210 |     if (r == -1) {
211 |         perror("asprintf");
212 |         return -1;
213 |     }
214 |     *dev = s;
215 | 
216 |     struct ethtool_drvinfo di = { .cmd = ETHTOOL_GDRVINFO };
217 |     ifr.ifr_data = (void*) &di;
218 |     r = ioctl(fd, SIOCETHTOOL, &ifr);
219 |     if (r == -1) {
220 |         perror("ioctl SIOCETHTOOL ETHTOOL_GDRVINFO");
221 |         return 1;
222 |     }
223 |     if (!strcmp(di.driver, "sfc"))
224 |         *is_sfc = true;
225 |     return 0;
226 | }
227 | 
228 | 
229 | struct sfc_ts {
230 |     int64_t sec;
231 |     int32_t nsec;
232 | };
233 | 
234 | static int64_t sfcts2ns(const struct sfc_ts *ts)
235 | {
236 |     return (int64_t)(ts->sec * 1000000000lu) + (int64_t)ts->nsec;
237 | }
238 | 
239 | const unsigned long SIOCEFX = SIOCDEVPRIVATE + 3;
240 | const uint16_t EFX_TS_SYNC = 0xef16;
241 | 
242 | // Issues one SIOCEFX/EFX_TS_SYNC request for interface `name` on socket `fd`
243 | // and prints the value returned in the request (as ns, labelled SFC_OFFSET)
244 | // plus the TSC-measured ioctl time. Returns 0 on success, 1 on failure.
245 | static int read_sfc_offset(int fd, const char *name)
246 | {
247 |     struct ts_req {
248 |         uint16_t command;
249 |         uint16_t pad;
250 |         struct sfc_ts ts;
251 |     } __attribute__ ((packed));
252 |     struct ts_req d = { .command = EFX_TS_SYNC };
253 |     struct ifreq ifr = { .ifr_data = (void*) &d };
254 |     // `name` is user input: reject it rather than overflow the fixed-size ifr_name
255 |     size_t len = strlen(name);
256 |     if (len >= sizeof ifr.ifr_name) {
257 |         fprintf(stderr, "interface name too long: %s\n", name);
258 |         return 1;
259 |     }
260 |     memcpy(ifr.ifr_name, name, len + 1);
261 | 
262 |     uint64_t b = fenced_rdtsc();
263 |     int r = ioctl(fd, SIOCEFX, &ifr);
264 |     uint64_t e = fenced_rdtscp();
265 |     if (r) {
266 |         perror("SFC SIOCEFX");
267 |         return 1;
268 |     }
269 |     uint64_t sc_delay = tsc2ns(e - b);
270 |     struct sfc_ts t = d.ts; // copy out of the packed request before taking an address
271 |     int64_t off = sfcts2ns(&t);
272 |     printf("SFC_OFFSET: %" PRId64 " ns, delay: ? ns, syscall: %" PRIu64 " ns\n",
273 |             off, sc_delay);
274 |     return 0;
275 | }
276 | 
277 | // Usage: <prog> /dev/ptpX|ifname
278 | // An argument that doesn't start with '/' is treated as a network interface
279 | // name and resolved to its PTP device via get_ptp_dev(). Runs each offset
280 | // test once (their results are not checked); returns 1 on setup failure.
281 | int main(int argc, char **argv)
282 | {
283 |     if (argc < 2) {
284 |         fprintf(stderr, "call: %s /dev/ptpX|ifname\n", argv[0]);
285 |         return 1;
286 |     }
287 | 
288 |     // initialise tsc_mult/tsc_shift (presumably consumed by tsc2ns() - confirm)
289 | #ifndef PCO_READ_PERF
290 |     if (get_tsc_khz(&tsc_khz))
291 |         return 1;
292 |     clocks_calc_mult_shift(&tsc_mult, &tsc_shift, tsc_khz, 1000000l, 0);
293 | #else
294 |     if (get_tsc_perf(&tsc_mult, &tsc_shift) == -1)
295 |         return 1;
296 | #endif
297 | 
298 |     const char *dev     = argv[1];
299 |     const char *if_name = 0;
300 |     int         if_fd   = -1;
301 |     bool        is_sfc  = false;
302 | 
303 |     const bool by_interface = dev[0] != '/';
304 |     if (by_interface) {
305 |         if_name = dev;
306 |         if_fd = mk_if_fd();
307 |         if (if_fd == -1)
308 |             return 1;
309 |         // on success `dev` is replaced by the interface's /dev/ptpN path
310 |         if (get_ptp_dev(if_fd, if_name, &dev, &is_sfc) == -1)
311 |             return 1;
312 |     }
313 | 
314 |     int ptp_fd = open(dev, O_RDWR);
315 |     if (ptp_fd == -1) {
316 |         perror("open PTP device");
317 |         return 1;
318 |     }
319 | 
320 |     printf("## Testing clock_gettime\n");
321 |     read_clock_offset(ptp_fd);
322 | 
323 |     printf("## Testing PTP_SYS_OFFSET ioctl (%#lx)\n", PTP_SYS_OFFSET);
324 |     read_ptp_offset(ptp_fd);
325 |     printf("## Testing PTP_SYS_OFFSET_EXTENDED ioctl (%#lx)\n", PTP_SYS_OFFSET_EXTENDED);
326 |     read_ptp_offset_extended(ptp_fd);
327 |     printf("## Testing PTP_SYS_OFFSET_PRECISE ioctl (%#lx)\n", PTP_SYS_OFFSET_PRECISE);
328 |     read_ptp_offset_precise(ptp_fd);
329 | 
330 |     if (is_sfc) {
331 |         printf("## Testing Solarflare SIOCEFX / EFX_TS_SYNC ioctl (%#lx / %#" PRIx16 ")\n", SIOCEFX, EFX_TS_SYNC);
332 |         read_sfc_offset(if_fd, if_name);
333 |     }
334 | 
335 |     if (if_fd != -1)
336 |         close(if_fd);
337 |     close(ptp_fd);
338 | 
339 |     return 0;
340 | }
341 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | This repository contains OSjitter, Pingpong and other
  2 | latency/performance related utilities.
  3 | 
  4 | OSjitter is a tool for measuring how much
  5 | the operating system interrupts programs. Such interruptions
  6 | increase the latency of a program while the variation in latency
  7 | is called jitter.
  8 | 
  9 | This tool can be used to quickly measure a lower bound for the
 10 | latency of a given system configuration. Note that the OS jitter
 11 | depends on the kind of load a real-time program is applying to a
 12 | system. Thus, one still needs to execute a domain specific
 13 | test-suite to the real-time program of interest after a tool like
 14 | OSjitter shows good results.
 15 | 
 16 | The Pingpong utility measures the overhead of several thread
 17 | notification mechanisms such as spinning on an atomic variable
 18 | (with/without pauses), POSIX condition variables, semaphores,
 19 | pipes and raw Linux futexes.
 20 | 
 21 | The ptp-clock-offset utility is a small program for checking
 22 | the availability of different PTP offset ioctls and how they
 23 | perform. Rule of thumb: using any PTP offset ioctl is better than
 24 | having to use `clock_gettime()` and smaller delays are better.
 25 | 
 26 | There is also a microbenchmark (`bench_syscalls.cc`) that measures
 27 | some (seemingly) low-overhead syscalls in order to measure the
 28 | userspace to kernelspace mode-switch costs. See also a [related
 29 | blog post](https://gms.tf/on-the-costs-of-syscalls.html) for some results.
 30 | 
 31 | 
 32 | 2019, Georg Sauthoff <mail@gms.tf>, GPLv3+
 33 | 
 34 | ## Example Session
 35 | 
 36 | Check out the help:
 37 | 
 38 |     $ ./osjitter -h
 39 | 
 40 | Isolating the last 3 cores on a 8 core system:
 41 | 
 42 |     $ cat /proc/cmdline
 43 |     [..] isolcpus=5-7 nohz=on nohz_full=5-7 rcu_nocbs=5-7 rcu_nocb_poll \
 44 |     nowatchdog mce=ignore_ce acpi_irq_nobalance pcie_aspm=off tsc=reliable
 45 | 
 46 | This system is a Supermicro one (running Fedora 29) with an Atom CPU:
 47 | 
 48 |     $ cat /proc/cpuinfo | grep model' name' | head -n 1
 49 |     model name	: Intel(R) Atom(TM) CPU C3758 @ 2.20GHz
 50 | 
 51 | First OSjitter run:
 52 | 
 53 |     $ ./osjitter  -t 60
 54 |      CPU  TSC_khz  #intr  #delta  ovfl_ns  invol_ctx  sum_intr_ns  iratio  rt_s  loop_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns   max_ns  mad_ns
 55 |        0  2200000  60240   60240        0       8065    283228653   0.005    60       22       3151    2989    4354    6047    7218    443376  9380037     220
 56 |        1  2200000  60192   60192        0       9809    216975033   0.004    60       22       2710    2339    3740    5314    6322     11774  4614206     432
 57 |        2  2200000  60199   60199        0       5942    180783353   0.003    60       22       2424    2219    3399    4847    7888     14611  1465586     223
 58 |        3  2200000  60193   60193        0       5465    171929486   0.003    60       22       2426    2236    3087    4246    6388     11487   592769     187
 59 |        4  2200000  60320   60320        0       6173    212338516   0.004    60       22       2548    2358    3468    5005    6280     40044  2262400     211
 60 |        5  2200000    156     156        0          1       576392   0.000    60       22       3681    2801    4044    4388   11667     12138    12286     428
 61 |        6  2200000    156     156        0          1       581260   0.000    60       22       3565    2788    3964    4270   12278     20279    28125     451
 62 |        7  2200000    126     126        0          1       450470   0.000    60       22       3703    2467    4003    4205    9163     11859    12198     352
 63 | 
 64 | => The threads on the isolated CPUs are much less interrupted than the
 65 | other ones.
 66 | 
 67 | Move all interrupts away from the isolated CPUs:
 68 | 
 69 |     # tuna -q '*' -c 0-4 -m -x
 70 | 
 71 | OSjitter:
 72 | 
 73 |     $ ./osjitter  -t 60
 74 |      CPU  TSC_khz  #intr  #delta  ovfl_ns  invol_ctx  sum_intr_ns  iratio  rt_s  loop_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns   max_ns  mad_ns
 75 |        0  2200000  60342   60342        0       6207    272600031   0.005    60       22       3105    2980    4141    5898    7205    442155  4772690     144
 76 |        1  2200000  60334   60334        0       6483    196708372   0.003    60       22       2488    2293    3530    5014    6335     13491  4684815     236
 77 |        2  2200000  60330   60330        0       8479    211832782   0.004    60       22       2528    2296    3651    5269    9299     15708  5513140     347
 78 |        3  2200000  60256   60256        0       7973    237326578   0.004    60       22       2477    2261    3617    5155    7186     39479  5602172     325
 79 |        4  2200000  60280   60280        0       5149    197355746   0.003    60       22       2532    2345    3020    4026    6309     16298  2630389     175
 80 |        5  2200000      8       8        0          1        41371   0.000    60       22       3340    1869    8570   11288   11288     11288    11616    1470
 81 |        6  2200000      8       8        0          1        41025   0.000    60       22       3291    1706    8616   11429   11429     11429    11609    1585
 82 |        7  2200000     10      10        0          1        46852   0.000    60       22       2886    1927    8794   11968   11968     11968    12126     959
 83 | 
 84 | => Even fewer interruptions on the isolated CPUs
 85 | 
 86 | Move all moveable kernel threads away from the isolated CPUs:
 87 | 
 88 |     # tuna -U -t '*' -c 0-4 -m
 89 | 
 90 | OSjitter:
 91 | 
 92 |     $ ./osjitter  -t 60
 93 |      CPU  TSC_khz  #intr  #delta  ovfl_ns  invol_ctx  sum_intr_ns  iratio  rt_s  loop_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns   max_ns  mad_ns
 94 |        0  2200000  60246   60246        0       4333    231374600   0.004    60       22       3177    3040    3595    4465   10924     29714   469030     134
 95 |        1  2200000  60403   60403        0       5965    198823307   0.003    60       22       2490    2274    3425    4865    6387     16643  4743847     229
 96 |        2  2200000  60445   60445        0       5020    186508000   0.003    60       22       2402    2172    2959    3740    5762     12846  1716645     209
 97 |        3  2200000  60490   60490        0      10195    234402816   0.004    60       22       2825    2308    4398    5358    6915    112854  3997080     576
 98 |        4  2200000  60276   60276        0       7274    212001750   0.004    60       22       2531    2328    3668    5061    5747     13550  6431210     275
 99 |        5  2200000      8       8        0          1        34188   0.000    60       22       3197    1765    5095    8923    8923      8923    11685    1114
100 |        6  2200000      8       8        0          1        39910   0.000    60       22       3218    1616    8130   11231   11231     11231    11793    1601
101 |        7  2200000      5       5        0          0        16998   0.000    60       22       2091    2079    8506    8506    8506      8506     8506     574
102 | 
103 | => Isolated CPUs: Improvements in interruptions, few improvements
104 | in median, max and median absolute deviation (MAD).
105 | 
106 | Switch from throughput-performance based tuned profile to a latency-performance
107 | based one (i.e. disable CPU frequency scaling, longer stat interval, writeback
108 | cpumask etc.):
109 | 
110 |     # tuned-adm profile gs-latency
111 | 
112 | OSjitter:
113 | 
114 |     $ ./osjitter  -t 60
115 |      CPU  TSC_khz  #intr  #delta  ovfl_ns  invol_ctx  sum_intr_ns  iratio  rt_s  loop_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns   max_ns  mad_ns
116 |        0  2200000  60250   60250        0        686    213519597   0.004    60       22       3125    3008    3250    3323   13616     37892  1871887      97
117 |        1  2200000  60223   60223        0      26628    287996052   0.005    60       22       3118    2914    6182    6266    7117     17085  5240030     777
118 |        2  2200000  60241   60241        0      26289    272751612   0.005    60       22       3079    2889    6183    6260    6480      9952  1231324     728
119 |        3  2200000  60193   60193        0        167    163954807   0.003    60       22       2360    2123    2470    2526    3210     13830  8119388     124
120 |        4  2200000  60223   60223        0        120    161220610   0.003    60       22       2427    2231    2514    2566    3060     13410  1885120      99
121 |        5  2200000      5       5        0          1        14843   0.000    60       21       2255    1897    6112    6112    6112      6112     6112     402
122 |        6  2200000      5       5        0          0        17074   0.000    60       22       2144    1852    8859    8859    8859      8859     8859     389
123 |        7  2200000      5       5        0          0        16665   0.000    60       22       1922    1808    8630    8630    8630      8630     8630     234
124 | 
125 | => Isolated CPUs: less interruptions, less total interruptions, improvements in median, max and MAD
126 | 
127 | ## How it works
128 | 
129 | OSjitter creates a measurement thread for each selected CPU that
130 | polls the CPU's [Time Stamp Counter (TSC)][tsc]. In each
131 | iteration the previous counter value is subtracted from the
132 | current one and if that duration is above the threshold
133 | (default: 100 ns) it's counted as an interruption.
134 | 
135 | Since the 1990s, x86 CPUs feature a TSC, which can be read with
136 | a special instruction from any user-space program. The TSC on
137 | relatively modern CPUs is supposed to run constant and reliable,
138 | i.e. even during CPU-frequency changes and power-saving state
139 | changes. That means that the TSC frequency (although constant)
140 | may be different from the base frequency of the CPU. Since the
141 | TSC is integrated into the CPU, can be accessed like a register
142 | (with low overhead) and has a high accuracy it's well suited for
143 | measuring even short interruptions.
144 | 
145 | When a program is interrupted by the operating system the TSC
146 | ticks continue and thus after the program execution continues
147 | (otherwise transparently to the program) it can derive how long
148 | it was interrupted by looking at the current TSC value.
149 | 
150 | The actual TSC frequency is required to convert TSC counts to
151 | nanoseconds. OSjitter obtains the TSC frequency from the kernel,
152 | i.e. from `/sys/devices/system/cpu/cpu0/tsc_freq_khz` (if
153 | available) or it parses it from `journalctl --boot` ([relevant
154 | stackoverflow answer][2]).
155 | 
156 | ## How to build
157 | 
158 | For most utilities:
159 | 
160 |     $ make
161 | 
162 | The syscall benchmark:
163 | 
164 |     $ git submodule update --init
165 |     $ mkdir build
166 |     $ cd build
167 |     $ CXXFLAGS='-Wall -O3 -g' cmake .. -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 -GNinja
168 |     $ ninja
169 | 
170 | (or a similar cmake invocation)
171 | 
172 | ## Related Work
173 | 
174 | There is [sysjitter][sj] (1.4, GPLv3) which also reads the [TSC][tsc] in
175 | a loop to detect external interruptions. Some differences are:
176 | 
177 | - Sysjitter calibrates the TSC frequency against `gettimeofday()`
178 |   whereas OSJitter just obtains the Kernel's TSC frequency
179 |   (the Kernel is in a better position to calibrate the TSC
180 |   frequency and Linux contains a well-engineered calibration
181 |   logic including possible refinements after the first
182 |   calibration)
183 | - Sysjitter just invokes the RDTSC instruction while OSjitter
184 |   invokes RDTSC and RDTSCP in combination with fencing
185 |   instructions
186 | - OSjitter uses ISO C atomic operations while Sysjitter uses GCC
187 |   atomic intrinsics
188 | - In contrast to OSjitter, sysjitter doesn't allow to specify the
189 |   scheduling class/priority of the measurement threads
190 | - OSjitter's output includes a measure for dispersion (MAD)
191 | - Besides TSC on x86, sysjitter also supports reading a timestamp
192 |   counter on POWER CPUs.
193 | 
194 | The Linux Kernel contains a [hardware latency detector][hwl] to
195 | check for interruptions caused outside of the operating system
196 | such as the [System Management Mode][smm] (SMM). It also queries
197 | the TSC in a loop.
198 | 
199 | The SMM is triggered by System Management Interrupts (SMI)
200 | which are transparent to the kernel and can only be detected
201 | indirectly. An alternative to the TSC approach for detecting and
202 | measuring SMIs is to query CPU counters the SMI changes
203 | ([relevant stackoverflow answer][1]).
204 | 
205 | [Cyclictest][cyc] measures OS latency by [setting
206 | timers][cyc2] and comparing the actual sleep time with the
207 | configured one.
208 | 
209 | Erik Rigtorp has published
210 | [hiccups](https://github.com/rigtorp/hiccups) to measure 'system
211 | induced jitter',
212 | [ipc-bench](https://github.com/rigtorp/ipc-bench) as a ping-pong
213 | latency benchmark and [c2clat](https://github.com/rigtorp/c2clat)
214 | to measure inter-core latency. The hiccups repository references
215 | [osnoise](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/trace/osnoise-tracer.rst),
216 | an OS jitter detector built into the Linux kernel which appeared
217 | in Linux 5.14 or so that complements the above mentioned hardware
218 | latency detector.
219 | 
220 | ## Pingpong Results
221 | 
222 | The doc directory contains some example Pingpong results for
223 | different configurations.
224 | 
225 | The results for condition variable, semaphore and futex are quite
226 | similar because, on Linux, condition variables and semaphores are
227 | implemented in terms of futex.
228 | 
229 | Notifying via a traditional UNIX pipe is more expensive than
230 | using a futex but it's the same order of magnitude.
231 | 
232 | Inserting a PAUSE instruction while spinning on an atomic
233 | variable increases the median absolute deviation (MAD) just a
234 | little bit, but yields similar median while reducing the number
235 | of executed instructions.
236 | 
237 | As documented in the kernel documentation, comparing the results
238 | with and without `nohz_full=` shows how this feature increases
239 | context-switch overhead and thus increases latency for the
240 | syscall methods (e.g. by 0.6 us or so in the median, a few us in
241 | the other percentiles and maximum). On the other hand, more
242 | context-switch overhead isn't relevant for spinning on an atomic
243 | variable, thus, `nohz_full=` really pays off for this use-case
244 | because the process is interrupted much less.
245 | 
246 | 
247 | [sj]: https://www.openonload.org/download.html
248 | [hwl]: https://www.kernel.org/doc/html/latest/trace/hwlat_detector.html
249 | [smm]: https://en.wikipedia.org/wiki/System_Management_Mode
250 | [1]: https://stackoverflow.com/a/57961772/427158
251 | [tsc]: https://en.wikipedia.org/wiki/Time_Stamp_Counter
252 | [cyc]: https://git.kernel.org/pub/scm/linux/kernel/git/clrkwllms/rt-tests.git
253 | [cyc2]: http://people.redhat.com/williams/latency-howto/rt-latency-howto.txt
254 | [2]: https://stackoverflow.com/a/57835630/427158
255 | 


--------------------------------------------------------------------------------
/osjitter.c:
--------------------------------------------------------------------------------
  1 | // OSjitter - measure program interruptions
  2 | //
  3 | // 2019, Georg Sauthoff <mail@gms.tf>
  4 | //
  5 | // SPDX-License-Identifier: GPL-3.0-or-later
  6 | 
  7 | #define _GNU_SOURCE
  8 | 
  9 | #include <assert.h>
 10 | #include <errno.h>
 11 | #include <fcntl.h>
 12 | #include <inttypes.h>
 13 | #include <pthread.h>
 14 | #include <sched.h>
 15 | #include <stdatomic.h>
 16 | #include <stdbool.h>
 17 | #include <stdint.h>
 18 | #include <stdio.h>
 19 | #include <string.h>
 20 | #include <sys/stat.h>
 21 | #include <sys/types.h>
 22 | #include <time.h>
 23 | #include <unistd.h>
 24 | 
 25 | #include <xmmintrin.h> // __mm_pause()
 26 | 
 27 | #include "util.h"
 28 | #include "tsc.h"
 29 | 
 30 | static atomic_bool start_work  = false; // workers busy-wait in worker_main() until this reads true
 31 | static atomic_bool quit_thread = false; // NOTE(review): no use visible in this part of the file; presumably asks threads to stop - confirm
 32 | 
 33 | 
 34 | struct Args {            // run configuration: command-line options plus values derived in set_params()
 35 |     uint32_t  cpus;      // sysconf(_SC_NPROCESSORS_CONF)
 36 |     cpu_set_t cpu_set;   // CPUs to measure (--cpu, else every online CPU)
 37 | 
 38 |     int sched_policy;    // --sched
 39 |     int sched_prio;      // --prio (set to 1 by --sched if still 0)
 40 | 
 41 |     uint32_t runtime_s;  // -t, defaults to 10
 42 |     uint32_t thresh_ns;  // -d, defaults to 100
 43 |  
 44 |     uint32_t tsc_khz;    // --khz, else obtained via get_tsc_khz()
 45 |     uint32_t mult;       // mult/shift are filled in by clocks_calc_mult_shift()
 46 |     uint32_t shift;
 47 |     uint32_t tsc_thresh; // thresh_ns converted to TSC ticks
 48 |     uint64_t tsc_runtime; // runtime_s converted to TSC ticks
 49 |     uint64_t samples;    // length of each worker's delta array (default: runtime_s * 105000)
 50 | 
 51 |     unsigned pid;        // getpid()
 52 |     size_t tid_off;      // result of get_tid_off(); presumably handed to pthread_to_tid() - confirm
 53 | };
 54 | typedef struct Args Args;
 55 | 
 56 | static void help(FILE *f, const char *argv0) // writes the usage/help text to f; argv0 is substituted for the leading %s
 57 | {
 58 |     fprintf(f, "%s - measure involuntary program interruptions\n"
 59 |         "\n"
 60 |         "Options:\n"
 61 |         "  -t SEC     measurement period in s (default: 10 s)\n"
 62 |         "  -d NS      threshold for an interruption in ns (default: 100 ns)\n"
 63 |         "  --cpu X    CPU (Cores) that are part of the measurement (default: all);\n"
 64 |         "  --cpu X-Y  count from zero, single core or range\n"
 65 |         "  --sched X  scheduling policy for measurement threads (default: OTHER);\n"
 66 |         "             1:FIFO, 2:RR etc. WARNING: only specify a subset with --cpu\n"
 67 |         "             when setting a realtime policy\n"
 68 |         "  --prio X   realtime priority (default: 1)\n"
 69 |         "  --khz  X   frequency of TSC in kHz (default: read from\n"
 70 |         "             /sys/devices/system/cpu/cpu0/tsc_freq_khz if available or\n"
 71 |         "             journalctl --boot)\n"
 72 |         "\n"
 73 |         "How it works: a measurement thread is pinned on each selected CPU\n"
 74 |         "where it loops without making system calls and periodically reads\n"
 75 |         "the TSC to detect external interruptions. Thus, it detects latency\n"
 76 |         "introducing interruptions by the OS and possibly even by the SMM.\n"
 77 |         "\n"
 78 |         "Output columns:\n"
 79 |         "  CPU         - CPU/Core number, count from 0, cf. /proc/cpuinfo and lscpu\n"
 80 |         "  TSC_KHZ     - frequency of the Time Stamp Counter (TSC)\n"
 81 |         "                might be different from the CPU's base frequency\n"
 82 |         "  #intr       - number of interruptions (above the threshold, cf. -d)\n"
 83 |         "  #delta      - number of recorded interruptions (might overflow)\n"
 84 |         "  ovfl_ns     - time after which interrupt recording overflowed\n"
 85 |         "  invol_ctx   - number of involuntary context switches\n"
 86 |         "                (i.e. due to scheduling)\n"
 87 |         "  sum_intr_ns - sum of all interruptions in ns\n"
 88 |         "  iratio      - ratio of interruption time to runtime\n"
 89 |         "                (IOW off-program to program time)\n"
 90 |         "  rt_s        - measurement time in s (cf. -t)\n"
 91 |         "  loop_ns     - smallest loop runtime (likely of an uninterrupted iteration\n"
 92 |         "                is used to better approximate interruption time\n"
 93 |         "  median_ns   - Median of all recorded interruptions\n"
 94 |         "  pX_ns       - X/100 percentile\n"
 95 |         "  max_ns      - the longest interruption\n"
 96 |         "  mad_ns      - median absolute deviation of all recorded interruptions\n"
 97 |         "\n"
 98 |         "How much happens in a nanosecond?\n"
 99 |         "A CPU running at 3.6 GHz progresses by 3.6 cycles in 1 ns. And a\n"
100 |         "modern pipelined super-scalar CPU may execute up to 3 instructions\n"
101 |         "or so per cycle, on average.\n"
102 |         "\n"
103 |         "2019, Georg Sauthoff <mail@gms.tf>, GPLv3+\n"
104 |         , argv0);
105 | }
106 | 
107 | // Returns the value that follows option argv[*i] and advances *i onto it.
108 | // If the command line ends instead, prints "<option> argument is missing"
109 | // to stderr and returns NULL.
110 | static char *option_value(int *i, int argc, char **argv)
111 | {
112 |     if (*i + 1 >= argc) {
113 |         fprintf(stderr, "%s argument is missing\n", argv[*i]);
114 |         return NULL;
115 |     }
116 |     return argv[++*i];
117 | }
118 | 
119 | // Parses the command line into *args (which is zeroed first) and then fills
120 | // in the defaults: -t 10 s, -d 100 ns and runtime_s * 105000 sample slots.
121 | // -h/--help prints the help text and exits the process.
122 | // Returns 0 on success and -1 on a usage error (reported on stderr).
123 | static int parse_args(Args *args, int argc, char **argv)
124 | {
125 |     *args = (const Args){0};
126 |     CPU_ZERO(&args->cpu_set);
127 | 
128 |     for (int i = 1; i < argc; ++i) {
129 |         if (!strcmp(argv[i], "--cpu")) {
130 |             char *v = option_value(&i, argc, argv);
131 |             if (!v)
132 |                 return -1;
133 |             // "X-Y" selects an inclusive range, a plain "X" just that CPU
134 |             char *dash = strchr(v, '-');
135 |             if (dash)
136 |                 *dash = 0;
137 |             unsigned b = atoi(v);
138 |             unsigned e = dash ? (unsigned) atoi(dash + 1) : b;
139 |             // a negative number wraps to a huge unsigned and is rejected, too
140 |             if (b >= CPU_SETSIZE || e >= CPU_SETSIZE || b > e) {
141 |                 fprintf(stderr, "--cpu range out of range\n");
142 |                 return -1;
143 |             }
144 |             for (unsigned k = b; k <= e; ++k)
145 |                 CPU_SET(k, &args->cpu_set);
146 |         } else if (!strcmp(argv[i], "-t")) {
147 |             char *v = option_value(&i, argc, argv);
148 |             if (!v)
149 |                 return -1;
150 |             args->runtime_s = atoi(v);
151 |         } else if (!strcmp(argv[i], "-d")) {
152 |             char *v = option_value(&i, argc, argv);
153 |             if (!v)
154 |                 return -1;
155 |             args->thresh_ns = atoi(v);
156 |         } else if (!strcmp(argv[i], "--sched")) {
157 |             char *v = option_value(&i, argc, argv);
158 |             if (!v)
159 |                 return -1;
160 |             args->sched_policy = atoi(v);
161 |             if (!args->sched_prio)
162 |                 args->sched_prio = 1;
163 |         } else if (!strcmp(argv[i], "--prio")) {
164 |             char *v = option_value(&i, argc, argv);
165 |             if (!v)
166 |                 return -1;
167 |             args->sched_prio = atoi(v);
168 |         } else if (!strcmp(argv[i], "--khz")) {
169 |             char *v = option_value(&i, argc, argv);
170 |             if (!v)
171 |                 return -1;
172 |             args->tsc_khz = atoi(v);
173 |         } else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) {
174 |             help(stdout, argv[0]);
175 |             exit(0);
176 |         } else {
177 |             fprintf(stderr, "unknown option: %s\n", argv[i]);
178 |             return -1;
179 |         }
180 |     }
181 |     if (!args->runtime_s)
182 |         args->runtime_s = 10;
183 |     if (!args->thresh_ns)
184 |         args->thresh_ns = 100;
185 |     if (!args->samples) // widen first: the 32-bit product wraps beyond ~11 h of runtime
186 |         args->samples = (uint64_t) args->runtime_s * 105000;
187 |     return 0;
188 | }
189 | 
190 | 
191 | 
192 | // Stores in *b whether `cpu` is online, as read from its sysfs `online` file.
193 | // Returns 0 on success and -1 (after printing the error) on failure.
194 | static int is_cpu_online(uint32_t cpu, bool *b)
195 | {
196 |     char filename[64];
197 |     snprintf(filename, sizeof filename, "/sys/devices/system/cpu/cpu%u/online", cpu);
198 |     int fd = open(filename, O_RDONLY);
199 |     if (fd == -1) {
200 |         // no such file: the CPU isn't hot-pluggable, so count it as online
201 |         if (errno == ENOENT) {
202 |             *b = true;
203 |             return 0;
204 |         }
205 |         perror("opening /sys/devices/system/cpu/cpu%u/online");
206 |         return -1;
207 |     }
208 |     char buf[2] = {0};
209 |     ssize_t r = read(fd, buf, sizeof buf);
210 |     if (r == -1) {
211 |         perror("reading /sys/devices/system/cpu/cpu%u/online");
212 |         close(fd);
213 |         return -1;
214 |     }
215 |     *b = buf[0] == '1' && buf[1] == '\n'; // only the exact content "1\n" counts as online
216 |     if (close(fd) == -1) {
217 |         perror("closing /sys/devices/system/cpu/cpu%u/online");
218 |         return -1;
219 |     }
220 |     return 0;
221 | }
222 | 
223 | 
224 | // cf. gdb> ptype pthread
225 | // (requires glibc debuginfo installed)
226 | static size_t get_tid_off(void) // offset at which the calling thread's pthread structure holds a value equal to getpid(); 0 if none
227 | {
228 |     pthread_t t = pthread_self();
229 |     const char *b;
230 |     memcpy(&b, (void*)t, sizeof b); // pthread_t is treated as an address; b = the pointer stored at its start
231 |     const char *e = b + 1024;       // scan window: bytes [128, 1024) from b, in 4-byte steps
232 |     unsigned pid = getpid();
233 |     for (const char *p = b + 128; p < e; p+=4) {
234 |         unsigned x;
235 |         memcpy(&x, p, sizeof x);
236 |         if  (x == pid)              // presumably the tid field (main thread: tid == pid); relies on libc internals - confirm
237 |             return p - b;
238 |     }
239 |     return 0;
240 | }
241 | 
242 | // alternative to calling gettid() in each child
243 | static unsigned pthread_to_tid(pthread_t t, size_t off) // reads the unsigned stored `off` bytes into t's thread structure
244 | {
245 |     const char *p;
246 |     memcpy(&p, (void*)t, sizeof p); // same base pointer as computed in get_tid_off()
247 |     unsigned tid;
248 |     memcpy(&tid, p + off, sizeof tid); // `off` is presumably a get_tid_off() result - confirm at the call site
249 |     return tid;
250 | }
251 | 
252 | 
253 | // Fills in the derived members of *args: pid, tid offset, CPU count, the
254 | // default CPU set (every online CPU, if --cpu selected none) and the TSC
255 | // parameters (kHz, mult/shift, threshold and runtime in ticks).
256 | // Returns 0 on success, otherwise the non-zero result of the failing helper.
257 | static int set_params(Args *args)
258 | {
259 |     args->pid = getpid();
260 |     args->tid_off = get_tid_off();
261 |     args->cpus = sysconf(_SC_NPROCESSORS_CONF);
262 |     if (!CPU_COUNT(&args->cpu_set)) {
263 |         // probe cpu0 .. cpu<cpus-1> only: the index one past the end has no
264 |         // sysfs entry, which is_cpu_online() would report as online
265 |         for (unsigned k = 0; k < args->cpus; ++k) {
266 |             bool b = false;
267 |             int r = is_cpu_online(k, &b);
268 |             if (r)
269 |                 return r;
270 |             if (b)
271 |                 CPU_SET(k, &args->cpu_set);
272 |         }
273 |     }
274 |     if (!args->tsc_khz) {
275 |         int r = get_tsc_khz(&args->tsc_khz);
276 |         if (r < 0)
277 |             return r;
278 |     }
279 |     clocks_calc_mult_shift(&args->mult, &args->shift,
280 |             args->tsc_khz, 1000000l, 0);
281 |     {   // thresh_ns in TSC ticks: (tsc_khz * 1000) / (10^9 / thresh_ns)
282 |         double d = 1000000000l;
283 |         d /= args->thresh_ns;
284 |         double e = args->tsc_khz;
285 |         e *= 1000;
286 |         e /= d;
287 |         args->tsc_thresh = (uint32_t) e;
288 |     }
289 |     // runtime_s in TSC ticks: tsc_khz * 1000 * runtime_s, evaluated in double
290 |     args->tsc_runtime = (uint64_t) ((double) args->tsc_khz * 1000 * args->runtime_s);
291 |     return 0;
292 | }
293 | 
294 | static Args global_args; // each worker takes its own copy at the top of worker_main()
295 | 
296 | struct Worker {
297 |     pthread_t worker_id;    // NOTE(review): presumably the handle from pthread_create - confirm
298 |     unsigned  tid;          // NOTE(review): presumably the kernel thread id (cf. pthread_to_tid) - confirm
299 |     uint32_t  cpu_id;       // printed as the "core" in worker_main()'s allocation-failure message
300 | 
301 |     uint32_t *deltas;       // array of interruptions
302 |     uint64_t samples;       // #used array entries
303 |     uint64_t thresh_cnt;    // counted interruptions
304 | 
305 |     uint64_t tsc_start;     // start of measurements
306 |     uint64_t tsc_overflow;  // when it overflowed (or 0 for no overflow)
307 |     uint64_t tsc_total_int; // sum of interruptions
308 |     uint64_t tsc_delta_min; // minimum loop time
309 | 
310 |     uint64_t invol_switch;  // involuntary context switches
311 | };
312 | typedef struct Worker Worker;
313 | 
314 | // Verifies that /proc/cpuinfo lists both the constant_tsc and nonstop_tsc
315 | // flags (extracted with a grep/tr pipeline run through popen()).
316 | // Returns 0 if both are present, 1 if at least one is missing (each missing
317 | // flag is reported on stderr) and -1 if running or reading the pipeline failed.
318 | static int check_cpuinfo(void)
319 | {
320 |     FILE *f = popen("grep '^flags' /proc/cpuinfo | tr ' ' '\\n'"
321 |             " | grep '^\\(constant\\|nonstop\\)_tsc$'", "re");
322 |     if (!f) {
323 |         perror("popen");
324 |         return -1;
325 |     }
326 |     char *line = 0;
327 |     size_t n = 0;
328 |     bool constant_tsc = false;
329 |     bool nonstop_tsc  = false;
330 |     while (getline(&line, &n, f) != -1) {
331 |         if (!strcmp(line, "constant_tsc\n"))
332 |             constant_tsc = true;
333 |         if (!strcmp(line, "nonstop_tsc\n"))
334 |             nonstop_tsc = true;
335 |     }
336 |     bool read_failed = !feof(f); // getline() gave up on an error, not at end of input
337 |     if (read_failed)
338 |         perror("getline");       // report before free()/pclose() can touch errno
339 |     free(line);                  // getline() allocated this buffer; release it on every path
340 |     int r = pclose(f);
341 |     if (read_failed)
342 |         return -1;
343 |     if (r == -1) {
344 |         perror("pclose");
345 |         return -1;
346 |     }
347 |     r = 0;
348 |     if (!constant_tsc) {
349 |         fprintf(stderr, "CPU doesn't support a constant TSC\n");
350 |         r = 1;
351 |     }
352 |     if (!nonstop_tsc) {
353 |         fprintf(stderr, "CPU's TSC stops in sleep states\n");
354 |         r = 1;
355 |     }
356 |     return r;
357 | }
358 | 
359 | // Reads the nr_involuntary_switches value of thread `tid` in process `pid`
360 | // from /proc/<pid>/task/<tid>/sched into w->invol_switch.
361 | // Returns 0 on success and -1 (after printing the reason) otherwise.
362 | // Note that this file is gone after the thread returned from its main
363 | // function, i.e. even before the parent called pthread_join()
364 | static int read_proc_sched(unsigned pid, unsigned tid, Worker *w)
365 | {
366 |     char filename[64];
367 |     snprintf(filename, sizeof filename, "/proc/%u/task/%u/sched", pid, tid);
368 |     int fd = open(filename, O_RDONLY);
369 |     if (fd == -1) {
370 |         perror("opening /proc/%u/task/%u/sched");
371 |         return -1;
372 |     }
373 |     char buf[4*1024] = {0};
374 |     ssize_t n = read(fd, buf, sizeof buf);
375 |     if (n == -1) {
376 |         perror("reading /proc/%u/task/%u/sched");
377 |         close(fd);
378 |         return -1;
379 |     }
380 |     // buf holds all we need: close now so no parse-failure return leaks fd
381 |     if (close(fd) == -1) {
382 |         perror("closing /proc/%u/task/%u/sched");
383 |         return -1;
384 |     }
385 |     const char q[] = "nr_involuntary_switches";
386 |     char *p = memmem(buf, n, q, sizeof q - 1);
387 |     if (!p) {
388 |         fprintf(stderr, "Couldn't find involuntary switches in /proc/.../sched\n");
389 |         return -1;
390 |     }
391 |     p += sizeof q - 1;
392 |     char *e = memchr(p, '\n', n - (p-buf));
393 |     if (!e) {
394 |         fprintf(stderr, "Couldn't find end in /proc/.../sched\n");
395 |         return -1;
396 |     }
397 |     *e = 0;
398 |     char *m = memrchr(p, ' ', e-p); // the number starts after the last space on that line
399 |     if (!m) {
400 |         fprintf(stderr, "Couldn't find begin in /proc/.../sched\n");
401 |         return -1;
402 |     }
403 |     w->invol_switch = atol(m + 1);
404 |     return 0;
405 | }
406 | 
407 | // Worker thread: spins on the TSC and records each gap > args.tsc_thresh
408 | // between two reads. Returns its Worker (p), or NULL if allocation failed.
409 | static void *worker_main(void *p)
410 | {
411 |     Worker *w = p;
412 |     Args args = global_args;
413 |     size_t n  = args.samples;
414 |     // uint32_t is big enough to store interruptions of up to ~ 1 s
415 |     // when using a TSC that runs at 4 GHz
416 |     uint32_t *ds = calloc(n, sizeof ds[0]);
417 |     if (!ds) {
418 |         fprintf(stderr, "Failed to allocate delta array on core %" PRIu32 "\n",
419 |                 w->cpu_id);
420 |         return NULL;
421 |     }
422 |     size_t i =  0;
423 |     while(!atomic_load_explicit(&start_work, memory_order_consume)) {
424 |         _mm_pause();
425 |     }
426 |     for (unsigned k = 0; k < 1000; ++k)
427 |         _mm_pause();
428 | 
429 |     uint64_t tsc_total_int = 0;
430 |     uint64_t tsc_overflow  = 0;
431 |     uint64_t tsc_thresh    = args.tsc_thresh;
432 |     uint64_t tsc_delta_min = UINT64_MAX;
433 |     uint64_t start = fenced_rdtsc();
434 |     uint64_t limit = start + args.tsc_runtime;
435 |     uint64_t tsc   = start;
436 | 
437 |     // unroll the loop one time for a more 'realistic' tsc_delta_min
438 |     if (tsc < limit) {
439 |         uint64_t t     = fenced_rdtscp();
440 |         uint64_t delta = t - tsc;
441 |         tsc = t;
442 |         if (delta > tsc_thresh) {
443 |             tsc_total_int += delta;
444 |             if (i < n) {
445 |                 ds[i] = delta > UINT32_MAX ? UINT32_MAX : delta;
446 |             } else if (!tsc_overflow) {
447 |                 tsc_overflow = t;
448 |             }
449 |             ++i;
450 |         }
451 |         if  (delta < tsc_delta_min)
452 |             tsc_delta_min = delta;
453 |     }
454 |     tsc_delta_min = UINT64_MAX; // throw the first tsc_delta_min away
455 |     while (tsc < limit) {
456 |         uint64_t t     =  fenced_rdtscp();
457 |         uint64_t delta = t - tsc; // 64 bit: saturate below instead of wrapping
458 |         tsc = t;
459 |         if (delta > tsc_thresh) {
460 |             tsc_total_int += delta;
461 |             if (i < n) {
462 |                 ds[i] = delta > UINT32_MAX ? UINT32_MAX : delta;
463 |             } else if (!tsc_overflow) {
464 |                 tsc_overflow = t; // remember when the delta array ran full
465 |             }
466 |             ++i;
467 |         }
468 |         if  (delta < tsc_delta_min)
469 |             tsc_delta_min = delta;
470 |     }
471 | 
472 |     while(!atomic_load_explicit(&quit_thread, memory_order_consume)) {
473 |         _mm_pause(); // stay alive until the main thread sets quit_thread
474 |     }
475 | 
476 |     w->deltas        = ds;
477 |     w->samples       = i < n ? i : n;
478 |     w->thresh_cnt    = i;
479 |     w->tsc_start     = start;
480 |     w->tsc_overflow  = tsc_overflow;
481 |     w->tsc_total_int = tsc_total_int - (tsc_delta_min*i);
482 |     w->tsc_delta_min = tsc_delta_min;
483 | 
484 |     for (size_t k = 0; k < w->samples; ++k) {
485 |         // Assuming that we have some loop iterations without any interruption
486 |         w->deltas[k] -= w->tsc_delta_min;
487 |     }
488 |     qsort(w->deltas, w->samples, sizeof w->deltas[0], cmp_u32);
489 | 
490 |     // no need to release/consume/acquire those values because
491 |     // the main thread calls pthread_join() before reading those values
492 |     // which acts as a memory barrier
493 | 
494 |     return w;
495 | }
496 | 
497 | // Pretty-print one result row per measured CPU to f; TSC ticks are
498 | // converted with mul_u64_u32_shr() using args->mult and args->shift.
499 | // Returns 0, or -1 if the scratch buffer can't be allocated.
500 | static int pp_results(const Worker *ws, FILE *f)
501 | {
502 |     Args *args = &global_args;
503 |     fprintf(f, " CPU  TSC_khz  #intr  #delta  ovfl_ns  invol_ctx  sum_intr_ns  iratio  rt_s  loop_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns   max_ns  mad_ns\n");
504 |     uint32_t *ys = 0; // scratch buffer handed to mad_u32(), reused per CPU
505 |     for (unsigned cpu = 0; cpu < args->cpus; ++cpu) {
506 |         if (!CPU_ISSET(cpu, &args->cpu_set))
507 |             continue;
508 |         const Worker *w = ws+cpu;
509 |         uint64_t intr_ns = mul_u64_u32_shr(w->tsc_total_int,
510 |                 args->mult, args->shift);
511 |         // realloc() keeps the old block on failure: assign only on success
512 |         uint32_t *t = realloc(ys,
513 |                 (w->samples ? w->samples : 1) * sizeof ys[0]);
514 |         if (!t) {
515 |             fprintf(stderr, "realloc in pp_results failed\n");
516 |             free(ys);
517 |             return -1;
518 |         }
519 |         ys = t;
520 |         uint32_t mad = mad_u32(w->deltas, ys, w->samples);
521 |         fprintf(f, "%4u %8" PRIu32 " %6" PRIu64 " %7" PRIu64
522 |                 " %8" PRIu64 " %10" PRIu64
523 |                 " %12" PRIu64 " %7.3f"
524 |                 " %5" PRIu32 " %8" PRIu64
525 |                 " %10" PRIu64
526 |                 " %7" PRIu64 " %7" PRIu64 " %7" PRIu64 " %7" PRIu64
527 |                 " %9" PRIu64
528 |                 " %8" PRIu64 " %7" PRIu64
529 |                 "\n",
530 |                 cpu, args->tsc_khz, w->thresh_cnt, w->samples,
531 |                 w->tsc_overflow ? mul_u64_u32_shr(w->tsc_overflow - w->tsc_start,
532 |                     args->mult, args->shift) : 0,
533 |                 w->invol_switch,
534 |                 intr_ns, (double)intr_ns/((double)args->runtime_s*1000000000),
535 |                 args->runtime_s,
536 |                 mul_u64_u32_shr(w->tsc_delta_min, args->mult, args->shift),
537 |                 mul_u64_u32_shr(percentile_u32(w->deltas, w->samples, 1, 2),
538 |                     args->mult, args->shift),
539 |                 mul_u64_u32_shr(percentile_u32(w->deltas, w->samples, 1, 5),
540 |                     args->mult, args->shift),
541 |                 mul_u64_u32_shr(percentile_u32(w->deltas, w->samples, 4, 5),
542 |                     args->mult, args->shift),
543 |                 mul_u64_u32_shr(percentile_u32(w->deltas, w->samples, 90, 100),
544 |                     args->mult, args->shift),
545 |                 mul_u64_u32_shr(percentile_u32(w->deltas, w->samples, 99, 100),
546 |                     args->mult, args->shift),
547 |                 mul_u64_u32_shr(percentile_u32(w->deltas, w->samples, 999, 1000),
548 |                     args->mult, args->shift),
549 |                 mul_u64_u32_shr(w->samples ? w->deltas[w->samples - 1] : 0,
550 |                         args->mult, args->shift),
551 |                 mul_u64_u32_shr(mad, args->mult, args->shift)
552 |                );
553 |     }
554 |     free(ys);
555 |     return 0;
556 | }
557 | 
558 | // Start one pinned worker thread for each CPU selected in args->cpu_set and
559 | // store its TID in ws[cpu].tid. Returns 0 on success, 1 on the first failure.
560 | static int create_workers(Worker *ws)
561 | {
562 |     Args *args = &global_args;
563 |     for (unsigned cpu = 0; cpu < args->cpus; ++cpu) {
564 |         ws[cpu].cpu_id = cpu;
565 |         // => no extra synchronization: pthread_create() is a memory barrier
566 |         if (!CPU_ISSET(cpu, &args->cpu_set))
567 |             continue;
568 |         pthread_attr_t attr;
569 |         int r = pthread_attr_init(&attr);
570 |         if (r) {
571 |             perror_e(r, "pthread_attr_init failed");
572 |             return 1;
573 |         }
574 |         cpu_set_t cpus;
575 |         CPU_ZERO(&cpus);
576 |         CPU_SET(cpu, &cpus);
577 |         r = pthread_attr_setaffinity_np(&attr, sizeof cpus, &cpus);
578 |         if (r) {
579 |             perror_e(r, "pthread_attr_setaffinity_np failed");
580 |             return 1;
581 |         }
582 |         if (args->sched_policy) {
583 |             r = pthread_attr_setschedpolicy(&attr, args->sched_policy);
584 |             if (r) {
585 |                 perror_e(r, "pthread_attr_setschedpolicy failed");
586 |                 return 1;
587 |             }
588 |             // without any prio pthread_create complains about 'Invalid argument'
589 |             struct sched_param param = { .sched_priority = args->sched_prio };
590 |             r = pthread_attr_setschedparam(&attr, &param);
591 |             if (r) {
592 |                 perror_e(r, "pthread_attr_setschedparam failed");
593 |                 return 1;
594 |             }
595 |             r = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
596 |             if (r) {
597 |                 perror_e(r, "pthread_attr_setinheritsched failed");
598 |                 return 1;
599 |             }
600 |         }
601 |         r = pthread_create(&ws[cpu].worker_id, &attr, worker_main, ws+cpu);
602 |         if (r) {
603 |             perror_e(r, "pthread_create failed");
604 |             return 1;
605 |         }
606 |         ws[cpu].tid = pthread_to_tid(ws[cpu].worker_id, args->tid_off);
607 |         if (!ws[cpu].tid) {
608 |             fprintf(stderr, "Couldn't get TID of created thread\n");
609 |             return 1;
610 |         }
611 |         r = pthread_attr_destroy(&attr);
612 |         if (r) {
613 |             perror_e(r, "pthread_attr_destroy failed");
614 |             return 1;
615 |         }
616 |     }
617 |     return 0;
618 | }
619 | 
620 | // Join every worker thread; returns 0 only if all of them joined and succeeded
621 | static int join_workers(Worker *ws)
622 | {
623 |     Args *cfg = &global_args;
624 |     unsigned failures = 0;
625 |     for (unsigned c = 0; c < cfg->cpus; ++c) {
626 |         if (CPU_ISSET(c, &cfg->cpu_set)) {
627 |             void *result = NULL;
628 |             int rc = pthread_join(ws[c].worker_id, &result);
629 |             if (rc) {
630 |                 perror_e(rc, "pthread_join failed");
631 |                 return 1;
632 |             }
633 |             // a NULL thread result counts as failure of that worker
634 |             failures += result == NULL;
635 |         }
636 |     }
637 |     if (!failures)
638 |         return 0;
639 |     fprintf(stderr, "One thread reported an error\n");
640 |     return 1;
641 | }
642 | 
643 | 
644 | // Entry point: check the TSC features, parse the arguments, run one worker
645 | // thread per selected CPU for the configured runtime and print the results.
646 | // Returns 0 on success and 1 on any error.
647 | int main(int argc, char **argv)
648 | {
649 |     // the measurement relies on a constant and nonstop TSC
650 |     if (check_cpuinfo()) {
651 |         fprintf(stderr, "CPU doesn't have constant_tsc+nonstop_tsc features\n");
652 |         return 1;
653 |     }
654 |     Args *cfg = &global_args;
655 |     if (parse_args(cfg, argc, argv)) {
656 |         fprintf(stderr, "Parsing arguments failed\n");
657 |         return 1;
658 |     }
659 |     if (set_params(cfg)) {
660 |         fprintf(stderr, "Setting parameters failed\n");
661 |         return 1;
662 |     }
663 | 
664 |     // one slot per CPU index below cfg->cpus
665 |     Worker *workers = calloc(cfg->cpus, sizeof workers[0]);
666 |     if (!workers) {
667 |         perror("workers allocation");
668 |         return 1;
669 |     }
670 |     if (create_workers(workers)) {
671 |         return 1;
672 |     }
673 | 
674 |     // release the workers
675 |     atomic_store_explicit(&start_work, true, memory_order_release);
676 | 
677 |     // sleep for the configured runtime plus 100 us
678 |     struct timespec pause = { .tv_sec = cfg->runtime_s, .tv_nsec = 100 * 1000};
679 |     if (nanosleep(&pause, NULL) == -1) {
680 |         perror("sleep of control thread was interrupted");
681 |         return 1;
682 |     }
683 | 
684 |     // collect the scheduler statistics of each selected CPU's thread
685 |     // before the threads are told to quit
686 |     for (unsigned c = 0; c < cfg->cpus; ++c) {
687 |         if (!CPU_ISSET(c, &cfg->cpu_set))
688 |             continue;
689 |         if (read_proc_sched(cfg->pid, workers[c].tid, workers + c))
690 |             return 1;
691 |     }
692 | 
693 |     // tell the workers to quit
694 |     atomic_store_explicit(&quit_thread, true, memory_order_release);
695 | 
696 |     if (join_workers(workers)) {
697 |         return 1;
698 |     }
699 | 
700 |     if (pp_results(workers, stdout)) {
701 |         return 1;
702 |     }
703 | 
704 |     free(workers);
705 | 
706 |     return 0;
707 | }
708 | 


--------------------------------------------------------------------------------
/pingpong.c:
--------------------------------------------------------------------------------
  1 | // pingpong - measure thread notification overhead
  2 | //
  3 | // 2019, Georg Sauthoff <mail@gms.tf>
  4 | //
  5 | // SPDX-License-Identifier: GPL-3.0-or-later
  6 | 
  7 | #define _GNU_SOURCE
  8 | 
  9 | #include <assert.h>
 10 | #include <stdio.h>
 11 | #include <stdlib.h>
 12 | #include <stdatomic.h>
 13 | #include <stdalign.h>
 14 | #include <stdbool.h>
 15 | #include <stdint.h>
 16 | #include <inttypes.h>
 17 | #include <string.h>
 18 | #include <pthread.h>
 19 | #include <unistd.h>
 20 | #include <x86intrin.h> // __rdtsc(), _mm_lfence(), ...
 21 | #include <sys/syscall.h>
 22 | #include <linux/futex.h>
 23 | #include <errno.h>
 24 | #include <semaphore.h>
 25 | 
 26 | #include "util.h"
 27 | #include "tsc.h"
 28 | 
 29 | static atomic_bool start_work; // the worker threads spin until this is true
 30 | 
 31 | // make sure that both variables go into different cachelines
 32 | // (intel/amd CPUs have 64 byte cache lines)
 33 | // without C11 support
 34 | //static _Atomic uint64_t g_tsc   __attribute__ ((aligned (64)));
 35 | //static alignas(64) _Atomic uint64_t g_tsc;
 36 | 
 37 | // Mailbox of the spinning methods: the sender stores its TSC value here
 38 | struct Cell {
 39 |     alignas(64) _Atomic uint64_t tsc; // 64 byte aligned, cf. the note above
 40 | };
 41 | typedef struct Cell Cell;
 42 | 
 43 | static Cell g_cell[2]; // element i is polled by the worker with init == i
 44 | 
 45 | // without C11 support:
 46 | // struct Item { ... } __attribute__ ((aligned (64)));
 47 | // Mailbox of the condition variable method, cf. cv_main()
 48 | struct Item {
 49 |     // aligning the first field is equivalent to aligning the struct itself
 50 |     alignas(64) pthread_mutex_t mutex; // held while tsc is read or written
 51 |     pthread_cond_t cond_var;           // signalled after tsc was updated
 52 |     uint64_t tsc;                      // TSC value stored by the sender
 53 | };
 54 | typedef struct Item Item;
 55 | 
 56 | static_assert(sizeof(Item) % 64 == 0, "Item is not aligned"); // whole multiples of 64 bytes
 57 | 
 58 | static Item g_item[2] = { // element i is waited on by the worker with init == i
 59 |     { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER },
 60 |     { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER }
 61 | };
 62 | 
 63 | static_assert(alignof(g_item) == 64, "Item array is not aligned");
 64 | 
 65 | static int g_pipes[2][2]; // pipe_main() reads g_pipes[init][0] and writes g_pipes[!init][1]
 66 | 
 67 | // NOTE(review): presumably the mailbox of the --futex method - confirm
 68 | struct Follicle {
 69 |     alignas(64) _Atomic int futex; // presumably 0: unlocked, 1: locked, cf. futex_lock() - confirm
 70 |     uint64_t tsc;
 71 | };
 72 | typedef struct Follicle Follicle;
 73 | static Follicle g_follicle[2];
 74 | 
 75 | static int
 76 | atomic_futex(_Atomic int *uaddr, int futex_op, int val,
 77 |       const struct timespec *timeout, int *uaddr2, int val3)
 78 | {
 79 |     (void)uaddr2;
 80 |     return syscall(SYS_futex, uaddr, futex_op, val, timeout, uaddr, val3);
 81 | }
 82 | 
 83 | // Acquire the lock *f (0: unlocked, 1: locked), sleeping in the kernel
 84 | // while it is taken. Returns 0 on success and -1 on a futex(2) error.
 85 | static int futex_lock(_Atomic int *f)
 86 | {
 87 |     for (;;) {
 88 |         int zero = 0;
 89 |         if (atomic_compare_exchange_weak(f, &zero, 1))
 90 |             return 0;
 91 |         int r = atomic_futex(f, FUTEX_WAIT_PRIVATE, 1, NULL, NULL, 0);
 92 |         // EAGAIN: *f changed in between, EINTR: a signal arrived => retry
 93 |         if (r == -1 && errno != EAGAIN && errno != EINTR)
 94 |             return r;
 95 |     }
 96 | }
 97 | 
 98 | // Release the lock *f and wake up at most one thread waiting for it.
 99 | // Returns 1 if one thread was woken up, otherwise whatever the wake call
100 | // returned (-1 on error), and -2 if *f wasn't locked at all.
101 | static int futex_unlock(_Atomic int *f)
102 | {
103 |     int locked = 1;
104 |     if (!atomic_compare_exchange_strong(f, &locked, 0)) {
105 |         return -2;
106 |     }
107 |     // at most one waiter is requested to be woken up
108 |     return atomic_futex(f, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
109 | }
110 | 
111 | struct Stripe { // mailbox of the semaphore method, cf. semaphore_main()
112 |     alignas(64) sem_t sem; // posted by the sender after it stored tsc
113 |     uint64_t tsc;          // TSC value the receiver reads after sem_wait()
114 | };
115 | typedef struct Stripe Stripe;
116 | static Stripe g_stripe[2]; // element i is waited on by the worker with init == i
117 | 
118 | enum Method { // notification mechanism, selected in parse_args()
119 |     METHOD_SPIN,            // --spin, also the zero-initialized default
120 |     METHOD_SPIN_PAUSE,      // --spin-pause
121 |     METHOD_SPIN_PAUSE_MORE, // --spin-pause together with a non-zero -p
122 |     METHOD_COND_VAR,        // --cv
123 |     METHOD_NULL,            // --null
124 |     METHOD_PIPE,            // --pipe
125 |     METHOD_FUTEX,           // --futex
126 |     METHOD_SEMAPHORE        // --sem
127 | };
128 | typedef enum Method Method;
129 | struct Args { // command line settings, filled by parse_args()
130 |     uint32_t tsc_khz; // value of --khz
131 |     uint32_t mult;    // NOTE(review): mult/shift presumably convert TSC ticks to ns - confirm
132 |     uint32_t shift;
133 |     unsigned n;    // number of iterations
134 |     unsigned k; // number of pause iterations before each store
135 |     unsigned p; // number of pause iterations after each test
136 |     unsigned pin[2]; // per thread: CPU number + 1; 0 presumably means not pinned
137 |     bool json; // set by --json
138 |     Method method; // selected notification mechanism
139 | };
140 | typedef struct Args Args;
141 | 
142 | // Print the usage message to f; argv0 is inserted as the program name.
143 | static void help(FILE *f, const char *argv0)
144 | {
145 |     fprintf(f, "pingpong - measure inter thread notification overhead\n\n"
146 |             "call: %s [OPT..]\n"
147 |             "\n"
148 |             "Options:\n"
149 |             "  --khz KHZ         TSC frequency (default: parse journalctl, read /proc)\n"
150 |             "  -n                ping-pong iterations (default: 10^6)\n"
151 |             "  -k                #iterations pause before storing (default: 1000)\n"
152 |             "  --pin THREAD CPU  0 <= THREAD <= 1, pin each thread to a CPU/core\n"
153 |             "                    (default: no pinning)\n"
154 |             "  --json            write raw values to JSON file (default: false)\n"
155 |             "  --spin            loop on an atomic variable (default)\n"
156 |             "  --spin-pause      pause after each atomic load\n"
157 |             "  -p                #pauses after each atomic load\n"
158 |             "  --cv              use a condition variable for ping pong\n"
159 |             "  --pipe            use a UNIX pipe for ping pong\n"
160 |             "  --futex           use a Linux futex for ping pong\n"
161 |             "  --sem             use a POSIX semaphore for ping pong\n"
162 |             "  --null            signal nothing\n"
163 |             "\n"
164 |             "2019, Georg Sauthoff <mail@gms.tf>, GPLv3+\n"
165 |             , argv0);
166 | }
167 | 
168 | // Parse the value following option argv[*i] into *v and advance *i to it
169 | static int next_option_value(int argc, char **argv, int *i, unsigned *v)
170 | {
171 |     const char *opt = argv[*i];
172 |     if (++*i >= argc) {
173 |         fprintf(stderr, "%s argument is missing\n", opt);
174 |         return -1;
175 |     }
176 |     *v = atoi(argv[*i]);
177 |     return 0;
178 | }
179 | 
180 | // Fill *args from the command line; -n and -k get defaults if unset.
181 | // Returns 0, or -1 on a bad option value; --help and unknown args exit.
182 | static int parse_args(Args *args, int argc, char **argv)
183 | {
184 |     *args = (const Args){0};
185 |     for (int i = 1; i < argc; ++i) {
186 |         const char *opt = argv[i];
187 |         unsigned khz = 0;
188 |         if (!strcmp(opt, "-h") || !strcmp(opt, "--help")) {
189 |             help(stdout, argv[0]);
190 |             exit(0);
191 |         } else if (!strcmp(opt, "--khz")) {
192 |             if (next_option_value(argc, argv, &i, &khz))
193 |                 return -1;
194 |             args->tsc_khz = khz;
195 |         } else if (!strcmp(opt, "-n")) {
196 |             if (next_option_value(argc, argv, &i, &args->n))
197 |                 return -1;
198 |         } else if (!strcmp(opt, "-k")) {
199 |             if (next_option_value(argc, argv, &i, &args->k))
200 |                 return -1;
201 |         } else if (!strcmp(opt, "-p")) {
202 |             if (next_option_value(argc, argv, &i, &args->p))
203 |                 return -1;
204 |         } else if (!strcmp(opt, "--pin")) {
205 |             if (i+2 >= argc) {
206 |                 fprintf(stderr, "--pin THREAD CPU arguments are missing\n");
207 |                 return -1;
208 |             }
209 |             unsigned thread = atoi(argv[++i]);
210 |             if (thread > 1) {
211 |                 fprintf(stderr, "--pin THREAD CPU - 0 <= THREAD <= 1\n");
212 |                 return -1;
213 |             }
214 |             args->pin[thread] = (unsigned)atoi(argv[++i]) + 1;
215 |         } else if (!strcmp(opt, "--json")) {
216 |             args->json = true;
217 |         } else if (!strcmp(opt, "--spin")) {
218 |             args->method = METHOD_SPIN;
219 |         } else if (!strcmp(opt, "--spin-pause")) {
220 |             args->method = METHOD_SPIN_PAUSE;
221 |         } else if (!strcmp(opt, "--cv")) {
222 |             args->method = METHOD_COND_VAR;
223 |         } else if (!strcmp(opt, "--null")) {
224 |             args->method = METHOD_NULL;
225 |         } else if (!strcmp(opt, "--pipe")) {
226 |             args->method = METHOD_PIPE;
227 |         } else if (!strcmp(opt, "--futex")) {
228 |             args->method = METHOD_FUTEX;
229 |         } else if (!strcmp(opt, "--sem")) {
230 |             args->method = METHOD_SEMAPHORE;
231 |         } else {
232 |             fprintf(stderr, "Unknown argument: %s\n", opt);
233 |             exit(1);
234 |         }
235 |     }
236 |     if (!args->n)
237 |         args->n = 1000 * 1000;
238 |     if (!args->k)
239 |         args->k = 1000;
240 |     if (args->method == METHOD_SPIN_PAUSE && args->p)
241 |         args->method = METHOD_SPIN_PAUSE_MORE;
242 |     return 0;
243 | }
244 | 
245 | struct Worker { // parameters and results of one ping-pong thread
246 |     pthread_t worker_id;
247 |     unsigned init; // 0 -> start with send, 1 -> start with receive
248 |     unsigned n;    // number of iterations
249 |     unsigned k;    // base number of pause iterations before each send
250 |     unsigned p;    // pause iterations after each unsuccessful poll, cf. spin_pause_more_main()
251 |     uint32_t *raw_ds;  // delta values in the order they were measured
252 |     uint32_t *ds;  // delta values, sorted by spin_main_finalize()
253 |     unsigned ds_size; // #delta values
254 | };
255 | typedef struct Worker Worker;
256 | 
257 | // Store the j deltas in x: sorted (ds) and in measurement order (raw_ds)
258 | static void *spin_main_finalize(Worker *x, uint32_t *ds, unsigned j)
259 | {
260 |     assert(j <= x->n/2);
261 |     uint32_t *unsorted = malloc(j * sizeof unsorted[0]);
262 |     if (!unsorted) {
263 |         fprintf(stderr, "Failed to allocate delta array in thread\n");
264 |         return 0;
265 |     }
266 |     memcpy(unsorted, ds, j * sizeof ds[0]); // copy before ds gets sorted
267 |     qsort(ds, j, sizeof ds[0], cmp_u32);
268 |     x->raw_ds  = unsorted;
269 |     x->ds      = ds;
270 |     x->ds_size = j;
271 |     return x;
272 | }
273 | 
274 | // Spinning ping-pong worker: alternately sends a TSC value through g_cell
275 | // and busy-polls for the reply; each receive records the TSC difference.
276 | // Returns what spin_main_finalize() returns, or NULL if allocation failed.
277 | static void *spin_main(void *p)
278 | {
279 |     Worker *x = (Worker*) p;
280 |     Worker w = *x;
281 | 
282 |     uint64_t tsc = 1;
283 |     unsigned j = 0;
284 |     uint32_t *ds = calloc(w.n/2, sizeof ds[0]);
285 |     if (!ds) {
286 |         fprintf(stderr, "Failed to allocate delta array in thread\n");
287 |         return 0;
288 |     }
289 | 
290 |     while(!atomic_load_explicit(&start_work, memory_order_consume)) {
291 |         _mm_pause();
292 |     }
293 | 
294 |     for (unsigned i = 0; i < w.n; ++i) {
295 |         if (i % 2 == w.init) { // sender
296 |             unsigned k = i < 2 ? w.k : w.k * 2;
297 |             for (unsigned l = 0; l < k; ++l)
298 |                 _mm_pause();
299 |             uint64_t t;
300 |             do { // the stamp must be newer than the last one received
301 |                 t = fenced_rdtsc();
302 |             } while (t <= tsc);
303 |             atomic_store_explicit(&g_cell[!w.init].tsc, t,
304 |                     memory_order_release);
305 |         } else { // receiver
306 |             uint64_t new_tsc;
307 |             for (;;) {
308 |                 new_tsc = atomic_load_explicit(&g_cell[w.init].tsc,
309 |                         memory_order_consume);
310 |                 if (new_tsc > tsc)
311 |                     break;
312 |             }
313 |             uint64_t now = fenced_rdtscp();
314 |             // an odd n yields (n+1)/2 receives for init == 1, ds holds n/2
315 |             if (j < w.n/2)
316 |                 ds[j++] = now - new_tsc;
317 |             tsc = new_tsc;
318 |         }
319 |     }
320 |     return spin_main_finalize(x, ds, j);
321 | }
322 | 
323 | // Baseline worker that signals nothing: each of the n/2 samples is just
324 | // the distance between two back-to-back TSC reads of the same thread.
325 | static void *spin_null_main(void *p)
326 | {
327 |     Worker *self = p;
328 |     const unsigned samples = self->n / 2;
329 |     uint32_t *deltas = calloc(samples, sizeof deltas[0]);
330 |     if (!deltas) {
331 |         fprintf(stderr, "Failed to allocate delta array in thread\n");
332 |         return 0;
333 |     }
334 | 
335 |     // wait for the common start signal
336 |     while (!atomic_load_explicit(&start_work, memory_order_consume))
337 |         _mm_pause();
338 | 
339 |     // no partner is involved, i.e. this measures the pure read overhead
340 |     unsigned count = 0;
341 |     while (count < samples) {
342 |         uint64_t first  = fenced_rdtsc();
343 |         uint64_t second = fenced_rdtscp();
344 |         deltas[count++] = second - first;
345 |     }
346 |     return spin_main_finalize(self, deltas, count);
347 | }
348 | 
349 | // Ping-pong worker like spin_main(), but the receiver executes one pause
350 | // instruction after each unsuccessful poll of its g_cell element.
351 | // Returns what spin_main_finalize() returns, or NULL if allocation failed.
352 | static void *spin_pause_main(void *p)
353 | {
354 |     Worker *x = (Worker*) p;
355 |     Worker w = *x;
356 | 
357 |     uint64_t tsc = 1;
358 |     unsigned j = 0;
359 |     uint32_t *ds = calloc(w.n/2, sizeof ds[0]);
360 |     if (!ds) {
361 |         fprintf(stderr, "Failed to allocate delta array in thread\n");
362 |         return 0;
363 |     }
364 | 
365 |     while(!atomic_load_explicit(&start_work, memory_order_consume)) {
366 |         _mm_pause();
367 |     }
368 | 
369 |     for (unsigned i = 0; i < w.n; ++i) {
370 |         if (i % 2 == w.init) { // sender
371 |             unsigned k = i < 2 ? w.k : w.k * 2;
372 |             for (unsigned l = 0; l < k; ++l)
373 |                 _mm_pause();
374 |             uint64_t t;
375 |             do { // the stamp must be newer than the last one received
376 |                 t = fenced_rdtsc();
377 |             } while (t <= tsc);
378 |             atomic_store_explicit(&g_cell[!w.init].tsc, t,
379 |                     memory_order_release);
380 |         } else { // receiver
381 |             uint64_t new_tsc;
382 |             for (;;) {
383 |                 new_tsc = atomic_load_explicit(&g_cell[w.init].tsc,
384 |                         memory_order_consume);
385 |                 if (new_tsc > tsc)
386 |                     break;
387 |                 _mm_pause();
388 |             }
389 |             uint64_t now = fenced_rdtscp();
390 |             // an odd n yields (n+1)/2 receives for init == 1, ds holds n/2
391 |             if (j < w.n/2)
392 |                 ds[j++] = now - new_tsc;
393 |             tsc = new_tsc;
394 |         }
395 |     }
396 |     return spin_main_finalize(x, ds, j);
397 | }
398 | 
399 | // Ping-pong worker like spin_pause_main(), but the receiver executes w.p
400 | // pause instructions after each unsuccessful poll of its g_cell element.
401 | // Returns what spin_main_finalize() returns, or NULL if allocation failed.
402 | static void *spin_pause_more_main(void *p)
403 | {
404 |     Worker *x = (Worker*) p;
405 |     Worker w = *x;
406 | 
407 |     uint64_t tsc = 1;
408 |     unsigned j = 0;
409 |     uint32_t *ds = calloc(w.n/2, sizeof ds[0]);
410 |     if (!ds) {
411 |         fprintf(stderr, "Failed to allocate delta array in thread\n");
412 |         return 0;
413 |     }
414 | 
415 |     while(!atomic_load_explicit(&start_work, memory_order_consume)) {
416 |         _mm_pause();
417 |     }
418 | 
419 |     for (unsigned i = 0; i < w.n; ++i) {
420 |         if (i % 2 == w.init) { // sender
421 |             unsigned k = i < 2 ? w.k : w.k * 2;
422 |             for (unsigned l = 0; l < k; ++l)
423 |                 _mm_pause();
424 |             uint64_t t;
425 |             do { // the stamp must be newer than the last one received
426 |                 t = fenced_rdtsc();
427 |             } while (t <= tsc);
428 |             atomic_store_explicit(&g_cell[!w.init].tsc, t,
429 |                     memory_order_release);
430 |         } else { // receiver
431 |             uint64_t new_tsc;
432 |             for (;;) {
433 |                 new_tsc = atomic_load_explicit(&g_cell[w.init].tsc,
434 |                         memory_order_consume);
435 |                 if (new_tsc > tsc)
436 |                     break;
437 |                 for (unsigned l = 0; l < w.p; ++l)
438 |                     _mm_pause();
439 |             }
440 |             uint64_t now = fenced_rdtscp();
441 |             // an odd n yields (n+1)/2 receives for init == 1, ds holds n/2
442 |             if (j < w.n/2)
443 |                 ds[j++] = now - new_tsc;
444 |             tsc = new_tsc;
445 |         }
446 |     }
447 |     return spin_main_finalize(x, ds, j);
448 | }
449 | 
450 | // Ping-pong worker that passes the TSC value through g_item: the sender
451 | // stores it under the mutex and signals the condition variable the receiver
452 | // waits on. Returns like spin_main_finalize(), or NULL on any error.
453 | static void *cv_main(void *p)
454 | {
455 |     Worker *x = (Worker*) p;
456 |     Worker w = *x;
457 | 
458 |     uint64_t tsc = 1;
459 |     unsigned j = 0;
460 |     uint32_t *ds = calloc(w.n/2, sizeof ds[0]);
461 |     if (!ds) {
462 |         fprintf(stderr, "Failed to allocate delta array in thread\n");
463 |         return 0;
464 |     }
465 | 
466 |     while(!atomic_load_explicit(&start_work, memory_order_consume)) {
467 |         _mm_pause();
468 |     }
469 | 
470 |     for (unsigned i = 0; i < w.n; ++i) {
471 |         if (i % 2 == w.init) { // sender
472 |             unsigned k = i < 2 ? w.k : w.k * 2;
473 |             for (unsigned l = 0; l < k; ++l)
474 |                 _mm_pause();
475 |             uint64_t t;
476 |             do { // the stamp must be newer than the last one received
477 |                 t = fenced_rdtsc();
478 |             } while (t <= tsc);
479 |             int r = pthread_mutex_lock(&g_item[!w.init].mutex);
480 |             if (r) {
481 |                 perror_e(r, "sender: mutex lock");
482 |                 return 0;
483 |             }
484 |             g_item[!w.init].tsc = t;
485 |             r = pthread_mutex_unlock(&g_item[!w.init].mutex);
486 |             if (r) {
487 |                 perror_e(r, "sender: mutex unlock");
488 |                 return 0;
489 |             }
490 |             r = pthread_cond_signal(&g_item[!w.init].cond_var);
491 |             if (r) {
492 |                 perror_e(r, "sender: cond signal");
493 |                 return 0;
494 |             }
495 |         } else { // receiver
496 |             int r = pthread_mutex_lock(&g_item[w.init].mutex);
497 |             if (r) {
498 |                 perror_e(r, "retrieve: mutex lock");
499 |                 return 0;
500 |             }
501 |             while (g_item[w.init].tsc <= tsc) {
502 |                 r = pthread_cond_wait(&g_item[w.init].cond_var,
503 |                         &g_item[w.init].mutex);
504 |                 if (r) {
505 |                     perror_e(r, "cond_wait");
506 |                     return 0;
507 |                 }
508 |             }
509 |             uint64_t new_tsc = g_item[w.init].tsc;
510 |             r = pthread_mutex_unlock(&g_item[w.init].mutex);
511 |             if (r) {
512 |                 perror_e(r, "retrieve: mutex unlock");
513 |                 return 0;
514 |             }
515 |             uint64_t now = fenced_rdtscp();
516 |             // an odd n yields (n+1)/2 receives for init == 1, ds holds n/2
517 |             if (j < w.n/2)
518 |                 ds[j++] = now - new_tsc;
519 |             tsc = new_tsc;
520 |         }
521 |     }
522 |     return spin_main_finalize(x, ds, j);
523 | }
524 | 
525 | // Ping-pong worker that writes the TSC value into the partner's pipe and
526 | // read()s its own one. Returns like spin_main_finalize(), NULL on any error.
527 | static void *pipe_main(void *p)
528 | {
529 |     Worker *x = (Worker*) p;
530 |     Worker w = *x;
531 | 
532 |     uint64_t tsc = 1;
533 |     unsigned j = 0;
534 |     uint32_t *ds = calloc(w.n/2, sizeof ds[0]);
535 |     if (!ds) {
536 |         fprintf(stderr, "Failed to allocate delta array in thread\n");
537 |         return 0;
538 |     }
539 | 
540 |     while(!atomic_load_explicit(&start_work, memory_order_consume)) {
541 |         _mm_pause();
542 |     }
543 | 
544 |     for (unsigned i = 0; i < w.n; ++i) {
545 |         if (i % 2 == w.init) { // sender
546 |             unsigned k = i < 2 ? w.k : w.k * 2;
547 |             for (unsigned l = 0; l < k; ++l)
548 |                 _mm_pause();
549 |             uint64_t t;
550 |             do { // the stamp must be newer than the last one received
551 |                 t = fenced_rdtsc();
552 |             } while (t <= tsc);
553 |             ssize_t l = write(g_pipes[!w.init][1], &t, sizeof t);
554 |             if (l == -1) {
555 |                 perror("pipe write");
556 |                 return 0;
557 |             }
558 |             if (l != sizeof t) {
559 |                 fprintf(stderr, "written into pipe less than expected\n");
560 |                 return 0;
561 |             }
562 |         } else { // receiver
563 |             uint64_t new_tsc;
564 |             ssize_t l = read(g_pipes[w.init][0], &new_tsc, sizeof new_tsc);
565 |             if (l == -1) {
566 |                 perror("pipe read");
567 |                 return 0;
568 |             }
569 |             if (l != sizeof new_tsc) {
570 |                 fprintf(stderr, "read from pipe less than expected\n");
571 |                 return 0;
572 |             }
573 |             uint64_t now = fenced_rdtscp();
574 |             // an odd n yields (n+1)/2 receives for init == 1, ds holds n/2
575 |             if (j < w.n/2)
576 |                 ds[j++] = now - new_tsc;
577 |             tsc = new_tsc;
578 |         }
579 |     }
580 |     return spin_main_finalize(x, ds, j);
581 | }
582 | 
583 | static void *semaphore_main(void *p)
584 | {
585 |     Worker *x = (Worker*) p;
586 |     Worker w = *x;
587 | 
588 |     uint64_t tsc = 1;
589 |     unsigned j = 0;
590 |     uint32_t *ds = calloc(w.n/2, sizeof ds[0]);
591 |     if (!ds) {
592 |         fprintf(stderr, "Failed to allocate delta array in thread\n");
593 |         return 0;
594 |     }
595 | 
596 |     while(!atomic_load_explicit(&start_work, memory_order_consume)) {
597 |         _mm_pause();
598 |     }
599 | 
600 |     for (unsigned i = 0; i < w.n; ++i) {
601 |         if (i % 2 == w.init) { // sender
602 |             int r = sem_wait(&g_stripe[w.init].sem);
603 |             if (r == -1) {
604 |                 perror("sem wait");
605 |                 return 0;
606 |             }
607 | 
608 |             unsigned k = i < 2 ? w.k : w.k * 2;
609 |             for (unsigned j = 0; j < k; ++j)
610 |                 _mm_pause();
611 |             uint64_t t;
612 |             for (;;) {
613 |                 t = fenced_rdtsc();
614 |                 if (t <= tsc)
615 |                     continue;
616 |                 g_stripe[!w.init].tsc = t;
617 | 
618 |                 int r = sem_post(&g_stripe[!w.init].sem);
619 |                 if (r == -1) {
620 |                     perror("sem post");
621 |                     return 0;
622 |                 }
623 | 
624 |                 break;
625 |             }
626 |         } else { // receiver
627 |             uint64_t new_tsc;
628 | 
629 |             int r = sem_wait(&g_stripe[w.init].sem);
630 |             if (r == -1) {
631 |                 perror("sem wait");
632 |                 return 0;
633 |             }
634 |             new_tsc = g_stripe[w.init].tsc;
635 | 
636 |             uint64_t now   = fenced_rdtscp();
637 |             uint64_t delta = now - new_tsc;
638 |             ds[j++] = delta;
639 |             tsc = new_tsc;
640 | 
641 |             r = sem_post(&g_stripe[w.init].sem);
642 |             if (r == -1) {
643 |                 perror("sem post");
644 |                 return 0;
645 |             }
646 |         }
647 |     }
648 |     return spin_main_finalize(x, ds, j);
649 | }
650 | 
651 | // note that this lock/unlock scheme doesn't work with posix mutexes
652 | // because unlocking a locked posix mutex from a different thread
653 | // is undefined behaviour
654 | static void *futex_main(void *p)
655 | {
656 |     Worker *x = (Worker*) p;
657 |     Worker w = *x;
658 | 
659 |     uint64_t tsc = 1;
660 |     unsigned j = 0;
661 |     uint32_t *ds = calloc(w.n/2, sizeof ds[0]);
662 |     if (!ds) {
663 |         fprintf(stderr, "Failed to allocate delta array in thread\n");
664 |         return 0;
665 |     }
666 | 
667 |     while(!atomic_load_explicit(&start_work, memory_order_consume)) {
668 |         _mm_pause();
669 |     }
670 | 
671 |     for (unsigned i = 0; i < w.n; ++i) {
672 |         if (i % 2 == w.init) { // sender
673 |             int r = futex_lock(&g_follicle[w.init].futex);
674 |             if (r == -1 ) {
675 |                 perror("futex wait");
676 |                 return 0;
677 |             }
678 | 
679 |             unsigned k = i < 2 ? w.k : w.k * 2;
680 |             for (unsigned j = 0; j < k; ++j)
681 |                 _mm_pause();
682 |             uint64_t t;
683 |             for (;;) {
684 |                 t = fenced_rdtsc();
685 |                 if (t <= tsc)
686 |                     continue;
687 |                 g_follicle[!w.init].tsc = t;
688 |                 int r = futex_unlock(&g_follicle[!w.init].futex);
689 |                 if (r == -1) {
690 |                     perror("futex wake");
691 |                     return 0;
692 |                 }
693 |                 if (r == -2) {
694 |                     fprintf(stderr, "%u: unexpectedly unlocked\n", w.init);
695 |                     abort();
696 |                 }
697 |                 break;
698 |             }
699 |         } else { // receiver
700 |             uint64_t new_tsc;
701 | 
702 |             int r = futex_lock(&g_follicle[w.init].futex);
703 |             if (r == -1 ) {
704 |                 perror("futex wait");
705 |                 return 0;
706 |             }
707 |             new_tsc = g_follicle[w.init].tsc;
708 | 
709 |             uint64_t now   = fenced_rdtscp();
710 |             uint64_t delta = now - new_tsc;
711 |             ds[j++] = delta;
712 |             tsc = new_tsc;
713 | 
714 |             r = futex_unlock(&g_follicle[w.init].futex);
715 |             if (r == -1 ) {
716 |                 perror("futex wake");
717 |                 return 0;
718 |             }
719 |             if (r == -2) {
720 |                 fprintf(stderr, "%u: unexpectedly unlocked\n", w.init);
721 |                 abort();
722 |             }
723 |         }
724 |     }
725 |     return spin_main_finalize(x, ds, j);
726 | }
727 | 
728 | static int print_json(const Args *args, const Worker *ws, FILE *f)
729 | {
730 |     fprintf(f, "[\n");
731 |     for (unsigned i = 0; i < 2; ++i) {
732 |         const Worker *w = ws + i;
733 |         fprintf(f, "    [");
734 |         if (w->ds_size) {
735 |             fprintf(f, " %" PRIu64,
736 |                     mul_u64_u32_shr(w->raw_ds[0], args->mult, args->shift));
737 |         }
738 |         for (unsigned j = 1; j < w->ds_size; ++j) {
739 |             fprintf(f, ", %" PRIu64,
740 |                     mul_u64_u32_shr(w->raw_ds[j], args->mult, args->shift));
741 |         }
742 |         fprintf(f, "]");
743 |         if (!i)
744 |             fprintf(f, ",\n");
745 |     }
746 |     fprintf(f, "\n]\n");
747 |     return 0;
748 | }
749 | 
750 | static int pp_results(const Args *args, const Worker *ws, FILE *f)
751 | {
752 |     fprintf(f, "Thread  TSC_khz  #delta  min_ns  max_ns  median_ns  p20_ns  p80_ns  p90_ns  p99_ns  p99.9_ns  mad_ns\n");
753 |     uint32_t *ys = 0;
754 |     for (unsigned i = 0; i < 2; ++i) {
755 |         const Worker *w = ws + i;
756 |         ys = realloc(ys, w->ds_size * sizeof ys[0]);
757 |         if (!ys) {
758 |             fprintf(stderr, "realloc in pp_results failed\n");
759 |             return -1;
760 |         }
761 |         uint32_t mad = mad_u32(w->ds, ys, w->ds_size);
762 |         if (!w->ds_size)
763 |             continue;
764 |         fprintf(f, "%6u %8" PRIu32  " %7u "
765 |                 "%7" PRIu64 " "
766 |                 "%7" PRIu64 " "
767 |                 "%10" PRIu64 " "
768 |                 "%7" PRIu64 " "
769 |                 "%7" PRIu64 " "
770 |                 "%7" PRIu64 " "
771 |                 "%7" PRIu64 " "
772 |                 "%9" PRIu64 " "
773 |                 "%7" PRIu64 " "
774 |                 "\n",
775 |                 i, args->tsc_khz, w->ds_size,
776 |                 mul_u64_u32_shr(w->ds[0],
777 |                     args->mult, args->shift),
778 |                 mul_u64_u32_shr(w->ds[w->ds_size - 1],
779 |                     args->mult, args->shift),
780 |                 mul_u64_u32_shr(percentile_u32(w->ds, w->ds_size, 1, 2),
781 |                     args->mult, args->shift),
782 |                 mul_u64_u32_shr(percentile_u32(w->ds, w->ds_size, 1, 5),
783 |                     args->mult, args->shift),
784 |                 mul_u64_u32_shr(percentile_u32(w->ds, w->ds_size, 4, 5),
785 |                     args->mult, args->shift),
786 |                 mul_u64_u32_shr(percentile_u32(w->ds, w->ds_size, 90, 100),
787 |                     args->mult, args->shift),
788 |                 mul_u64_u32_shr(percentile_u32(w->ds, w->ds_size, 99, 100),
789 |                     args->mult, args->shift),
790 |                 mul_u64_u32_shr(percentile_u32(w->ds, w->ds_size, 999, 1000),
791 |                     args->mult, args->shift),
792 |                 mul_u64_u32_shr(mad, args->mult, args->shift)
793 |                );
794 |     }
795 |     free(ys);
796 |     return 0;
797 | }
798 | 
799 | static int spin_pingpong(const Args *args)
800 | {
801 |     Worker ws[2] = {0};
802 |     for (unsigned i = 0; i < 2; ++i) {
803 |         ws[i].n = args->n;
804 |         ws[i].k = args->k;
805 |         ws[i].p = args->p;
806 |         ws[i].init = i;
807 |         pthread_attr_t attr;
808 |         int r = pthread_attr_init(&attr);
809 |         if (r) {
810 |             perror_e(r, "pthread_attr_init failed");
811 |             return 1;
812 |         }
813 |         if (args->pin[i]) {
814 |             cpu_set_t cpus;
815 |             CPU_ZERO(&cpus);
816 |             CPU_SET(args->pin[i] - 1, &cpus);
817 |             r = pthread_attr_setaffinity_np(&attr, sizeof cpus, &cpus);
818 |             if (r) {
819 |                 perror_e(r, "pthread_attr_setaffinity_np failed");
820 |                 return 1;
821 |             }
822 |         }
823 |         switch (args->method) {
824 |             case METHOD_SPIN:
825 |                 r = pthread_create(&ws[i].worker_id, &attr, spin_main, ws+i);
826 |                 break;
827 |             case METHOD_SPIN_PAUSE:
828 |                 r = pthread_create(&ws[i].worker_id, &attr, spin_pause_main,
829 |                         ws+i);
830 |                 break;
831 |             case METHOD_SPIN_PAUSE_MORE:
832 |                 r = pthread_create(&ws[i].worker_id, &attr,
833 |                         spin_pause_more_main, ws+i);
834 |                 break;
835 |             case METHOD_COND_VAR:
836 |                 r = pthread_create(&ws[i].worker_id, &attr, cv_main, ws+i);
837 |                 break;
838 |             case METHOD_PIPE:
839 |                 r = pipe(g_pipes[i]);
840 |                 if (r == -1) {
841 |                     perror("pipe");
842 |                     return 1;
843 |                 }
844 |                 r = pthread_create(&ws[i].worker_id, &attr, pipe_main, ws+i);
845 |                 break;
846 |             case METHOD_FUTEX:
847 |                 g_follicle[i].futex = i;
848 |                 r = pthread_create(&ws[i].worker_id, &attr, futex_main, ws+i);
849 |                 break;
850 |             case METHOD_SEMAPHORE:
851 |                 r = sem_init(&g_stripe[i].sem, 0, !i);
852 |                 if (r == -1) {
853 |                     perror("sem_init");
854 |                     return 1;
855 |                 }
856 |                 r = pthread_create(&ws[i].worker_id, &attr, semaphore_main, ws+i);
857 |                 break;
858 |             case METHOD_NULL:
859 |                 r = pthread_create(&ws[i].worker_id, &attr, spin_null_main,
860 |                         ws+i);
861 |                 break;
862 |         }
863 |         if (r) {
864 |             perror_e(r, "pthread_create failed");
865 |             return 1;
866 |         }
867 |         r = pthread_attr_destroy(&attr);
868 |         if (r) {
869 |             perror_e(r, "pthread_attr_init failed");
870 |             return 1;
871 |         }
872 |     }
873 | 
874 |     atomic_store_explicit(&start_work, true, memory_order_release);
875 | 
876 |     bool error_in_thread = false;
877 |     for (unsigned i = 0; i < 2; ++i) {
878 |         void *w_ret = 0;
879 |         int r = pthread_join(ws[i].worker_id, &w_ret);
880 |         if (r) {
881 |             perror_e(r, "pthread_join failed");
882 |             return 1;
883 |         }
884 |         if (!w_ret)
885 |             error_in_thread = true;
886 |     }
887 |     if (error_in_thread) {
888 |         fprintf(stderr, "One thread reported an error\n");
889 |         return 1;
890 |     }
891 |     if (args->json)
892 |         print_json(args, ws, stdout);
893 |     else
894 |         pp_results(args, ws, stdout);
895 |     for (unsigned i = 0; i < 2; ++i) {
896 |         free(ws[i].ds);
897 |         free(ws[i].raw_ds);
898 |     }
899 |     return 0;
900 | }
901 | 
902 | 
903 | int main(int argc, char **argv)
904 | {
905 |     Args args;
906 |     int r = parse_args(&args, argc, argv);
907 |     if (r) {
908 |         return 1;
909 |     }
910 |     if (!args.tsc_khz) {
911 |         int r = get_tsc_khz(&args.tsc_khz);
912 |         if (r < 0)
913 |             return 1;
914 |     }
915 |     clocks_calc_mult_shift(&args.mult, &args.shift,
916 |             args.tsc_khz, 1000000l, 0);
917 | 
918 |     r = spin_pingpong(&args);
919 |     if (r)
920 |         return 1;
921 |     return 0;
922 | }
923 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU General Public License is a free, copyleft license for
 11 | software and other kinds of works.
 12 | 
 13 |   The licenses for most software and other practical works are designed
 14 | to take away your freedom to share and change the works.  By contrast,
 15 | the GNU General Public License is intended to guarantee your freedom to
 16 | share and change all versions of a program--to make sure it remains free
 17 | software for all its users.  We, the Free Software Foundation, use the
 18 | GNU General Public License for most of our software; it applies also to
 19 | any other work released this way by its authors.  You can apply it to
 20 | your programs, too.
 21 | 
 22 |   When we speak of free software, we are referring to freedom, not
 23 | price.  Our General Public Licenses are designed to make sure that you
 24 | have the freedom to distribute copies of free software (and charge for
 25 | them if you wish), that you receive source code or can get it if you
 26 | want it, that you can change the software or use pieces of it in new
 27 | free programs, and that you know you can do these things.
 28 | 
 29 |   To protect your rights, we need to prevent others from denying you
 30 | these rights or asking you to surrender the rights.  Therefore, you have
 31 | certain responsibilities if you distribute copies of the software, or if
 32 | you modify it: responsibilities to respect the freedom of others.
 33 | 
 34 |   For example, if you distribute copies of such a program, whether
 35 | gratis or for a fee, you must pass on to the recipients the same
 36 | freedoms that you received.  You must make sure that they, too, receive
 37 | or can get the source code.  And you must show them these terms so they
 38 | know their rights.
 39 | 
 40 |   Developers that use the GNU GPL protect your rights with two steps:
 41 | (1) assert copyright on the software, and (2) offer you this License
 42 | giving you legal permission to copy, distribute and/or modify it.
 43 | 
 44 |   For the developers' and authors' protection, the GPL clearly explains
 45 | that there is no warranty for this free software.  For both users' and
 46 | authors' sake, the GPL requires that modified versions be marked as
 47 | changed, so that their problems will not be attributed erroneously to
 48 | authors of previous versions.
 49 | 
 50 |   Some devices are designed to deny users access to install or run
 51 | modified versions of the software inside them, although the manufacturer
 52 | can do so.  This is fundamentally incompatible with the aim of
 53 | protecting users' freedom to change the software.  The systematic
 54 | pattern of such abuse occurs in the area of products for individuals to
 55 | use, which is precisely where it is most unacceptable.  Therefore, we
 56 | have designed this version of the GPL to prohibit the practice for those
 57 | products.  If such problems arise substantially in other domains, we
 58 | stand ready to extend this provision to those domains in future versions
 59 | of the GPL, as needed to protect the freedom of users.
 60 | 
 61 |   Finally, every program is threatened constantly by software patents.
 62 | States should not allow patents to restrict development and use of
 63 | software on general-purpose computers, but in those that do, we wish to
 64 | avoid the special danger that patents applied to a free program could
 65 | make it effectively proprietary.  To prevent this, the GPL assures that
 66 | patents cannot be used to render the program non-free.
 67 | 
 68 |   The precise terms and conditions for copying, distribution and
 69 | modification follow.
 70 | 
 71 |                        TERMS AND CONDITIONS
 72 | 
 73 |   0. Definitions.
 74 | 
 75 |   "This License" refers to version 3 of the GNU General Public License.
 76 | 
 77 |   "Copyright" also means copyright-like laws that apply to other kinds of
 78 | works, such as semiconductor masks.
 79 | 
 80 |   "The Program" refers to any copyrightable work licensed under this
 81 | License.  Each licensee is addressed as "you".  "Licensees" and
 82 | "recipients" may be individuals or organizations.
 83 | 
 84 |   To "modify" a work means to copy from or adapt all or part of the work
 85 | in a fashion requiring copyright permission, other than the making of an
 86 | exact copy.  The resulting work is called a "modified version" of the
 87 | earlier work or a work "based on" the earlier work.
 88 | 
 89 |   A "covered work" means either the unmodified Program or a work based
 90 | on the Program.
 91 | 
 92 |   To "propagate" a work means to do anything with it that, without
 93 | permission, would make you directly or secondarily liable for
 94 | infringement under applicable copyright law, except executing it on a
 95 | computer or modifying a private copy.  Propagation includes copying,
 96 | distribution (with or without modification), making available to the
 97 | public, and in some countries other activities as well.
 98 | 
 99 |   To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies.  Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 | 
103 |   An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License.  If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 | 
112 |   1. Source Code.
113 | 
114 |   The "source code" for a work means the preferred form of the work
115 | for making modifications to it.  "Object code" means any non-source
116 | form of a work.
117 | 
118 |   A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 | 
123 |   The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form.  A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 | 
134 |   The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities.  However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work.  For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 | 
147 |   The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 | 
151 |   The Corresponding Source for a work in source code form is that
152 | same work.
153 | 
154 |   2. Basic Permissions.
155 | 
156 |   All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met.  This License explicitly affirms your unlimited
159 | permission to run the unmodified Program.  The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work.  This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 | 
164 |   You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force.  You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright.  Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 | 
175 |   Conveying under any other circumstances is permitted solely under
176 | the conditions stated below.  Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 | 
179 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 | 
181 |   No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 | 
187 |   When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 | 
195 |   4. Conveying Verbatim Copies.
196 | 
197 |   You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 | 
205 |   You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 | 
208 |   5. Conveying Modified Source Versions.
209 | 
210 |   You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 | 
214 |     a) The work must carry prominent notices stating that you modified
215 |     it, and giving a relevant date.
216 | 
217 |     b) The work must carry prominent notices stating that it is
218 |     released under this License and any conditions added under section
219 |     7.  This requirement modifies the requirement in section 4 to
220 |     "keep intact all notices".
221 | 
222 |     c) You must license the entire work, as a whole, under this
223 |     License to anyone who comes into possession of a copy.  This
224 |     License will therefore apply, along with any applicable section 7
225 |     additional terms, to the whole of the work, and all its parts,
226 |     regardless of how they are packaged.  This License gives no
227 |     permission to license the work in any other way, but it does not
228 |     invalidate such permission if you have separately received it.
229 | 
230 |     d) If the work has interactive user interfaces, each must display
231 |     Appropriate Legal Notices; however, if the Program has interactive
232 |     interfaces that do not display Appropriate Legal Notices, your
233 |     work need not make them do so.
234 | 
235 |   A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit.  Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 | 
245 |   6. Conveying Non-Source Forms.
246 | 
247 |   You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 | 
252 |     a) Convey the object code in, or embodied in, a physical product
253 |     (including a physical distribution medium), accompanied by the
254 |     Corresponding Source fixed on a durable physical medium
255 |     customarily used for software interchange.
256 | 
257 |     b) Convey the object code in, or embodied in, a physical product
258 |     (including a physical distribution medium), accompanied by a
259 |     written offer, valid for at least three years and valid for as
260 |     long as you offer spare parts or customer support for that product
261 |     model, to give anyone who possesses the object code either (1) a
262 |     copy of the Corresponding Source for all the software in the
263 |     product that is covered by this License, on a durable physical
264 |     medium customarily used for software interchange, for a price no
265 |     more than your reasonable cost of physically performing this
266 |     conveying of source, or (2) access to copy the
267 |     Corresponding Source from a network server at no charge.
268 | 
269 |     c) Convey individual copies of the object code with a copy of the
270 |     written offer to provide the Corresponding Source.  This
271 |     alternative is allowed only occasionally and noncommercially, and
272 |     only if you received the object code with such an offer, in accord
273 |     with subsection 6b.
274 | 
275 |     d) Convey the object code by offering access from a designated
276 |     place (gratis or for a charge), and offer equivalent access to the
277 |     Corresponding Source in the same way through the same place at no
278 |     further charge.  You need not require recipients to copy the
279 |     Corresponding Source along with the object code.  If the place to
280 |     copy the object code is a network server, the Corresponding Source
281 |     may be on a different server (operated by you or a third party)
282 |     that supports equivalent copying facilities, provided you maintain
283 |     clear directions next to the object code saying where to find the
284 |     Corresponding Source.  Regardless of what server hosts the
285 |     Corresponding Source, you remain obligated to ensure that it is
286 |     available for as long as needed to satisfy these requirements.
287 | 
288 |     e) Convey the object code using peer-to-peer transmission, provided
289 |     you inform other peers where the object code and Corresponding
290 |     Source of the work are being offered to the general public at no
291 |     charge under subsection 6d.
292 | 
293 |   A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 | 
297 |   A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling.  In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage.  For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product.  A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 | 
310 |   "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source.  The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 | 
318 |   If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information.  But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 | 
329 |   The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed.  Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 | 
337 |   Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 | 
343 |   7. Additional Terms.
344 | 
345 |   "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law.  If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 | 
354 |   When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it.  (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.)  You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 | 
361 |   Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 | 
365 |     a) Disclaiming warranty or limiting liability differently from the
366 |     terms of sections 15 and 16 of this License; or
367 | 
368 |     b) Requiring preservation of specified reasonable legal notices or
369 |     author attributions in that material or in the Appropriate Legal
370 |     Notices displayed by works containing it; or
371 | 
372 |     c) Prohibiting misrepresentation of the origin of that material, or
373 |     requiring that modified versions of such material be marked in
374 |     reasonable ways as different from the original version; or
375 | 
376 |     d) Limiting the use for publicity purposes of names of licensors or
377 |     authors of the material; or
378 | 
379 |     e) Declining to grant rights under trademark law for use of some
380 |     trade names, trademarks, or service marks; or
381 | 
382 |     f) Requiring indemnification of licensors and authors of that
383 |     material by anyone who conveys the material (or modified versions of
384 |     it) with contractual assumptions of liability to the recipient, for
385 |     any liability that these contractual assumptions directly impose on
386 |     those licensors and authors.
387 | 
388 |   All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10.  If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term.  If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 | 
398 |   If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 | 
403 |   Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 | 
407 |   8. Termination.
408 | 
409 |   You may not propagate or modify a covered work except as expressly
410 | provided under this License.  Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 | 
415 |   However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 | 
422 |   Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 | 
429 |   Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License.  If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 | 
435 |   9. Acceptance Not Required for Having Copies.
436 | 
437 |   You are not required to accept this License in order to receive or
438 | run a copy of the Program.  Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance.  However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work.  These actions infringe copyright if you do
443 | not accept this License.  Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 | 
446 |   10. Automatic Licensing of Downstream Recipients.
447 | 
448 |   Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License.  You are not responsible
451 | for enforcing compliance by third parties with this License.
452 | 
453 |   An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations.  If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 | 
463 |   You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License.  For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 | 
471 |   11. Patents.
472 | 
473 |   A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based.  The
475 | work thus licensed is called the contributor's "contributor version".
476 | 
477 |   A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version.  For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 | 
487 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 | 
492 |   In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement).  To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 | 
499 |   If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients.  "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 | 
513 |   If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 | 
521 |   A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License.  You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 | 
536 |   Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 | 
540 |   12. No Surrender of Others' Freedom.
541 | 
542 |   If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License.  If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all.  For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 | 
552 |   13. Use with the GNU Affero General Public License.
553 | 
554 |   Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work.  The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 | 
563 |   14. Revised Versions of this License.
564 | 
565 |   The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time.  Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 | 
570 |   Each version is given a distinguishing version number.  If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation.  If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 | 
579 |   If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 | 
584 |   Later license versions may give you additional or different
585 | permissions.  However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 | 
589 |   15. Disclaimer of Warranty.
590 | 
591 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 | 
600 |   16. Limitation of Liability.
601 | 
602 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 | 
612 |   17. Interpretation of Sections 15 and 16.
613 | 
614 |   If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 | 
621 |                      END OF TERMS AND CONDITIONS
622 | 
623 |             How to Apply These Terms to Your New Programs
624 | 
625 |   If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 | 
629 |   To do so, attach the following notices to the program.  It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 | 
634 |     <one line to give the program's name and a brief idea of what it does.>
635 |     Copyright (C) <year>  <name of author>
636 | 
637 |     This program is free software: you can redistribute it and/or modify
638 |     it under the terms of the GNU General Public License as published by
639 |     the Free Software Foundation, either version 3 of the License, or
640 |     (at your option) any later version.
641 | 
642 |     This program is distributed in the hope that it will be useful,
643 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
644 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
645 |     GNU General Public License for more details.
646 | 
647 |     You should have received a copy of the GNU General Public License
648 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
649 | 
650 | Also add information on how to contact you by electronic and paper mail.
651 | 
652 |   If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 |     <program>  Copyright (C) <year>  <name of author>
656 |     This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 |     This is free software, and you are welcome to redistribute it
658 |     under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License.  Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 |   You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 | 
669 |   The GNU General Public License does not permit incorporating your program
670 | into proprietary programs.  If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library.  If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License.  But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 | 


--------------------------------------------------------------------------------