├── .gitignore ├── Makefile ├── README.md ├── algo-common.hpp ├── basic-impls.cpp ├── basic-impls.hpp ├── catch.hpp ├── clock └── clock.h ├── common-cxx.hpp ├── cpuid.cpp ├── cpuid.hpp ├── cycle-timer.c ├── cycle-timer.h ├── dbg.h ├── env.hpp ├── exact-int └── exact-int.h ├── generate-event-code.py ├── hedley.h ├── impl-list.cpp ├── impl-list.hpp ├── jevents ├── Makefile ├── README.md ├── cache.c ├── cpustr.c ├── event-rmap.c ├── examples │ ├── Makefile │ ├── addr.c │ ├── cpu.c │ ├── cpu.h │ ├── hist.cc │ ├── hist.h │ ├── jestat.c │ ├── rtest.c │ ├── rtest2.c │ └── rtest3.c ├── interrupts.c ├── interrupts.h ├── jevents-internal.h ├── jevents.c ├── jevents.h ├── jsession.h ├── jsmn.c ├── jsmn.h ├── json.c ├── json.h ├── libjevents.spec ├── listevents.c ├── measure.c ├── measure.h ├── perf-iter.c ├── perf-iter.h ├── perf_event_open.c ├── rawevent.c ├── rdpmc.c ├── rdpmc.h ├── resolve.c ├── session.c ├── showevent.c └── util.h ├── main-test.cpp ├── main.cpp ├── misc.cpp ├── misc.hpp ├── msr-access.c ├── msr-access.h ├── nonstd ├── LICENSE.txt └── span.hpp ├── opt-control.h ├── perf-timer-events.cpp ├── perf-timer-events.hpp ├── perf-timer.cpp ├── perf-timer.hpp ├── scripts ├── common.sh ├── data.sh ├── plot-csv.py └── plots.sh ├── tsc-support.cpp ├── tsc-support.hpp ├── unit-test.cpp ├── util.hpp └── voltmon.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | # by default exclude anythign in jevents without an extension since it's 2 | # probably a binary file 3 | /jevents/** 4 | !/jevents/**/ 5 | !/jevents/**/*.* 6 | !/jevents/**/Makefile 7 | 8 | /.* 9 | !.gitignore 10 | *.o 11 | *.a 12 | *.log 13 | __pycache__ 14 | /tmp/* 15 | *.d 16 | /out* 17 | 18 | # exes 19 | /test 20 | /bench 21 | /voltmon 22 | 23 | perf.data 24 | perf.data.* 25 | 26 | /results 27 | /temp 28 | -------------------------------------------------------------------------------- /Makefile: 
-------------------------------------------------------------------------------- 1 | STD ?= c++14 2 | ARCH := haswell 3 | 4 | ifeq ($(DEBUG), 1) 5 | OPT := -Og 6 | else 7 | CPPFLAGS += -DNDEBUG 8 | OPT := -O2 9 | endif 10 | 11 | CPPFLAGS += -MMD -Wall -Wextra -Werror $(OPT) -g -march=$(ARCH) -Wno-unused-parameter $(CPPEXTRA) \ 12 | -Wno-error=unused-variable \ 13 | -Wno-unknown-pragmas \ 14 | # -funroll-loops 15 | 16 | CXXFLAGS += -std=$(STD) -DENABLE_TIMER=1 17 | 18 | # uncomment to use the fast gold linker 19 | # LDFLAGS = -use-ld=gold 20 | 21 | 22 | TARGETS := bench test voltmon 23 | MAINOS := main.o main-test.o voltmon.o 24 | 25 | TESTSRCS:= $(wildcard *-test.c *-test.cpp) 26 | TESTOBJS:= $(patsubst %.c,%.o,$(TESTSRCS)) 27 | TESTOBJS:= $(patsubst %.cpp,%.o,$(TESTOBJS)) 28 | 29 | SRCS := $(wildcard *.c *.cpp xxHash/xxhash.c) 30 | OBJECTS := $(patsubst %.c,%.o,$(SRCS)) 31 | OBJECTS := $(patsubst %.cpp,%.o,$(OBJECTS)) 32 | ALLOBJS := $(OBJECTS) 33 | OBJECTS := $(filter-out $(MAINOS) $(TESTOBJS), $(OBJECTS)) 34 | 35 | DEPS := $(ALLOBJS:%.o=%.d) 36 | 37 | # $(info $$DEPS is [${DEPS}]) 38 | # $(info $$TESTOBJS is [${TESTOBJS}]) 39 | # $(info $$OBJECTS is [${OBJECTS}]) 40 | 41 | JE_LIB := jevents/libjevents.a 42 | JE_SRC := $(wildcard jevents/*.c jevents/*.h) 43 | 44 | LDFLAGS := -lm 45 | 46 | ifneq ($(LD), ld) 47 | LDFLAGS += $(if $(LD),-fuse-ld=$(LD)) 48 | endif 49 | 50 | ifeq ($(ASAN), 1) 51 | CPPFLAGS += -fsanitize=address -fno-omit-frame-pointer 52 | LDFLAGS += -lasan 53 | endif 54 | 55 | default: $(TARGETS) 56 | 57 | -include $(DEPS) 58 | 59 | bench : $(OBJECTS) main.o 60 | 61 | voltmon : voltmon.o msr-access.o 62 | 63 | test : $(OBJECTS) $(TESTOBJS) 64 | 65 | $(TARGETS) : $(JE_LIB) 66 | $(CXX) $(CFLAGS) $(CPPFLAGS) $^ $(LDFLAGS) -o $@ $(JE_LIB) 67 | 68 | %.o: %.c 69 | $(CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $< 70 | 71 | %.o: %.cpp 72 | $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CXXEXTRA) -c -o $@ $< 73 | 74 | $(JE_LIB): $(JE_SRC) 75 | cd jevents && $(MAKE) MAKEFLAGS= 76 | 
77 | clean: 78 | rm -f $(TARGETS) 79 | rm -f *.o *.d 80 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is the supporting benchmark for the blog post [_Gathering Intel on Intel AVX-512 Transitions_](https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html). 2 | 3 | This benchmark relies heavily on the [`perf_events`](http://man7.org/linux/man-pages/man2/perf_event_open.2.html) subsystem and so only runs on Linux. 4 | 5 | ## Build 6 | 7 | This project provides a makefile and building should be as simple as: 8 | 9 | make 10 | 11 | ## Checking perf_event_paranoid 12 | 13 | To run these tests you need a `/proc/sys/kernel/perf_event_paranoid` setting of 1 or less, or to be running as root. Many modern systems set this to 3, a very conservative setting. If the value on your system is greater than 1, you can either set it (until reboot) to 1 like so: 14 | 15 | echo 1 | sudo tee /proc/sys/kernel/perf_event_paranoid 16 | 17 | Or you can run the tests as root (e.g., `sudo ./bench`). 18 | 19 | ## Running 20 | 21 | The benchmark includes several tests. You can run them all like this (this will crash pretty quickly on platforms without AVX-512): 22 | 23 | ./bench 24 | 25 | You can run a specific test by providing its name: 26 | 27 | ./bench vporymm 28 | 29 | You can list the available tests: 30 | 31 | ./bench list tests 32 | 33 | 34 | ## Generating Results 35 | 36 | You can run all the tests required for generating the data used in the post using the `scripts/data.sh` script. First you should set your TSC (time stamp counter as read by `rdtsc`) frequency as the `MHZ` variable in the environment. 
The benchmark application detects the TSC frequency itself; you can check the value on your system by running: 37 | 38 | ./bench dummy |& grep 'tsc freq' 39 | 40 | Set that value as `MHZ` in your environment, like so: 41 | 42 | export MHZ=3200 43 | 44 | Next, choose a prefix for your result files to identify them; this defaults to `test` if you don't set one: 45 | 46 | export PREFIX=blah 47 | # this second prefix is used specifically for the voltage-related tests 48 | export PREFIXV=blah 49 | 50 | Then you can collect most of the data like this: 51 | 52 | scripts/data.sh 53 | 54 | This generates the results in the `./results` dir by default. This generates all the results _except_ for those related to voltage. To generate those, you need to jump through a few hoops, because voltage readings require MSR access. First, you must have `msr-tools` installed, however that works on your platform (e.g., `apt install msr-tools` on Ubuntu). Then you can either run data.sh as root, like so: 55 | 56 | sudo MHZ=$MHZ DO_VOLTS=1 scripts/data.sh 57 | 58 | The `DO_VOLTS=1` indicates to the script that it should run only the voltage tests. This causes the files written to `./results` to be owned by root (unless the files already existed with different ownership), so you should `chown` them to your current user after. 
You can also run this as a non-root user but you have to jump through some hoops (this is probably not entirely safe, certainly not on any host with untrusted users or code): 59 | 60 | ~~~ 61 | # set the special msr files world readable 62 | sudo chmod a+r /dev/cpu/*/msr 63 | # give the bench binary the caps needed to read the msr files (not needed on all systems) 64 | sudo setcap cap_sys_rawio=ep ./bench 65 | # run the test as non-root 66 | DO_VOLTS=1 scripts/data.sh 67 | ~~~ 68 | 69 | ## Generating Plots 70 | 71 | Once the results are generated, you can generate the plots with: 72 | 73 | scripts/plots.sh 74 | 75 | That will show the plots, one by one, in matplotlib's interactive viewer. To write them out to SVG files, just provide the `OUTDIR` environment variable: 76 | 77 | ~~~ 78 | mkdir plots 79 | OUTDIR=./plots scripts/plots.sh 80 | ~~~ 81 | 82 | The script expects all the result files to exist; feel free to comment out some lines if you've just generated partial results. You can also pass the script a regex and it will only generate plots whose title matches the regex. For example, the following generates only the 3 voltage plots: 83 | 84 | ./scripts/plots.sh Voltage 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /algo-common.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ALGO_COMMON_H_ 2 | #define ALGO_COMMON_H_ 3 | 4 | #include "hedley.h" 5 | 6 | #include 7 | 8 | #endif 9 | 10 | -------------------------------------------------------------------------------- /basic-impls.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Implementation of various very basic algorithms, mostly as a litmus test for the more complicated ones. 
/*
 * Single-instruction benchmark payloads.
 *
 * Each function emits exactly one bitwise OR of vector register 0 with
 * itself, at 128-bit (xmm), 256-bit (ymm) or 512-bit (zmm) width; the zmm
 * form uses the vpord mnemonic. The bench_args parameter is not used.
 *
 * NOTE(review): these are basic asm statements, so no clobbers are declared
 * for the registers the instructions write - presumably acceptable because
 * the surrounding functions do nothing else; confirm.
 */
void vporxmm(bench_args args) {
    asm volatile ("vpor %xmm0, %xmm0, %xmm0\n");
}

void vporymm(bench_args args) {
    asm volatile ("vpor %ymm0, %ymm0, %ymm0\n");
}

void vporzmm(bench_args args) {
    asm volatile ("vpord %zmm0, %zmm0, %zmm0\n");
}

/*
 * "_vz" variants: the same single OR as above, immediately followed by a
 * vzeroupper instruction.
 */
void vporxmm_vz(bench_args args) {
    asm volatile (
        "vpor %xmm0, %xmm0, %xmm0\n"
        "vzeroupper\n"
    );
}

void vporymm_vz(bench_args args) {
    asm volatile (
        "vpor %ymm0, %ymm0, %ymm0\n"
        "vzeroupper\n"
    );
}

void vporzmm_vz(bench_args args) {
    asm volatile (
        "vpord %zmm0, %zmm0, %zmm0\n"
        "vzeroupper\n"
    );
}

/**
 * 1000 copies of instr
 *
 * MAKE_MANY(name, instr, regd, regs) defines a function called name##_vz100
 * whose body is the instruction
 *
 *     instr %regs, %regs, %regd
 *
 * repeated 1000 times by the assembler's .rept directive, followed by a
 * single vzeroupper. Both source operands are regs; regd is the last
 * operand of the emitted instruction.
 *
 * Note that the generated function name ends in _vz100 even though the
 * repeat count is 1000.
 */
#define MAKE_MANY(name,instr,regd,regs)     \
void name##_vz100(bench_args args) {  \
    asm volatile (                    \
    ".rept 1000\n\t"                  \
    #instr " %" #regs ", %" #regs ", %" #regd "\n\t"  \
    ".endr\n\t"                       \
    "vzeroupper\n\t"                  \
    );                                \
}

// latency
// (regd == regs: every instruction reads the register the previous one wrote)
MAKE_MANY(vporxmm, vpor, xmm0, xmm0)
MAKE_MANY(vporymm, vpor, ymm0, ymm0)
MAKE_MANY(vporzmm, vpord, zmm0, zmm0)

// throughput
// (sources are register 1, the last operand is register 0: no instruction
// reads a register written by an earlier one)
MAKE_MANY(vporxmm_tput, vpor, xmm0, xmm1)
MAKE_MANY(vporymm_tput, vpor, ymm0, ymm1)
MAKE_MANY(vporzmm_tput, vpord, zmm0, zmm1)

// vpermd
// (only zmm variants are instantiated here)
MAKE_MANY(vpermdzmm     , vpermd, zmm0, zmm0)
MAKE_MANY(vpermdzmm_tput, vpermd, zmm0, zmm1)
#regd2 "\n\t" \ 84 | ".endr\n\t" \ 85 | "vzeroupper\n\t" \ 86 | ); \ 87 | } 88 | 89 | // MAKE_MANY250(vporxymm250, vpor , vpor, ymm0, ymm0, xmm0, xmm0); 90 | // MAKE_MANY250(vporyzmm250, vpord, vpor, zmm0, zmm0, ymm0, ymm0); 91 | 92 | #define MAKE_MANY3(name,instr1,instr2,instr3) \ 93 | void name(bench_args args) { \ 94 | asm volatile ( \ 95 | ".rept 250\n\t" \ 96 | instr1 \ 97 | instr2 \ 98 | instr3 \ 99 | ".endr\n\t" \ 100 | ); \ 101 | } 102 | 103 | MAKE_MANY3(vporxymm250, \ 104 | "vpor %ymm0, %ymm0, %ymm0\n\t", 105 | "vmovd %xmm0, %eax\n\t", 106 | "vmovd %eax, %xmm0\n\t"); 107 | 108 | MAKE_MANY3(vporyzmm250, \ 109 | "vpor %xmm0, %xmm0, %xmm0\n\t", 110 | "vmovd %xmm0, %eax\n\t", 111 | "vmovd %eax, %xmm0\n\t"); 112 | 113 | #define MAKE250(name,rep,instr) \ 114 | void name##_##rep(bench_args args) { \ 115 | asm volatile ( \ 116 | "vzeroupper\n\t" \ 117 | ".rept 10\n\t" \ 118 | "vpor %ymm0, %ymm0, %ymm0\n\t" \ 119 | ".rept " #rep "\n\t" \ 120 | instr \ 121 | ".endr\n\t" \ 122 | ".endr\n\t" \ 123 | ); \ 124 | } \ 125 | 126 | #define MAKE250ADD(rep) MAKE250(vporxymm250, rep, "addl $0, %eax\n\t") 127 | 128 | ALL_RATIOS_X(MAKE250ADD) 129 | 130 | MAKE250(mulxymm250, 10, "imull $0, %eax, %eax\n\t") 131 | 132 | 133 | void dummy(bench_args args) {} 134 | -------------------------------------------------------------------------------- /basic-impls.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BASIC_IMPLS_H_ 2 | #define BASIC_IMPLS_H_ 3 | 4 | #include "common-cxx.hpp" 5 | 6 | bench_fn dummy; 7 | 8 | bench_fn vporxmm; 9 | bench_fn vporymm; 10 | bench_fn vporzmm; 11 | 12 | bench_fn vporxmm_vz; 13 | bench_fn vporymm_vz; 14 | bench_fn vporzmm_vz; 15 | 16 | bench_fn vporxmm_vz100; 17 | bench_fn vporymm_vz100; 18 | bench_fn vporzmm_vz100; 19 | 20 | bench_fn vporxmm_tput_vz100; 21 | bench_fn vporymm_tput_vz100; 22 | bench_fn vporzmm_tput_vz100; 23 | 24 | bench_fn vpermdxmm_vz100; 25 | bench_fn vpermdymm_vz100; 26 | bench_fn 
vpermdzmm_vz100; 27 | 28 | bench_fn vpermdxmm_tput_vz100; 29 | bench_fn vpermdymm_tput_vz100; 30 | bench_fn vpermdzmm_tput_vz100; 31 | 32 | bench_fn vporxymm250; 33 | bench_fn vporyzmm250; 34 | 35 | #define DEFINE250(rep) bench_fn vporxymm250_##rep; 36 | 37 | ALL_RATIOS_X(DEFINE250) 38 | 39 | bench_fn mulxymm250_10; 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /common-cxx.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SI_COMMON_CXX_H_ 2 | #define SI_COMMON_CXX_H_ 3 | 4 | /* to align with C++20 std::span */ 5 | #define span_CONFIG_INDEX_TYPE size_t 6 | 7 | #include // needed by span.hpp 8 | 9 | #include "hedley.h" 10 | #include "inttypes.h" 11 | #include "nonstd/span.hpp" 12 | 13 | #include 14 | 15 | using char_span = nonstd::span; 16 | 17 | /** 18 | * Bundles all the arguments. 19 | */ 20 | struct bench_args {}; 21 | 22 | using bench_fn = void (bench_args args); 23 | 24 | #define ALL_RATIOS_X(f) \ 25 | f(1) \ 26 | f(2) \ 27 | f(3) \ 28 | f(4) \ 29 | f(5) \ 30 | f(6) \ 31 | f(7) \ 32 | f(8) \ 33 | f(9) \ 34 | f(10) \ 35 | f(20) \ 36 | f(30) \ 37 | f(40) \ 38 | f(50) \ 39 | f(60) \ 40 | f(70) \ 41 | f(80) \ 42 | f(90) \ 43 | f(100) \ 44 | f(120) \ 45 | f(140) \ 46 | f(160) \ 47 | f(180) \ 48 | f(200) \ 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /cpuid.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * cpuid.cpp 3 | */ 4 | 5 | #include "cpuid.hpp" 6 | 7 | #include 8 | 9 | using std::uint8_t; 10 | using std::uint32_t; 11 | 12 | 13 | std::string cpuid_result::to_string() { 14 | std::string s; 15 | s += "eax = " + std::to_string(eax) + ", "; 16 | s += "ebx = " + std::to_string(ebx) + ", "; 17 | s += "ecx = " + std::to_string(ecx) + ", "; 18 | s += "edx = " + std::to_string(edx); 19 | return s; 20 | } 21 | 22 | uint32_t cpuid_highest_leaf_inner() { 23 | 
return cpuid(0).eax; 24 | } 25 | 26 | uint32_t cpuid_highest_leaf() { 27 | static uint32_t cached = cpuid_highest_leaf_inner(); 28 | return cached; 29 | } 30 | 31 | cpuid_result cpuid(int leaf, int subleaf) { 32 | cpuid_result ret = {}; 33 | asm ("cpuid" 34 | : 35 | "=a" (ret.eax), 36 | "=b" (ret.ebx), 37 | "=c" (ret.ecx), 38 | "=d" (ret.edx) 39 | : 40 | "a" (leaf), 41 | "c" (subleaf) 42 | ); 43 | return ret; 44 | } 45 | 46 | cpuid_result cpuid(int leaf) { 47 | return cpuid(leaf, 0); 48 | } 49 | 50 | family_model gfm_inner() { 51 | auto cpuid1 = cpuid(1); 52 | family_model ret; 53 | ret.family = (cpuid1.eax >> 8) & 0xF; 54 | ret.model = (cpuid1.eax >> 4) & 0xF; 55 | ret.stepping = (cpuid1.eax ) & 0xF; 56 | if (ret.family == 15) { 57 | ret.family += (cpuid1.eax >> 20) & 0xFF; // extended family 58 | } 59 | if (ret.family == 15 || ret.family == 6) { 60 | ret.model += ((cpuid1.eax >> 16) & 0xF) << 4; // extended model 61 | } 62 | return ret; 63 | } 64 | 65 | family_model get_family_model() { 66 | static family_model cached_family_model = gfm_inner(); 67 | return cached_family_model; 68 | } 69 | 70 | std::string get_brand_string() { 71 | auto check = cpuid(0x80000000); 72 | if (check.eax < 0x80000004) { 73 | return std::string("unkown (eax =") + std::to_string(check.eax) +")"; 74 | } 75 | std::string ret; 76 | for (uint32_t eax : {0x80000002, 0x80000003, 0x80000004}) { 77 | char buf[17]; 78 | auto fourchars = cpuid(eax); 79 | memcpy(buf + 0, &fourchars.eax, 4); 80 | memcpy(buf + 4, &fourchars.ebx, 4); 81 | memcpy(buf + 8, &fourchars.ecx, 4); 82 | memcpy(buf + 12, &fourchars.edx, 4); 83 | buf[16] = '\0'; 84 | ret += buf; 85 | } 86 | return ret; 87 | } 88 | 89 | /* get bits [start:end] inclusive of the given value */ 90 | uint32_t get_bits(uint32_t value, int start, int end) { 91 | value >>= start; 92 | uint32_t mask = ((uint64_t)-1) << (end - start + 1); 93 | return value & ~mask; 94 | } 95 | 96 | /** 97 | * Get the shift amount for unique physical core IDs 98 | */ 99 | 
int get_smt_shift() 100 | { 101 | if (cpuid_highest_leaf() < 0xb) { 102 | return -1; 103 | } 104 | uint32_t smtShift = -1u; 105 | for (uint32_t subleaf = 0; ; subleaf++) { 106 | cpuid_result leafb = cpuid(0xb, subleaf); 107 | uint32_t type = get_bits(leafb.ecx, 8 ,15); 108 | if (!get_bits(leafb.ebx,0,15) || type == 0) { 109 | // done 110 | break; 111 | } 112 | if (type == 1) { 113 | // here's the value we are after: make sure we don't have more than one entry for 114 | // this type though! 115 | if (smtShift != -1u) { 116 | fprintf(stderr, "Warning: more than one level of type 1 in the x2APIC hierarchy"); 117 | } 118 | smtShift = get_bits(leafb.eax, 0, 4); 119 | } 120 | } 121 | return smtShift; 122 | } 123 | 124 | -------------------------------------------------------------------------------- /cpuid.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * cpuid.hpp 3 | */ 4 | 5 | #ifndef CPUID_HPP_ 6 | #define CPUID_HPP_ 7 | 8 | #include 9 | #include 10 | 11 | struct cpuid_result { 12 | std::uint32_t eax, ebx, ecx, edx; 13 | std::string to_string(); 14 | }; 15 | 16 | struct family_model { 17 | uint8_t family; 18 | uint8_t model; 19 | uint8_t stepping; 20 | std::string to_string() { 21 | std::string s; 22 | s += "family = " + std::to_string(family) + ", "; 23 | s += "model = " + std::to_string(model) + ", "; 24 | s += "stepping = " + std::to_string(stepping); 25 | return s; 26 | } 27 | }; 28 | 29 | 30 | /** the highest supported leaf value */ 31 | uint32_t cpuid_highest_leaf(); 32 | 33 | /* return the CPUID result for querying the given leaf (EAX) and no subleaf (ECX=0) */ 34 | cpuid_result cpuid(int leaf); 35 | 36 | /* return the CPUID result for querying the given leaf (EAX) and subleaf (ECX) */ 37 | cpuid_result cpuid(int leaf, int subleaf); 38 | 39 | family_model get_family_model(); 40 | 41 | std::string get_brand_string(); 42 | 43 | int get_smt_shift(); 44 | 45 | /* get bits [start:end] inclusive of the given value */ 
/*
 * qsort() comparator that orders signed 64-bit integers ascending.
 *
 * l_ and r_ each point at an int64_t (the element type of the results[]
 * array this is used to sort). The elements are read with that same signed
 * type rather than through an unsigned pointer.
 *
 * Returns -1, 0 or 1 when *l_ is less than, equal to, or greater than *r_.
 * The (l > r) - (l < r) form avoids the overflow a plain subtraction of two
 * 64-bit values could produce.
 */
int intcompare(const void *l_, const void *r_) {
    const int64_t l = *(const int64_t *)l_;
    const int64_t r = *(const int64_t *)r_;
    return (l > r) - (l < r);
}
46 | */ 47 | HEDLEY_NEVER_INLINE 48 | static double get_ghz(bool print) { 49 | 50 | const char *force = getenv("CYCLE_TIMER_FORCE_MHZ"); 51 | if (force) { 52 | int mhz = atoi(force); 53 | if (mhz) { 54 | double ghz = mhz / 1000.; 55 | if (print) fprintf(stderr, "Forced CPU speed (CYCLE_TIMER_FORCE_MHZ): %5.2f GHz\n", ghz); 56 | return ghz; 57 | } else { 58 | if (print) fprintf(stderr, "Bad value for CYCLE_TIMER_FORCE_MHZ: '%s' (falling back to cal loop)\n", force); 59 | } 60 | } 61 | 62 | int64_t results[TRIES]; 63 | 64 | for (size_t w = 0; w < WARMUP + 1; w++) { 65 | for (size_t r = 0; r < TRIES; r++) { 66 | cl_timepoint t0 = cl_now(); 67 | store_calibration(ITERS); 68 | cl_timepoint t1 = cl_now(); 69 | store_calibration(ITERS * 2); 70 | cl_timepoint t2 = cl_now(); 71 | results[r] = cl_delta(t1, t2).nanos - cl_delta(t0, t1).nanos; 72 | } 73 | } 74 | 75 | // return the median value 76 | qsort(results, TRIES, sizeof(results[0]), intcompare); 77 | double ghz = ((double)ITERS / results[TRIES/2]); 78 | if (print) fprintf(stderr, "Estimated CPU speed: %5.2f GHz\n", ghz); 79 | return ghz; 80 | } 81 | 82 | static bool is_init = false; 83 | double ghz; 84 | 85 | void cl_init(bool print) { 86 | if (HEDLEY_UNLIKELY(!is_init)) { 87 | ghz = get_ghz(print); 88 | is_init = true; 89 | } 90 | }; 91 | 92 | cl_timepoint cl_now() { 93 | struct PsnipClockTimespec spec; 94 | if (psnip_clock_monotonic_get_time(&spec)) { 95 | return (cl_timepoint){0}; 96 | } else { 97 | return (cl_timepoint){spec.seconds * 1000000000ll + spec.nanoseconds}; 98 | } 99 | } 100 | 101 | /* 102 | * Take an interval value and convert it to cycles based on the 103 | * detected frequency of this host. 104 | */ 105 | double cl_to_cycles(cl_interval interval) { 106 | cl_init(false); 107 | return interval.nanos * ghz; 108 | } 109 | 110 | /* 111 | * Take an interval value and "convert" it to nanos. 
112 | */ 113 | double cl_to_nanos(cl_interval interval) { 114 | return interval.nanos; 115 | } 116 | -------------------------------------------------------------------------------- /cycle-timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * cycle-timer.h 3 | * 4 | * A timer that returns results in CPU cycles in addition to nanoseconds. 5 | * It measures cycles indirectly by measuring the wall-time, and then converting 6 | * that to a cycle count based on a calibration loop performed once at startup. 7 | */ 8 | 9 | #ifndef CYCLE_TIMER_H_ 10 | #define CYCLE_TIMER_H_ 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | #include 17 | #include 18 | 19 | /** 20 | * A point in time, or an interval when subtracted. You should probably 21 | * treat this as an opaque struct, in case I change the implementation 22 | * someday. 23 | */ 24 | struct cl_timepoint_ { 25 | int64_t nanos; 26 | }; 27 | typedef struct cl_timepoint_ cl_timepoint; 28 | 29 | /** 30 | * An interval created by subtracting two points in time, measured 31 | * in nanoseconds. 32 | */ 33 | struct cl_interval_ { 34 | int64_t nanos; 35 | }; 36 | typedef struct cl_interval_ cl_interval; 37 | 38 | /* return the current moment in time as a cycletimer_result */ 39 | cl_timepoint cl_now(); 40 | 41 | /* 42 | * Return the interval between timepoints first and second. 43 | * This value is positive iff second occus after first. 44 | */ 45 | static inline cl_interval cl_delta(cl_timepoint first, cl_timepoint second) { 46 | return (cl_interval){second.nanos - first.nanos}; 47 | } 48 | 49 | /* 50 | * Take an interval value and convert it to cycles based on the 51 | * detected frequency of this host. 52 | */ 53 | double cl_to_cycles(cl_interval interval); 54 | 55 | double cl_to_nanos(cl_interval interval); 56 | 57 | /* 58 | * Initialize the cycletimer infrastructure. 
Mostly this just means calculating 59 | * the cycle to nanoseconds value (i.e., the CPU frequency). You never *need* to 60 | * use this function: if you haven't called it, it will happen automatically when 61 | * init is necessary (usually lazily - when accessing the cl_to_cycles), 62 | * but may be lengthy, so this method is offered so that the user can trigger 63 | * it at a time of their choosing (and allowing the user to elect whether to 64 | * print out diagnostic information about the calibration). 65 | * 66 | * If you pass true for print, diagnostic information like the detected CPU 67 | * frequency is printed to stderr. 68 | */ 69 | void cl_init(bool print); 70 | 71 | #ifdef __cplusplus 72 | } 73 | #endif 74 | 75 | #endif /* CYCLE_TIMER_H_ */ 76 | -------------------------------------------------------------------------------- /dbg.h: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | 3 | dbg(...) macro 4 | 5 | License (MIT): 6 | 7 | Copyright (c) 2019 David Peter 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to 11 | deal in the Software without restriction, including without limitation the 12 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 13 | sell copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in 17 | all copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 22 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | 27 | *****************************************************************************/ 28 | 29 | #ifndef DBG_MACRO_DBG_H 30 | #define DBG_MACRO_DBG_H 31 | 32 | #pragma message("WARNING: the 'dbg.h' header is included in your code base") 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | #include 42 | 43 | #if __cplusplus >= 201703L 44 | #include 45 | #endif 46 | 47 | namespace dbg_macro { 48 | 49 | namespace pretty_function { 50 | 51 | // Compiler-agnostic version of __PRETTY_FUNCTION__ and constants to 52 | // extract the template argument in `type_name_impl` 53 | 54 | #if defined(__clang__) 55 | #define DBG_MACRO_PRETTY_FUNCTION __PRETTY_FUNCTION__ 56 | static constexpr size_t PREFIX_LENGTH = 57 | sizeof("const char *dbg_macro::type_name_impl() [T = ") - 1; 58 | static constexpr size_t SUFFIX_LENGTH = sizeof("]") - 1; 59 | #elif defined(__GNUC__) && !defined(__clang__) 60 | #define DBG_MACRO_PRETTY_FUNCTION __PRETTY_FUNCTION__ 61 | static constexpr size_t PREFIX_LENGTH = 62 | sizeof("const char* dbg_macro::type_name_impl() [with T = ") - 1; 63 | static constexpr size_t SUFFIX_LENGTH = sizeof("]") - 1; 64 | #elif defined(_MSC_VER) 65 | #define DBG_MACRO_PRETTY_FUNCTION __FUNCSIG__ 66 | static constexpr size_t PREFIX_LENGTH = 67 | sizeof("const char *__cdecl dbg_macro::type_name_impl<") - 1; 68 | static constexpr size_t SUFFIX_LENGTH = sizeof(">(void)") - 1; 69 | #else 70 | #error "This compiler is currently not supported by dbg_macro." 
71 | #endif 72 | 73 | } // namespace pretty_function 74 | 75 | // Implementation of 'type_name()' 76 | 77 | template 78 | const char* type_name_impl() { 79 | return DBG_MACRO_PRETTY_FUNCTION; 80 | } 81 | 82 | template 83 | struct type_tag {}; 84 | 85 | template 86 | std::string get_type_name(type_tag) { 87 | namespace pf = pretty_function; 88 | 89 | std::string type = type_name_impl(); 90 | return type.substr(pf::PREFIX_LENGTH, 91 | type.size() - pf::PREFIX_LENGTH - pf::SUFFIX_LENGTH); 92 | } 93 | 94 | template 95 | std::string type_name() { 96 | if (std::is_volatile::value) { 97 | if (std::is_pointer::value) { 98 | return type_name::type>() + " volatile"; 99 | } else { 100 | return "volatile " + type_name::type>(); 101 | } 102 | } 103 | if (std::is_const::value) { 104 | if (std::is_pointer::value) { 105 | return type_name::type>() + " const"; 106 | } else { 107 | return "const " + type_name::type>(); 108 | } 109 | } 110 | if (std::is_pointer::value) { 111 | return type_name::type>() + "*"; 112 | } 113 | if (std::is_lvalue_reference::value) { 114 | return type_name::type>() + "&"; 115 | } 116 | if (std::is_rvalue_reference::value) { 117 | return type_name::type>() + "&&"; 118 | } 119 | return get_type_name(type_tag{}); 120 | } 121 | 122 | inline std::string get_type_name(type_tag) { 123 | return "short"; 124 | } 125 | 126 | inline std::string get_type_name(type_tag) { 127 | return "unsigned short"; 128 | } 129 | 130 | inline std::string get_type_name(type_tag) { 131 | return "long"; 132 | } 133 | 134 | inline std::string get_type_name(type_tag) { 135 | return "unsigned long"; 136 | } 137 | 138 | inline std::string get_type_name(type_tag) { 139 | return "std::string"; 140 | } 141 | 142 | template 143 | std::string get_type_name(type_tag>>) { 144 | return "std::vector<" + type_name() + ">"; 145 | } 146 | 147 | // Implementation of 'is_detected' to specialize for container-like types 148 | 149 | namespace detail_detector { 150 | 151 | struct nonesuch { 152 | 
nonesuch() = delete; 153 | ~nonesuch() = delete; 154 | nonesuch(nonesuch const&) = delete; 155 | void operator=(nonesuch const&) = delete; 156 | }; 157 | 158 | template 159 | using void_t = void; 160 | 161 | template 164 | class Op, 165 | class... Args> 166 | struct detector { 167 | using value_t = std::false_type; 168 | using type = Default; 169 | }; 170 | 171 | template class Op, class... Args> 172 | struct detector>, Op, Args...> { 173 | using value_t = std::true_type; 174 | using type = Op; 175 | }; 176 | 177 | } // namespace detail_detector 178 | 179 | template