├── results ├── .gitignore ├── rob_size │ ├── amd_zen_1.png │ ├── amd_zen_2.png │ ├── ibm_power_8.png │ ├── arm_cortex_a72.png │ ├── arm_neoverse_n1.png │ ├── intel_broadwell.png │ ├── intel_cascade_lake.png │ ├── intel_ivy_bridge_ep.png │ └── README.md └── instruction_latency │ ├── intel_broadwell.csv │ ├── loongson_3a6000.csv │ ├── loongson_3c5000.csv │ └── huawei_kunpeng920.csv ├── .clangd ├── agner ├── testp │ ├── TestScripts │ │ ├── MSRdrvL.h │ │ ├── PMCTestLinux.h │ │ ├── PMCTestB32.nasm │ │ ├── my_branch5.sh2 │ │ ├── my_phr_length.sh2 │ │ ├── my_branch.sh2 │ │ ├── init64.sh │ │ ├── pack_results.sh │ │ ├── my_branch2.sh2 │ │ ├── my_branch4.sh2 │ │ ├── my_branch3.sh2 │ │ ├── a64.sh │ │ ├── c64.sh │ │ ├── a32.sh │ │ ├── c32.sh │ │ ├── allsh2.sh │ │ ├── allcsv.sh │ │ ├── my_phr_length.inc │ │ ├── testmemcpyalign.sh2 │ │ ├── allsh1.sh │ │ ├── returnstack.sh2 │ │ ├── instruct_boundaries.inc │ │ ├── my_branch5.py │ │ ├── my_phr_length.py │ │ ├── instruct_boundaries.sh2 │ │ ├── memcpy.inc │ │ ├── alltests.sh │ │ ├── cache_banks.inc │ │ ├── stack_sync_uops.sh2 │ │ ├── my_branch2.py │ │ ├── pointer_chasing.inc │ │ ├── my_branch4.py │ │ ├── returnstack.inc │ │ ├── out_of_order.sh2 │ │ ├── unaligned_mem.sh2 │ │ ├── latencycf.inc │ │ ├── my_branch.inc │ │ ├── 32bitinstr.sh1 │ │ ├── unaligned_mem.inc │ │ ├── MSRDriver.h │ │ ├── my_branch.py │ │ ├── jmp.sh2 │ │ ├── my_branch2.inc │ │ ├── my_branch4.inc │ │ ├── my_branch5.inc │ │ ├── ucache_misprediction.sh2 │ │ ├── out_of_order.inc │ │ ├── read_write_bandwidth.sh2 │ │ ├── my_branch3.inc │ │ ├── fused_branch.sh2 │ │ ├── warmup_fp.inc │ │ ├── ucache_misprediction.inc │ │ ├── length_chg_prefix.sh2 │ │ ├── 32bitinstr.inc │ │ ├── my_branch3.py │ │ ├── loop_buffer.sh2 │ │ ├── mul.sh1 │ │ ├── pushpop.sh1 │ │ ├── shift.inc │ │ ├── stack_sync_uops.inc │ │ ├── daxpy.sh2 │ │ └── read_write_bandwidth.inc │ ├── testp.pdf │ ├── PMCTest │ │ ├── MSRdrvL.h │ │ ├── PMCTestA.cpp │ │ ├── PMCTestB.cpp │ │ ├── PMCTestB32.asm │ │ ├── 
PMCTestB32.nasm │ │ ├── PMCTestB64.asm │ │ ├── PMCTestLinux.h │ │ ├── uninstall.cpp │ │ ├── stopcounters.bat │ │ ├── startcounters.bat │ │ ├── a64.sh │ │ ├── c64.sh │ │ ├── a32.sh │ │ ├── c32.sh │ │ ├── make_a_obj.bat │ │ ├── MSRDriver.h │ │ └── timingtest.h │ └── DriverSrcLinux │ │ ├── MSRdrv.c │ │ ├── uninstall.sh │ │ ├── MSRdrv1.c │ │ ├── MSRdrvL.h │ │ ├── install1.sh │ │ ├── install.sh │ │ ├── Makefile │ │ ├── DriverSrcLinux.txt │ │ └── MSRDriver.h ├── NOTES.md └── .gitignore ├── reports └── dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon │ ├── .gitignore │ ├── plot_pht_tag_bits_xor_oryon.png │ ├── plot_phr_target_bits_location.png │ ├── plot_pht_associativity_oryon.png │ ├── plot_pht_associativity_firestorm.png │ ├── plot_pht_tag_bits_xor_firestorm.png │ ├── plot_pht_tag_bits_xor_phr_oryon.png │ ├── plot_phr_branch_bits_location_oryon.png │ ├── plot_pht_tag_bits_xor_phr_firestorm.png │ ├── plot_phr_branch_bits_location_firestorm.png │ ├── plot_pht_index_bits_xor_phrt_pc_1st_oryon.png │ ├── plot_pht_index_bits_xor_phrt_pc_2nd_oryon.png │ ├── plot_pht_index_bits_xor_phrb_pc_3rd_firestorm.png │ ├── plot_pht_index_bits_xor_phrb_phrb_3rd_oryon.png │ ├── plot_pht_index_bits_xor_phrb_phrb_4th_oryon.png │ ├── plot_pht_index_bits_xor_phrb_phrt_3rd_oryon.png │ ├── plot_pht_index_bits_xor_phrb_phrt_4th_oryon.png │ ├── plot_pht_index_bits_xor_phrt_pc_1st_firestorm.png │ ├── plot_pht_index_bits_xor_phrt_pc_2nd_firestorm.png │ ├── plot_pht_index_bits_xor_phrt_pc_3rd_firestorm.png │ ├── plot_pht_index_bits_xor_phrt_pc_3rd_pc8_oryon.png │ ├── plot_pht_index_bits_xor_phrt_phrb_1st_oryon.png │ ├── plot_pht_index_bits_xor_phrt_phrb_2nd_oryon.png │ ├── plot_pht_index_bits_xor_phrt_phrb_3rd_oryon.png │ ├── plot_pht_index_bits_xor_phrt_phrb_4th_oryon.png │ ├── plot_pht_index_bits_xor_phrt_phrt_1st_oryon.png │ ├── plot_pht_index_bits_xor_phrt_phrt_2nd_oryon.png │ ├── plot_pht_index_bits_xor_phrt_phrt_3rd_oryon.png │ ├── plot_pht_index_bits_xor_phrt_phrt_4th_oryon.png │ ├── 
plot_pht_index_bits_xor_phrb_phrb_3rd_firestorm.png │ ├── plot_pht_index_bits_xor_phrb_phrb_4th_firestorm.png │ ├── plot_pht_index_bits_xor_phrb_phrb_5th_firestorm.png │ ├── plot_pht_index_bits_xor_phrb_phrt_4th_firestorm.png │ ├── plot_pht_index_bits_xor_phrb_phrt_5th_firestorm.png │ ├── plot_pht_index_bits_xor_phrt_pc_3rd_pc10_oryon.png │ ├── plot_pht_index_bits_xor_phrt_phrb_1st_firestorm.png │ ├── plot_pht_index_bits_xor_phrt_phrb_2nd_firestorm.png │ ├── plot_pht_index_bits_xor_phrt_phrb_3rd_firestorm.png │ ├── plot_pht_index_bits_xor_phrt_phrb_4th_firestorm.png │ ├── plot_pht_index_bits_xor_phrt_phrb_5th_firestorm.png │ ├── plot_pht_index_bits_xor_phrt_phrt_1st_firestorm.png │ ├── plot_pht_index_bits_xor_phrt_phrt_2nd_firestorm.png │ ├── plot_pht_index_bits_xor_phrt_phrt_3rd_firestorm.png │ ├── plot_pht_index_bits_xor_phrt_phrt_4th_firestorm.png │ └── plot_pht_index_bits_xor_phrt_phrt_5th_firestorm.png ├── .gitignore ├── src ├── itlb_size_gen.cpp ├── pht_index_bits_xor_phr_gen.cpp ├── pht_index_bits_xor_gen.cpp ├── ghr_size.cpp ├── phr_size.cpp ├── btb_size_basic.cpp ├── ras_size.cpp ├── elimination.cpp ├── instruction_latency.cpp ├── itlb_size.cpp ├── ras_size_lib.cpp ├── ghr_size_lib.cpp ├── rob_size.cpp ├── pht_associativity.cpp ├── ras_size_gen.cpp ├── rob_size_gen.cpp ├── pht_index_tag_bits.cpp ├── ghr_size_gen.cpp ├── elimination_lib.cpp ├── pht_tag_bits_xor_phr.cpp ├── pht_tag_bits_xor.cpp ├── phr_size_lib.cpp ├── btb_size_basic_lib.cpp ├── find_branch_misses_pmu.cpp ├── phr_branch_target_xor.cpp ├── detect_uarch.cpp └── fp_peak.cpp ├── meson_options.txt ├── aarch64-linux-cross.txt ├── android-cross.txt ├── default.nix ├── pyproject.toml ├── include ├── counters.h ├── uarch.h └── counters_mapping.h ├── figures ├── plot_dtlb_size.py ├── plot_rob_size.py ├── plot_ras_size.py ├── plot_phr_size.py ├── plot_bp_size.py ├── plot_pht_index_bits_xor.py ├── plot_memory_latency.py ├── plot_btb_size.py ├── plot_pht_associativity.py ├── plot_pht_tag_bits_xor.py ├── 
plot_pht_tag_bits_xor_phr.py ├── plot_phr_branch_bits_location.py ├── plot_pht_index_bits_xor_phr.py └── plot_phr_target_bits_location.py ├── README.md ├── stale.yml ├── ANDROID-PERF.md ├── ios-cross.txt └── Makefile /results/.gitignore: -------------------------------------------------------------------------------- 1 | !*.png 2 | !*.csv 3 | -------------------------------------------------------------------------------- /.clangd: -------------------------------------------------------------------------------- 1 | CompileFlags: 2 | CompilationDatabase: builddir/ 3 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/MSRdrvL.h: -------------------------------------------------------------------------------- 1 | ../DriverSrcLinux/MSRdrvL.h -------------------------------------------------------------------------------- /agner/testp/TestScripts/PMCTestLinux.h: -------------------------------------------------------------------------------- 1 | ../PMCTest/PMCTestLinux.h -------------------------------------------------------------------------------- /agner/NOTES.md: -------------------------------------------------------------------------------- 1 | testp downloaded from https://www.agner.org/optimize/#testp 2 | -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/.gitignore: -------------------------------------------------------------------------------- 1 | !*.png 2 | -------------------------------------------------------------------------------- /agner/testp/testp.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/testp.pdf -------------------------------------------------------------------------------- /agner/testp/PMCTest/MSRdrvL.h: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/MSRdrvL.h -------------------------------------------------------------------------------- /results/rob_size/amd_zen_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/amd_zen_1.png -------------------------------------------------------------------------------- /results/rob_size/amd_zen_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/amd_zen_2.png -------------------------------------------------------------------------------- /agner/testp/PMCTest/PMCTestA.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/PMCTestA.cpp -------------------------------------------------------------------------------- /agner/testp/PMCTest/PMCTestB.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/PMCTestB.cpp -------------------------------------------------------------------------------- /results/rob_size/ibm_power_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/ibm_power_8.png -------------------------------------------------------------------------------- /agner/testp/DriverSrcLinux/MSRdrv.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/DriverSrcLinux/MSRdrv.c 
-------------------------------------------------------------------------------- /agner/testp/DriverSrcLinux/uninstall.sh: -------------------------------------------------------------------------------- 1 | # Uninstall MSR driver 2 | rm -f /dev/MSRdrv 3 | rmmod MSRdrv 4 | # modprobe -r MSRdrv 5 | -------------------------------------------------------------------------------- /agner/testp/PMCTest/PMCTestB32.asm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/PMCTestB32.asm -------------------------------------------------------------------------------- /agner/testp/PMCTest/PMCTestB32.nasm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/PMCTestB32.nasm -------------------------------------------------------------------------------- /agner/testp/PMCTest/PMCTestB64.asm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/PMCTestB64.asm -------------------------------------------------------------------------------- /agner/testp/PMCTest/PMCTestLinux.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/PMCTestLinux.h -------------------------------------------------------------------------------- /agner/testp/PMCTest/uninstall.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/uninstall.cpp -------------------------------------------------------------------------------- /results/rob_size/arm_cortex_a72.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/arm_cortex_a72.png -------------------------------------------------------------------------------- /agner/testp/DriverSrcLinux/MSRdrv1.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/DriverSrcLinux/MSRdrv1.c -------------------------------------------------------------------------------- /agner/testp/DriverSrcLinux/MSRdrvL.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/DriverSrcLinux/MSRdrvL.h -------------------------------------------------------------------------------- /results/rob_size/arm_neoverse_n1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/arm_neoverse_n1.png -------------------------------------------------------------------------------- /results/rob_size/intel_broadwell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/intel_broadwell.png -------------------------------------------------------------------------------- /agner/testp/TestScripts/PMCTestB32.nasm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/TestScripts/PMCTestB32.nasm -------------------------------------------------------------------------------- /results/rob_size/intel_cascade_lake.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/intel_cascade_lake.png -------------------------------------------------------------------------------- /results/rob_size/intel_ivy_bridge_ep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/intel_ivy_bridge_ep.png -------------------------------------------------------------------------------- /agner/testp/DriverSrcLinux/install1.sh: -------------------------------------------------------------------------------- 1 | # Install MSR driver 2 | mknod /dev/MSRdrv c 249 0 3 | chmod 666 /dev/MSRdrv 4 | insmod -f MSRdrv.ko 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | builddir 2 | build 3 | .vscode 4 | *.png 5 | *.pdf 6 | *.csv 7 | result 8 | .cache 9 | *.swp 10 | *.bin 11 | venv 12 | perf.data 13 | perf.data.old 14 | -------------------------------------------------------------------------------- /src/itlb_size_gen.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main(int argc, char *argv[]) { 4 | FILE *fp = fopen(argv[1], "w"); 5 | // jit only 6 | fclose(fp); 7 | return 0; 8 | } 9 | -------------------------------------------------------------------------------- /agner/.gitignore: -------------------------------------------------------------------------------- 1 | *.zip 2 | x 3 | *.o 4 | *.mod 5 | *.mod.c 6 | a.out 7 | *.lst 8 | *.ko 9 | *.order 10 | *.symvers 11 | *.cmd 12 | *.exe 13 | *.sys 14 | *.obj 15 | cpugetinfo 16 | countertypes.inc 17 | cpuinfo.txt 18 | info.txt 19 | 20 | -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_oryon.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_phr_target_bits_location.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_phr_target_bits_location.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_associativity_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_associativity_oryon.png -------------------------------------------------------------------------------- /src/pht_index_bits_xor_phr_gen.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // check index bit conflict xor PHR vs PHR 4 | int main(int argc, char *argv[]) { 5 | FILE *fp = fopen(argv[1], "w"); 6 | // jit only 7 | fclose(fp); 8 | return 0; 9 | } 10 | -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_associativity_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_associativity_firestorm.png -------------------------------------------------------------------------------- 
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_phr_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_phr_oryon.png -------------------------------------------------------------------------------- /meson_options.txt: -------------------------------------------------------------------------------- 1 | option('ios', type : 'boolean', value : false) 2 | option('android', type : 'boolean', value : false) 3 | option('gem5', type : 'boolean', value : false) 4 | option('linux-cross', type : 'combo', choices: ['none', 'aarch64'], value : 'none') 5 | -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_phr_branch_bits_location_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_phr_branch_bits_location_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_phr_firestorm.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_phr_firestorm.png -------------------------------------------------------------------------------- /aarch64-linux-cross.txt: -------------------------------------------------------------------------------- 1 | [binaries] 2 | c = ['aarch64-linux-gnu-gcc'] 3 | cpp = ['aarch64-linux-gnu-g++'] 4 | ar = 'ar' 5 | strip = 'strip' 6 | 7 | [host_machine] 8 | system = 'linux' 9 | cpu_family = 'aarch64' 10 | cpu = 'aarch64' 11 | endian = 'little' 12 | -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_phr_branch_bits_location_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_phr_branch_bits_location_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_1st_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_1st_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_2nd_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_2nd_oryon.png 
-------------------------------------------------------------------------------- /android-cross.txt: -------------------------------------------------------------------------------- 1 | [binaries] 2 | c = ['aarch64-linux-android31-clang'] 3 | cpp = ['aarch64-linux-android31-clang++'] 4 | ar = 'ar' 5 | strip = 'strip' 6 | 7 | [host_machine] 8 | system = 'linux' 9 | cpu_family = 'aarch64' 10 | cpu = 'aarch64' 11 | endian = 'little' 12 | -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_pc_3rd_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_pc_3rd_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_3rd_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_3rd_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_4th_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_4th_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_3rd_oryon.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_3rd_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_4th_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_4th_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_1st_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_1st_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_2nd_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_2nd_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_3rd_firestorm.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_3rd_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_3rd_pc8_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_3rd_pc8_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_1st_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_1st_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_2nd_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_2nd_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_3rd_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_3rd_oryon.png 
-------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_4th_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_4th_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_1st_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_1st_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_2nd_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_2nd_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_3rd_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_3rd_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_4th_oryon.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_4th_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_3rd_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_3rd_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_4th_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_4th_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_5th_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_5th_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_4th_firestorm.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_4th_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_5th_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_5th_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_3rd_pc10_oryon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_3rd_pc10_oryon.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_1st_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_1st_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_2nd_firestorm.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_2nd_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_3rd_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_3rd_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_4th_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_4th_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_5th_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_5th_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_1st_firestorm.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_1st_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_2nd_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_2nd_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_3rd_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_3rd_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_4th_firestorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_4th_firestorm.png -------------------------------------------------------------------------------- /reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_5th_firestorm.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_5th_firestorm.png -------------------------------------------------------------------------------- /src/pht_index_bits_xor_gen.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | 3 | // check index bit conflict xor PC vs PHR 4 | int main(int argc, char *argv[]) { 5 | FILE *fp = fopen(argv[1], "w"); 6 | assert(fp); 7 | // use jit exclusively 8 | fclose(fp); 9 | return 0; 10 | } 11 | -------------------------------------------------------------------------------- /default.nix: -------------------------------------------------------------------------------- 1 | with import {}; 2 | 3 | stdenv.mkDerivation { 4 | name = "cpu-micro-benchmarks"; 5 | version = "1.0"; 6 | 7 | src = ./.; 8 | 9 | nativeBuildInputs = [ 10 | meson 11 | ninja 12 | ]; 13 | 14 | buildInputs = [ 15 | ]; 16 | } 17 | 18 | -------------------------------------------------------------------------------- /results/instruction_latency/intel_broadwell.csv: -------------------------------------------------------------------------------- 1 | name,latency,throughput 2 | int_add,1.00,0.00 3 | int_andn,1.00,0.50 4 | int_lea_add,1.00,0.50 5 | sse_addpd,3.00,0.00 6 | sse_addsd,3.00,0.00 7 | sse_mulpd,3.00,0.00 8 | sse_mulsd,3.00,0.00 9 | sse_subpd,3.00,0.00 10 | sse_subsd,3.00,0.00 11 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/my_branch5.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | nasm -f elf64 -l b64.lst -o b64.o -Dbranchalign=$1 -Dcounters=1,9,201,250 -Pmy_branch5.inc MyTemplateB64.nasm 3 | if [ $? -ne 0 ] ; then exit ; fi 4 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 5 | if [ $? 
-ne 0 ] ; then exit ; fi 6 | ./x 7 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/my_phr_length.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | nasm -f elf64 -l b64.lst -o b64.o -Ddummybranches=$1 -Dcounters=1,9,201,250 -Pmy_phr_length.inc MyTemplateB64.nasm 3 | if [ $? -ne 0 ] ; then exit ; fi 4 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 5 | if [ $? -ne 0 ] ; then exit ; fi 6 | ./x 7 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/my_branch.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | nasm -f elf64 -l b64.lst -o b64.o -Dbranchalign=$1 -Dtargetalign=$2 -Dcounters=1,9,201,250 -Pmy_branch.inc MyTemplateB64.nasm 3 | if [ $? -ne 0 ] ; then exit ; fi 4 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 5 | if [ $? -ne 0 ] ; then exit ; fi 6 | ./x 7 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/init64.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # init64.sh 2016-09-25 Agner Fog 3 | 4 | # Initialization of files before running test scripts 5 | # Run 64-bit mode only 6 | # (c) Copyright 2016 by Agner Fog. GNU General Public License www.gnu.org/licenses 7 | 8 | ./init.sh 64 9 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/pack_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # alltests.sh 2016-10-27 Agner Fog 3 | # (c) Copyright 2013-2016 by Agner Fog. 
GNU General Public License www.gnu.org/licenses 4 | 5 | # pack all results into zipfile 6 | zip -q allresults.zip results/* results1/* results2/* 7 | -------------------------------------------------------------------------------- /src/ghr_size.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | extern void ghr_size(FILE *fp); 6 | int main(int argc, char *argv[]) { 7 | FILE *fp = fopen("ghr_size.csv", "w"); 8 | assert(fp); 9 | ghr_size(fp); 10 | printf("Results are written to ghr_size.csv\n"); 11 | return 0; 12 | } 13 | -------------------------------------------------------------------------------- /src/phr_size.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | extern void phr_size(FILE *fp); 6 | int main(int argc, char *argv[]) { 7 | FILE *fp = fopen("phr_size.csv", "w"); 8 | assert(fp); 9 | phr_size(fp); 10 | printf("Results are written to phr_size.csv\n"); 11 | return 0; 12 | } 13 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/my_branch2.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | nasm -f elf64 -l b64.lst -o b64.o -Dbranchalign=$1 -Dtargetalign=$2 -Dbranchtoggle=$3 -Dtargettoggle=$4 -Dcounters=1,9,201,250 -Pmy_branch2.inc MyTemplateB64.nasm 3 | if [ $? -ne 0 ] ; then exit ; fi 4 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 5 | if [ $? -ne 0 ] ; then exit ; fi 6 | ./x 7 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/my_branch4.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | nasm -f elf64 -l b64.lst -o b64.o -Dbranchalign=$1 -Dtargetalign=$2 -Dbranchtoggle=$3 -Dtargettoggle=$4 -Dcounters=1,9,201,250 -Pmy_branch4.inc MyTemplateB64.nasm 3 | if [ $? 
-ne 0 ] ; then exit ; fi 4 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 5 | if [ $? -ne 0 ] ; then exit ; fi 6 | ./x 7 | -------------------------------------------------------------------------------- /agner/testp/PMCTest/stopcounters.bat: -------------------------------------------------------------------------------- 1 | rem Example .bat script to stop PMC counters. 2 | rem Must run as administrator 3 | 4 | rem set to current path 5 | @setlocal enableextensions 6 | @cd /d "%~dp0" 7 | 8 | rem Stop counters. The numbers don't have to match the values used for starting. 9 | 10 | pmctest.exe stopcounters 1 9 100 311 11 | 12 | pause -------------------------------------------------------------------------------- /agner/testp/TestScripts/my_branch3.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | nasm -f elf64 -l b64.lst -o b64.o -Dbranchalign=$1 -Dtargetalign=$2 -Dbranchtoggle=$3 -Dtargettoggle=$4 -Ddummybranches=$5 -Dcounters=1,9,201,250 -Pmy_branch3.inc MyTemplateB64.nasm 3 | if [ $? -ne 0 ] ; then exit ; fi 4 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 5 | if [ $? 
-ne 0 ] ; then exit ; fi 6 | ./x 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "cpu-micro-benchmarks" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Jiajie Chen "] 6 | 7 | [tool.poetry.dependencies] 8 | python = "^3.8" 9 | matplotlib = "^3.6.0" 10 | 11 | [tool.poetry.dev-dependencies] 12 | 13 | [build-system] 14 | requires = ["poetry-core>=1.0.0"] 15 | build-backend = "poetry.core.masonry.api" 16 | -------------------------------------------------------------------------------- /src/btb_size_basic.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | extern void btb_size_basic(FILE *fp); 7 | int main(int argc, char *argv[]) { 8 | FILE *fp = fopen("btb_size_basic.csv", "w"); 9 | assert(fp); 10 | btb_size_basic(fp); 11 | printf("Results are written to btb_size_basic.csv\n"); 12 | return 0; 13 | } 14 | -------------------------------------------------------------------------------- /src/ras_size.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | extern void ras_size(FILE *fp); 8 | int main(int argc, char *argv[]) { 9 | FILE *fp = fopen("ras_size.csv", "w"); 10 | assert(fp); 11 | ras_size(fp); 12 | printf("Results are written to ras_size.csv\n"); 13 | return 0; 14 | } 15 | -------------------------------------------------------------------------------- /agner/testp/DriverSrcLinux/install.sh: -------------------------------------------------------------------------------- 1 | # Install MSR driver, alternative method 2 | make clean 3 | make 4 | 5 | mknod /dev/MSRdrv c 249 0 6 | chmod 666 /dev/MSRdrv 7 | #insmod -f MSRdrv.ko 8 | #instead of insmod: 9 | KERNELDIR=/lib/modules/`uname 
-r` 10 | mkdir $KERNELDIR/extra 11 | cp MSRdrv.ko $KERNELDIR/extra 12 | depmod -ae 13 | modprobe MSRdrv 14 | #modprobe --force-vermagic MSRdrv 15 | -------------------------------------------------------------------------------- /agner/testp/PMCTest/startcounters.bat: -------------------------------------------------------------------------------- 1 | rem Example .bat script to start PMC counters. 2 | rem Must run as administrator 3 | 4 | rem set to current path 5 | @setlocal enableextensions 6 | @cd /d "%~dp0" 7 | 8 | rem Set counters. Modify the numbers to fit your purpose. 9 | rem See the end of PMCTestA.cpp for possible numbers 10 | 11 | pmctest.exe startcounters 1 9 100 311 12 | 13 | pause -------------------------------------------------------------------------------- /src/elimination.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | extern void elimination(FILE *fp); 8 | int main(int argc, char *argv[]) { 9 | FILE *fp = fopen("elimination.csv", "w"); 10 | assert(fp); 11 | elimination(fp); 12 | 13 | printf("Results are written to elimination.csv\n"); 14 | return 0; 15 | } 16 | -------------------------------------------------------------------------------- /include/counters.h: -------------------------------------------------------------------------------- 1 | // declare all raw counters 2 | #include "include/utils.h" 3 | DECLARE_RAW_COUNTER(cycles) 4 | DECLARE_RAW_COUNTER(instructions) 5 | DECLARE_RAW_COUNTER(branch_misses) 6 | DECLARE_RAW_COUNTER(cond_branch_misses) 7 | DECLARE_RAW_COUNTER(llc_misses) 8 | DECLARE_RAW_COUNTER(llc_loads) 9 | 10 | // declare computed counters 11 | DECLARE_COMPUTED_COUNTER(counter_per_cycle, instructions_per_cycle) -------------------------------------------------------------------------------- /agner/testp/PMCTest/a64.sh: -------------------------------------------------------------------------------- 
1 | #!/bin/bash 2 | #compile and run PMCTest in 64 bit mode with yasm assembly syntax 3 | 4 | # Compile A file if modified 5 | if [ PMCTestA.cpp -nt a64.o ] ; then 6 | g++ -O2 -c -m64 -oa64.o PMCTestA.cpp 7 | fi 8 | 9 | nasm -f elf64 -l b64.lst -o b64.o -DWINDOWS=0 PMCTestB64.nasm 10 | if [ $? -ne 0 ] ; then exit ; fi 11 | 12 | g++ a64.o b64.o -ox -lpthread -z noexecstack 13 | if [ $? -ne 0 ] ; then exit ; fi 14 | 15 | ./x 16 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/a64.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #compile and run PMCTest in 64 bit mode with yasm assembly syntax 3 | 4 | # Compile A file if modified 5 | if [ PMCTestA.cpp -nt a64.o ] ; then 6 | g++ -O2 -c -m64 -oa64.o PMCTestA.cpp 7 | fi 8 | 9 | nasm -f elf64 -l b64.lst -o b64.o -DWINDOWS=0 PMCTestB64.nasm 10 | if [ $? -ne 0 ] ; then exit ; fi 11 | 12 | g++ a64.o b64.o -ox -lpthread -z noexecstack 13 | if [ $? -ne 0 ] ; then exit ; fi 14 | 15 | ./x 16 | -------------------------------------------------------------------------------- /agner/testp/PMCTest/c64.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Compile and run PMCTest in 64 bit mode using C++ 3 | # (c) 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses 4 | 5 | # Compile A file if modified 6 | if [ PMCTestA.cpp -nt a64.o ] ; then 7 | g++ -O2 -c -m64 -oa64.o PMCTestA.cpp 8 | fi 9 | 10 | # Compile B file and link 11 | g++ -O2 -m64 a64.o PMCTestB.cpp -lpthread 12 | if [ $? -ne 0 ] ; then exit ; fi 13 | ./a.out 14 | 15 | # read -p "Press [Enter]" 16 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/c64.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Compile and run PMCTest in 64 bit mode using C++ 3 | # (c) 2012 by Agner Fog. 
GNU General Public License www.gnu.org/licenses 4 | 5 | # Compile A file if modified 6 | if [ PMCTestA.cpp -nt a64.o ] ; then 7 | g++ -O2 -c -m64 -oa64.o PMCTestA.cpp 8 | fi 9 | 10 | # Compile B file and link 11 | g++ -O2 -m64 a64.o PMCTestB.cpp -lpthread 12 | if [ $? -ne 0 ] ; then exit ; fi 13 | ./a.out 14 | 15 | # read -p "Press [Enter]" 16 | -------------------------------------------------------------------------------- /figures/plot_dtlb_size.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | import csv 3 | 4 | 5 | size_data = [] 6 | cycles_data = [] 7 | 8 | with open("dtlb_size.csv", newline="") as f: 9 | r = csv.DictReader(f) 10 | for row in r: 11 | size_data.append(float(row["pages"])) 12 | cycles_data.append(float(row["cycles"])) 13 | 14 | plt.figure(figsize=(7, 6)) 15 | plt.plot(size_data, cycles_data) 16 | plt.ylabel("Cycles") 17 | plt.xlabel("Pages") 18 | plt.grid() 19 | plt.savefig("plot_dtlb_size.png") 20 | plt.cla() 21 | -------------------------------------------------------------------------------- /agner/testp/PMCTest/a32.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Compile and run PMCTest in 32 bit mode using yasm assembler syntax 3 | # In 32-bit Linux: Remove -m32 flag on g++ commands 4 | # In 64-bit Linux: Must install g++-multilib first 5 | 6 | # Compile A file if modified 7 | if [ PMCTestA.cpp -nt a32.o ] ; then 8 | g++ -O2 -c -m32 -oa32.o PMCTestA.cpp 9 | fi 10 | 11 | nasm -f elf32 -l b32.lst -o b32.o PMCTestB32.nasm 12 | if [ $? -ne 0 ] ; then exit ; fi 13 | 14 | g++ -m32 a32.o b32.o -ox -lpthread -z noexecstack 15 | if [ $? 
-ne 0 ] ; then exit ; fi 16 | 17 | ./x -------------------------------------------------------------------------------- /agner/testp/TestScripts/a32.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Compile and run PMCTest in 32 bit mode using yasm assembler syntax 3 | # In 32-bit Linux: Remove -m32 flag on g++ commands 4 | # In 64-bit Linux: Must install g++-multilib first 5 | 6 | # Compile A file if modified 7 | if [ PMCTestA.cpp -nt a32.o ] ; then 8 | g++ -O2 -c -m32 -oa32.o PMCTestA.cpp 9 | fi 10 | 11 | nasm -f elf32 -l b32.lst -o b32.o PMCTestB32.nasm 12 | if [ $? -ne 0 ] ; then exit ; fi 13 | 14 | g++ -m32 a32.o b32.o -ox -lpthread -z noexecstack 15 | if [ $? -ne 0 ] ; then exit ; fi 16 | 17 | ./x -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cpu-micro-benchmarks 2 | 3 | Inspired by: 4 | 5 | - https://github.com/travisdowns/robsize 6 | - https://github.com/Veedrac/microarchitecturometer 7 | - https://github.com/ChipsandCheese/Microbenchmarks 8 | 9 | ## Paper 10 | 11 | This repo contains the code and results for the paper [Dissecting Conditional Branch Predictors of Apple Firestorm and Qualcomm Oryon for Software Optimization and Architectural Analysis](https://arxiv.org/abs/2411.13900). Please refer to [the report](./reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/README.md) for details. 12 | -------------------------------------------------------------------------------- /agner/testp/PMCTest/c32.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Compile and run PMCTest in 32 bit mode using C++ 3 | # (c) 2012 by Agner Fog. 
GNU General Public License www.gnu.org/licenses 4 | 5 | # In 32-bit Linux: Remove -m32 flag on g++ commands 6 | # In 64-bit Linux: Must install g++-multilib first 7 | 8 | # Compile A file if modified 9 | if [ PMCTestA.cpp -nt a32.o ] ; then 10 | g++ -O2 -c -m32 -oa32.o PMCTestA.cpp 11 | fi 12 | 13 | # Compile B file and link 14 | g++ -O2 -m32 a32.o PMCTestB.cpp -lpthread 15 | if [ $? -ne 0 ] ; then exit ; fi 16 | ./a.out 17 | 18 | # read -p "Press [Enter]" 19 | -------------------------------------------------------------------------------- /figures/plot_rob_size.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | import csv 3 | 4 | size_data = [] 5 | min_data = [] 6 | avg_data = [] 7 | 8 | with open('rob_size.csv', newline='') as f: 9 | r = csv.DictReader(f) 10 | for row in r: 11 | size_data.append(float(row["size"])) 12 | min_data.append(float(row["min"])) 13 | avg_data.append(float(row["avg"])) 14 | 15 | plt.plot(size_data, min_data, label="min") 16 | plt.plot(size_data, avg_data, label="avg") 17 | plt.ylabel('Time') 18 | plt.xlabel('Instruction Block Size') 19 | plt.legend() 20 | plt.savefig('plot_rob_size.png') 21 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/c32.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Compile and run PMCTest in 32 bit mode using C++ 3 | # (c) 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses 4 | 5 | # In 32-bit Linux: Remove -m32 flag on g++ commands 6 | # In 64-bit Linux: Must install g++-multilib first 7 | 8 | # Compile A file if modified 9 | if [ PMCTestA.cpp -nt a32.o ] ; then 10 | g++ -O2 -c -m32 -oa32.o PMCTestA.cpp 11 | fi 12 | 13 | # Compile B file and link 14 | g++ -O2 -m32 a32.o PMCTestB.cpp -lpthread 15 | if [ $? 
-ne 0 ] ; then exit ; fi 16 | ./a.out 17 | 18 | # read -p "Press [Enter]" 19 | -------------------------------------------------------------------------------- /figures/plot_ras_size.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | import csv 3 | 4 | size_data = [] 5 | min_data = [] 6 | avg_data = [] 7 | 8 | with open('ras_size.csv', newline='') as f: 9 | r = csv.DictReader(f) 10 | for row in r: 11 | size_data.append(float(row["size"])) 12 | min_data.append(float(row["min"])) 13 | avg_data.append(float(row["avg"])) 14 | 15 | plt.plot(size_data, min_data, label="min") 16 | plt.plot(size_data, avg_data, label="avg") 17 | plt.ylabel('Time') 18 | plt.xlabel('Call Depth') 19 | plt.legend() 20 | plt.grid() 21 | plt.savefig('plot_ras_size.png') 22 | -------------------------------------------------------------------------------- /figures/plot_phr_size.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | import csv 3 | 4 | size_data = [] 5 | min_data = [] 6 | 7 | with open("phr_size.csv", newline="") as f: 8 | r = csv.DictReader(f) 9 | for row in r: 10 | size_data.append(float(row["size"])) 11 | min_data.append(float(row["avg"]) * 100) 12 | 13 | plt.figure(figsize=(5, 2)) 14 | plt.plot(size_data, min_data) 15 | plt.yticks([0, 50]) 16 | plt.ylabel("Misprediction rate (%)") 17 | plt.xlabel("# Branches before the last conditional branch") 18 | plt.savefig("plot_phr_size.png") 19 | plt.savefig("plot_phr_size.pdf", bbox_inches="tight") 20 | -------------------------------------------------------------------------------- /stale.yml: -------------------------------------------------------------------------------- 1 | name: 'Close stale issues and PRs' 2 | on: 3 | schedule: 4 | - cron: '0 0 * * *' 5 | 6 | jobs: 7 | stale: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/stale@v9 11 | with: 12 | stale-issue-message: 
'This issue is stale because it has been open 60 days with no activity. Remove stale label or comment or this will be closed in 7 days.' 13 | stale-pr-message: 'This pull request is stale because it has been open 60 days with no activity. Remove stale label or comment or this will be closed in 7 days.' 14 | days-before-stale: 60 15 | days-before-close: 7 16 | -------------------------------------------------------------------------------- /agner/testp/DriverSrcLinux/Makefile: -------------------------------------------------------------------------------- 1 | # Make MSR driver for Linux 2 | # Last modified: 2020-08-17 Agner Fog 3 | # See https://www.kernel.org/doc/html/latest/kbuild/modules.html 4 | 5 | 6 | KERNELDIR := /lib/modules/`uname -r`/build 7 | obj-m := MSRdrv.o 8 | PWD := $(shell pwd) 9 | 10 | default: 11 | $(MAKE) -C $(KERNELDIR) M=$(PWD) modules 12 | 13 | #$(MAKE) -C $(KERNELDIR) SUBDIRS=$(PWD) modules 14 | 15 | clean: 16 | rm -f .MSRdrv.* 17 | rm -f -r .tmp_versions 18 | rm -f *~ 19 | rm -f MSRdrv.ko 20 | rm -f MSRdrv.o 21 | rm -f MSRdrv.mod.* 22 | rm -f linux.mdl 23 | rm -f Modules.symvers 24 | 25 | install: 26 | ./install.sh 27 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/allsh2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 2016-10-27 Agner Fog 3 | # Compile and run PMCTest with various scripts 4 | # looping through scripts with extension .sh2 5 | 6 | # (c) 2012-2016 by Agner Fog. GNU General Public License www.gnu.org/licenses 7 | 8 | # various initializations (only necessary first time): 9 | 10 | # mkdir results2 11 | 12 | . 
vars.sh 13 | 14 | # warm up processor to max clock frequency 15 | echo -e "\nwarmup\n" 16 | 17 | ./warmup_fp.sh2 18 | 19 | # run all test scripts 20 | for xscript in *.sh2 21 | do 22 | echo -e "\n$xscript" 23 | ./$xscript 24 | done 25 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/allcsv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 2016-10-27 Agner Fog 3 | # Compile and run PMCTest with various instructions defined in comma-separated lists 4 | # looping through all lists with extension .csv 5 | 6 | # (c) 2016 by Agner Fog. GNU General Public License www.gnu.org/licenses 7 | 8 | # various initializations (only necessary first time): 9 | 10 | # mkdir results 11 | 12 | . vars.sh 13 | 14 | export outdir=results 15 | 16 | # warm up processor to max clock frequency 17 | # echo -e "\nwarmup\n" 18 | # ./warmup_fp.sh2 19 | 20 | # run all test scripts 21 | for xscript in *.csv 22 | do 23 | echo -e "\n$xscript" 24 | ./runlist.sh $xscript 25 | done 26 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/my_phr_length.inc: -------------------------------------------------------------------------------- 1 | ; reproduce Figure 2 of Half&Half 2 | 3 | ; number of dummy branches 4 | %ifndef dummybranches 5 | %define dummybranches 200 6 | %endif 7 | 8 | %macro testinit3 0 9 | mov rdi, 1000 10 | READ_PMC_START 11 | 12 | loop_begin: 13 | 14 | ; train branch 15 | rdrand eax 16 | and eax, 1 17 | jnz first_target 18 | first_target: 19 | 20 | ; dummy branches 21 | %assign i 0 22 | %rep dummybranches 23 | jmp dummy_branch_%+ i 24 | dummy_branch_%+ i: 25 | %assign i i+1 26 | %endrep 27 | 28 | ; test branch 29 | test eax, eax 30 | jnz second_target 31 | second_target: 32 | 33 | dec rdi 34 | jnz loop_begin 35 | 36 | READ_PMC_END 37 | 38 | 39 | %endmacro 
-------------------------------------------------------------------------------- /results/instruction_latency/loongson_3a6000.csv: -------------------------------------------------------------------------------- 1 | name,latency,throughput 2 | fp_fadd_d,3.00,0.25 3 | fp_fadd_s,3.00,0.25 4 | fp_fmadd_d,5.00,0.50 5 | fp_fmadd_s,5.00,0.50 6 | fp_fmul_d,5.00,0.50 7 | fp_fmul_s,5.00,0.50 8 | int_add32,1.00,0.25 9 | int_add64,1.00,0.25 10 | int_crc_w_b_w,1.00,0.50 11 | int_crc_w_d_w,1.00,0.50 12 | int_crc_w_h_w,1.00,0.50 13 | int_crc_w_w_w,1.00,0.50 14 | int_crcc_w_d_w,1.00,0.00 15 | int_crcc_w_w_w,1.00,0.00 16 | int_div64,19.00,0.00 17 | int_mul32,4.00,0.50 18 | int_mul64,4.00,0.50 19 | lasx_fp_xvfadd_d,3.00,0.25 20 | lasx_fp_xvfmadd_d,5.00,0.50 21 | lasx_fp_xvfmul_d,5.00,0.50 22 | lasx_int_xvadd_d,1.00,0.25 23 | lasx_int_xvmul_d,4.00,0.50 24 | lsx_fp_vfadd_d,3.00,0.25 25 | lsx_fp_vfmadd_d,5.00,0.50 26 | lsx_fp_vfmul_d,5.00,0.50 27 | lsx_int_vadd_d,1.00,0.25 28 | lsx_int_vmul_d,4.00,0.50 29 | -------------------------------------------------------------------------------- /results/instruction_latency/loongson_3c5000.csv: -------------------------------------------------------------------------------- 1 | name,latency,throughput 2 | fp_fadd_d,5.00,0.50 3 | fp_fadd_s,5.00,0.50 4 | fp_fmadd_d,5.00,0.50 5 | fp_fmadd_s,5.00,0.50 6 | fp_fmul_d,5.00,0.50 7 | fp_fmul_s,5.00,0.50 8 | int_add32,1.00,0.25 9 | int_add64,1.00,0.25 10 | int_crc_w_b_w,5.00,3.50 11 | int_crc_w_d_w,19.00,10.50 12 | int_crc_w_h_w,7.00,4.50 13 | int_crc_w_w_w,11.00,6.50 14 | int_crcc_w_d_w,19.00,0.00 15 | int_crcc_w_w_w,11.00,0.00 16 | int_div64,4.00,0.00 17 | int_mul32,4.00,0.50 18 | int_mul64,4.00,0.50 19 | lasx_fp_xvfadd_d,5.00,0.50 20 | lasx_fp_xvfmadd_d,5.00,0.50 21 | lasx_fp_xvfmul_d,5.00,0.50 22 | lasx_int_xvadd_d,1.00,0.50 23 | lasx_int_xvmul_d,4.00,0.50 24 | lsx_fp_vfadd_d,5.00,0.50 25 | lsx_fp_vfmadd_d,5.00,0.50 26 | lsx_fp_vfmul_d,5.00,0.50 27 | lsx_int_vadd_d,1.00,0.50 28 | 
lsx_int_vmul_d,4.00,0.50 29 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/testmemcpyalign.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 2013-08-07 Agner Fog 3 | 4 | # Compile and run PMCTest for different implementations of memcpy 5 | # (c) 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses 6 | 7 | . vars.sh 8 | 9 | # Assemble testmemcpyal.nasm file 10 | $ass -f elf64 -o t64.o testmemcpyal.nasm 11 | if [ $? -ne 0 ] ; then exit ; fi 12 | 13 | # Compile cpp file and link 14 | g++ -O2 -m64 testmemcpyalign.cpp t64.o -ox.exe 15 | if [ $? -ne 0 ] ; then exit ; fi 16 | 17 | echo -e "Test if memory copying has penalty for false dependence between source and destination addressses\n" > results2/testmemcpyalign.txt 18 | 19 | # Run test 20 | ./x.exe >> results2/testmemcpyalign.txt 21 | 22 | echo -e "\n" >> results2/testmemcpyalign.txt 23 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/allsh1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 2016-10-27 Agner Fog 3 | # Compile and run PMCTest with various scripts 4 | # looping through scripts with extension .sh1 5 | # (c) Copyright 2012-2016 by Agner Fog. GNU General Public License www.gnu.org/licenses 6 | 7 | # various initializations (only necessary first time): 8 | . vars.sh 9 | 10 | # warm up processor to max clock frequency 11 | echo -e "\nwarmup\n" 12 | $ass -f elf64 -o b64.o -Dinstruct=nop -DWARMUPCOUNT=10000000 -Dnthreads=1 TemplateB64.nasm 13 | if [ $? -ne 0 ] ; then exit ; fi 14 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread -z noexecstack 15 | if [ $? 
-ne 0 ] ; then exit ; fi 16 | ./x >> /dev/null 17 | 18 | # run all test scripts 19 | for xscript in *.sh1 20 | do 21 | echo -e "\n$xscript" 22 | ./$xscript 23 | done 24 | 25 | -------------------------------------------------------------------------------- /results/instruction_latency/huawei_kunpeng920.csv: -------------------------------------------------------------------------------- 1 | name,latency,throughput 2 | asimd_fp_fadd_double,4.00,1.00 3 | asimd_fp_fadd_single,5.00,0.50 4 | asimd_fp_fmla_single,4.00/5.00,0.00 5 | asimd_fp_fmul_double,5.00,1.00 6 | asimd_fp_fmul_single,5.00,0.50 7 | asimd_int_add_double,2.00,0.50 8 | asimd_int_add_half,2.00,0.50 9 | asimd_int_add_single,2.00,0.50 10 | asimd_int_mul_half,7.00,0.00 11 | asimd_int_mul_single,7.00,2.83 12 | fp_fadd_double,4.00,0.50 13 | fp_fadd_single,5.00,0.50 14 | fp_fdiv_double,6.00,0.00 15 | fp_fdiv_single,6.00,0.00 16 | fp_fmadd_double,5.00/7.00,0.50 17 | fp_fmadd_single,4.00/5.00,0.50 18 | fp_fmul_double,5.00,0.50 19 | fp_fmul_single,5.00,0.50 20 | fp_fsqrt_double,9.00,0.00 21 | fp_fsqrt_single,7.00,0.00 22 | int_add,1.00,0.33 23 | int_madd,1.00/4.00,1.00 24 | int_mul,4.00,1.00 25 | int_sdiv,6.00,5.00 26 | int_smull,3.00,1.00 27 | int_udiv,6.00,5.00 28 | -------------------------------------------------------------------------------- /ANDROID-PERF.md: -------------------------------------------------------------------------------- 1 | # Use perf counters on Android 2 | 3 | There are different ways to access perf counters on Android: 4 | 5 | 1. On a rooted device, you can access the PMU as the root user 6 | 2. Execute microbenchmarks using `adb shell` 7 | 8 | You need to copy executables to Android using `adb push`. Beware that some partitions are mounted as noexec, e.g. `/storage/emulated/0/`. 9 | 10 | If you find it hard to find a target directory for `adb push`, you can: 11 | 12 | 1. Run `sshd` in Termux to launch an SSH server 13 | 2. Use `scp` to copy the program to the home directory under Termux 14 | 3. 
import csv

import numpy as np
from matplotlib import pyplot as plt

# Every column of bp_size.csv is parsed as float, in this order, for each row.
columns = {"size": [], "history": [], "min": [], "avg": []}

with open('bp_size.csv', newline='') as csv_file:
    for record in csv.DictReader(csv_file):
        for key, values in columns.items():
            values.append(float(record[key]))

size_data = columns["size"]
history_data = columns["history"]
min_data = columns["min"]
avg_data = columns["avg"]

# Only the "avg" column is drawn: an 11 x 17 heat map, colour scale capped at 10.
heatmap = np.array(avg_data).reshape((11, 17))
plt.imshow(heatmap, vmax=10)
plt.colorbar()

# Tick labels are powers of two: 1..65536 on x (17 values), 1..1024 on y (11 values).
xticks = [2 ** exponent for exponent in range(17)]
plt.xticks(range(len(xticks)), xticks, rotation='vertical')

yticks = [2 ** exponent for exponent in range(11)]
plt.yticks(range(len(yticks)), yticks)

plt.xlabel('Pattern Length')
plt.ylabel('Branch Num')
plt.savefig('plot_bp_size.png')
'-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk'] 6 | objcpp = ['clang++', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk'] 7 | ar = 'ar' 8 | strip = 'strip' 9 | 10 | [host_machine] 11 | system = 'darwin' 12 | subsystem = 'ios' 13 | kernel = 'xnu' 14 | cpu_family = 'aarch64' 15 | cpu = 'aarch64' 16 | endian = 'little' -------------------------------------------------------------------------------- /figures/plot_pht_index_bits_xor.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from matplotlib import pyplot as plt 3 | import numpy as np 4 | import csv 5 | 6 | # Test branch toggles 7 | x_data = [] 8 | y_data = [] 9 | z_data = [] 10 | 11 | with open("pht_index_bits_xor.csv", newline="") as f: 12 | r = csv.DictReader(f) 13 | for row in r: 14 | x_data.append(int(row["branches"])) 15 | y_data.append(int(row["dummy"])) 16 | z_data.append(min(float(row["min"]), 0.5)) 17 | z_data = np.array(z_data) 18 | 19 | x_data = list(sorted(set(x_data))) 20 | y_data = list(sorted(set(y_data))) 21 | z_data = z_data.reshape((len(x_data), len(y_data))) 22 | 23 | plt.figure(figsize=(20, 6)) 24 | plt.imshow(z_data) 25 | plt.ylabel("Predict branches") 26 | plt.yticks(range(len(x_data)), x_data) 27 | plt.xlabel("PHR bit position") 28 | plt.xticks(range(len(y_data)), y_data, rotation=90) 29 | plt.savefig("plot_pht_index_bits_xor.png") 30 | plt.cla() 31 | -------------------------------------------------------------------------------- /src/instruction_latency.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | extern void instruction_latency(FILE *fp); 10 | extern bool instruction_latency_use_perf; 11 | extern int 
instruction_latency_loop_count; 12 | int main(int argc, char *argv[]) { 13 | int opt; 14 | while ((opt = getopt(argc, argv, "n:p")) != -1) { 15 | switch (opt) { 16 | case 'n': 17 | sscanf(optarg, "%d", &instruction_latency_loop_count); 18 | break; 19 | case 'p': 20 | instruction_latency_use_perf = true; 21 | break; 22 | default: 23 | fprintf(stderr, "Usage: %s [-p]\n", argv[0]); 24 | exit(EXIT_FAILURE); 25 | } 26 | } 27 | 28 | FILE *fp = fopen("instruction_latency.csv", "w"); 29 | assert(fp); 30 | instruction_latency(fp); 31 | printf("Result written to instruction_latency.csv\n"); 32 | return 0; 33 | } 34 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/returnstack.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 2012-02-19 AgF 3 | 4 | # Compile and run PMCTest for detecting return stack buffer size 5 | # (c) 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses 6 | 7 | echo -e "Test of return stack buffer size\n" > results2/returnstack.txt 8 | echo -e "The size of the return stack buffer is the last level that has few or no mispredictions\n\n" >> results2/returnstack.txt 9 | 10 | . vars.sh 11 | 12 | for nesting in {2..66} 13 | do 14 | 15 | echo -e "\n\nNesting level $nesting" >> results2/returnstack.txt 16 | $ass -f elf64 -o b64.o -lxx.lst -Dnesting=$nesting -Dcounters=$BranchPMCs -Preturnstack.inc TemplateB64.nasm 17 | if [ $? -ne 0 ] ; then exit ; fi 18 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 19 | if [ $? 
from matplotlib import pyplot as plt
import csv

# Plot access time and LLC load/miss counts against memory block size,
# reading memory_latency.csv and writing plot_memory_latency.png.

size_data = []
time_data = []
llc_miss_data = []
llc_load_data = []

with open('memory_latency.csv', newline='') as f:
    r = csv.reader(f)
    for row in r:
        # Check the length first: csv.reader yields [] for a blank line, and
        # indexing row[0] on it would raise IndexError. Rows with fewer than
        # five fields and the "size" header row are skipped.
        if len(row) < 5 or row[0] == "size":
            continue
        size_data.append(float(row[0]))
        time_data.append(float(row[1]))
        # column 2 is not used by this plot
        llc_miss_data.append(float(row[3]))
        llc_load_data.append(float(row[4]))

fig, ax = plt.subplots()

# Left axis: access time
ax.plot(size_data, time_data)
ax.set_xscale('log')
ax.set_ylabel('Time (ns)')
ax.set_xlabel('Memory Block Size (B)')

# Right axis (shares x with the left one): LLC loads and misses
ax2 = ax.twinx()

ax2.plot(size_data, llc_load_data, 'r.-', label='LLC Loads')
ax2.plot(size_data, llc_miss_data, 'g.-', label='LLC Misses')
ax2.legend()
ax2.set_xscale('log')
ax2.set_ylabel('LLC Load/Miss per Access')
ax2.set_xlabel('Memory Block Size (B)')

plt.savefig('plot_memory_latency.png')
["./my_phr_length.sh2", str(dummy_branches)], encoding="utf-8" 11 | ) 12 | heading = False 13 | data = [] 14 | for line in output.splitlines(): 15 | parts = list(filter(lambda s: len(s) > 0, line.strip().split(" "))) 16 | if len(parts) > 0: 17 | if not heading: 18 | assert parts[5] == "BrMisCond" 19 | heading = True 20 | else: 21 | data.append(int(parts[4])) 22 | avg = np.average(np.array(data)) / 2000 # 2 branches, 1000 loops 23 | print(dummy_branches, f"{avg:.2f}") 24 | y_data.append(avg) 25 | 26 | plt.plot(x_data, y_data) 27 | plt.xlabel("Num. of Dummy Branches") 28 | plt.ylabel("Miss Rate") 29 | plt.yticks([0.25, 0.50]) 30 | plt.savefig("my_phr_length.png") 31 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/instruct_boundaries.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # instruct_boundaries.sh2 2013-07-11 Agner Fog 3 | # 4 | # PMC Test program for testing if instruction boundaries are marked in 5 | # instruction cache by making jump past previous instruction boundary 6 | # 7 | # (c) 2013 GNU General Public License www.gnu.org/licenses 8 | # 9 | # repeatu: number of instructions in sequency 10 | 11 | . vars.sh 12 | 13 | nthreads=1 14 | 15 | echo -e "Test if instruction boundaries are marked in instruction cache" > results2/instruct_boundaries.txt 16 | 17 | for repeatu in 10 100 1000 18 | do 19 | echo -e "\n$repeatu instructions" >> results2/instruct_boundaries.txt 20 | 21 | $ass -f elf64 -o b64.o -Drepeatu=$repeatu -Dcounters=$BranchPMCs -Dnthreads=$nthreads -Pinstruct_boundaries.inc TemplateB64.nasm 22 | if [ $? -ne 0 ] ; then exit ; fi 23 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 24 | if [ $? 
// NOTE(review): the bracketed header names were lost in this copy of the file;
// these four are the ones this translation unit uses — confirm against the
// original include list.
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

extern void itlb_size(FILE *fp);
extern bool avoid_hugepage_merging;
extern int stride;
extern int fake_page_size;

// Print the command-line synopsis to stderr and terminate with failure.
static void usage(const char *prog) {
  fprintf(stderr, "Usage: %s [-h] [-s stride] [-f page_size]\n", prog);
  fprintf(stderr, "\t-h: avoid huge page merging\n");
  fprintf(stderr, "\t-s stride: set branch address stride\n");
  fprintf(stderr, "\t-f page_size: fake page size\n");
  exit(EXIT_FAILURE);
}

// Parse a decimal integer option argument into *out; false when it is not a number.
static bool parse_int(const char *text, int *out) {
  return sscanf(text, "%d", out) == 1;
}

// Entry point: parse options, then run itlb_size() with its output directed
// to itlb_size.csv in the current directory.
//   -h        sets avoid_hugepage_merging
//   -s <int>  stored into stride
//   -f <int>  stored into fake_page_size
int main(int argc, char *argv[]) {
  int opt;
  while ((opt = getopt(argc, argv, "hs:f:")) != -1) {
    switch (opt) {
    case 'h':
      avoid_hugepage_merging = true;
      break;
    case 's':
      // Reject non-numeric arguments instead of silently keeping the old value.
      if (!parse_int(optarg, &stride)) {
        fprintf(stderr, "Invalid stride: %s\n", optarg);
        usage(argv[0]);
      }
      break;
    case 'f':
      if (!parse_int(optarg, &fake_page_size)) {
        fprintf(stderr, "Invalid page size: %s\n", optarg);
        usage(argv[0]);
      }
      break;
    default:
      usage(argv[0]);
    }
  }

  FILE *fp = fopen("itlb_size.csv", "w");
  if (fp == NULL) {
    // Report the failure explicitly; assert() would vanish under NDEBUG.
    perror("itlb_size.csv");
    return EXIT_FAILURE;
  }
  // NOTE(review): fp is not closed here because it is not visible whether
  // itlb_size() closes it; open streams are flushed on normal exit.
  itlb_size(fp);
  printf("Results are written to itlb_size.csv\n");
  return 0;
}
import csv

from matplotlib import pyplot as plt

# Mimic https://chipsandcheese.com/2023/10/27/cortex-x2-arm-aims-high/

# Load btb_size.csv; every column is parsed as float for each row.
size_data = []
stride_data = []
min_data = []
avg_data = []

with open('btb_size.csv', newline='') as csv_file:
    for record in csv.DictReader(csv_file):
        size_data.append(float(record["size"]))
        stride_data.append(float(record["stride"]))
        min_data.append(float(record["min"]))
        avg_data.append(float(record["avg"]))

# 4, 8, ..., 8192: used both as the set of strides and as the x tick positions.
powers_of_two = [2 ** exponent for exponent in range(2, 14)]

# One curve per stride: the "min" column against "size", sizes limited to 8192.
for stride in powers_of_two:
    selected = [
        (size, best)
        for size, row_stride, best in zip(size_data, stride_data, min_data)
        if row_stride == stride and size <= 8192
    ]
    x_data = [size for size, _ in selected]
    y_data = [best for _, best in selected]
    plt.plot(x_data, y_data, label=f"Branch Per {stride}B")

plt.xscale('log')
ticks = list(powers_of_two)
plt.xticks(ticks, ticks)
plt.yticks(range(10))
plt.ylim((0, 10))
plt.grid()
plt.xlabel('Branches in loop')
plt.ylabel('Cycles Per Branch')
plt.legend()
plt.savefig('plot_btb_size.png')
GNU General Public License www.gnu.org/licenses 5 | ; 6 | ; Parameters: 7 | ; 8 | ; stride1: Bigger than all banks 9 | ; 10 | ; stride2: Bank size 11 | ; 12 | ; tmode: Test mode: 13 | ; 1: read from two addresses spaced by stride1 14 | ; 2: read from two addresses spaced by stride1 + stride2 15 | ; 3: read and write from two addresses spaced by stride1 16 | ; 4: read and write from two addresses spaced by stride1 + stride2 17 | 18 | %macro testinit1 0 19 | 20 | %endmacro 21 | 22 | 23 | ; main testcode macro 24 | 25 | %macro testcode 0 26 | %if tmode == 1 ; 27 | mov eax,[rsi] 28 | mov ebx,[rsi + stride1] 29 | %elif tmode == 2 ; 30 | mov eax,[rsi] 31 | mov ebx,[rsi + stride1 + stride2] 32 | %elif tmode == 3 ; 33 | mov eax,[rsi] 34 | mov [rsi + stride1],eax 35 | %elif tmode == 4 ; 36 | mov eax,[rsi] 37 | mov [rsi + stride1 + stride2],eax 38 | %endif 39 | %endmacro 40 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/stack_sync_uops.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 2013-07-19 Agner Fog 3 | # Compile and run PMCTest for testing branch prediction penalty with and without 4 | # microop cache 5 | 6 | # (c) Copyright 2013 by Agner Fog. GNU General Public License www.gnu.org/licenses 7 | 8 | # Detect CPU specific variables 9 | . vars.sh 10 | 11 | 12 | echo -e "Stack synchronization micro-ops" > results2/stack_sync_uops.txt 13 | 14 | repeat0=10 15 | count1=10 16 | 17 | let tcase=0 18 | 19 | for xcase in Push_and_pop_only added_mov_r,[rsp] further_added_mov_r,rsp \ 20 | call_and_ret call_and_ret_imm call_and_ret_and_add_rsp,const 21 | do 22 | let tcase+=1 23 | 24 | echo -e "\n\nCase $tcase: $xcase" >> results2/stack_sync_uops.txt 25 | 26 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dcount1=$count1 -Drepeat0=$repeat0 -Pstack_sync_uops.inc TemplateB64.nasm 27 | if [ $? 
/*
 * uarch.h - identifiers for the CPU microarchitectures known to this project.
 *
 * Enumerators are grouped by ISA. Each arm64/loongarch64 group ends with an
 * "unknown_<isa>" catch-all and has <isa>_begin / <isa>_end aliases that name
 * the first and last enumerator of the group. all_begin / all_end do the same
 * for the whole list (excluding `unknown`).
 */
#ifndef UARCH_H /* was __UARCH_H__: double-underscore names are reserved */
#define UARCH_H

enum uarch {
  // special
  unknown,

  // arm64
  // apple
  // m1
  firestorm,
  icestorm,
  // m2
  avalanche,
  blizzard,
  // m4
  m4_pcore,
  m4_ecore,
  // qualcomm
  oryon,
  // arm
  cortex_a77,
  cortex_a78,
  cortex_x1,
  neoverse_n1,
  neoverse_v1,
  neoverse_n2,
  neoverse_v2,
  // hisilicon
  tsv110,

  unknown_arm64,
  arm64_begin = firestorm,
  arm64_end = unknown_arm64,

  // loongarch
  // (numbering continues from arm64_end + 1, i.e. unknown_arm64 + 1)
  la464,
  la664,
  unknown_loongarch64,
  loongarch64_begin = la464,
  loongarch64_end = unknown_loongarch64,

  // intel
  golden_cove,
  gracemont,
  sunny_cove,
  skylake,
  broadwell,
  // amd
  zen1,
  zen2,
  zen3,
  zen4,
  zen5,

  unknown_amd64,

  // valid range
  all_begin = firestorm,
  all_end = unknown_amd64,
};

// detect uarch
// `(void)` gives a real prototype in C; in C++ it is the same as `()`.
enum uarch get_uarch(void);
// which core to bind
int get_bind_core(void);

#endif /* UARCH_H */
# Thin wrapper around meson/ninja.
#   make / make all    configure (once) and build the native release build
#   make clean         ninja clean inside builddir
#   make distclean     remove builddir
#   make ios | android | aarch64-linux | gem5
#                      configure (once) and build the corresponding variant
#
# Every target that does not name a real file is listed here, so a stray file
# or directory called e.g. "android" or "ios" cannot make the target look
# up to date.
.PHONY: all compile clean distclean ios android aarch64-linux gem5

ifeq ($(shell which meson),)
$(error Please install meson first!)
endif

ifeq ($(shell which ninja),)
$(error Please install ninja first!)
endif

all: builddir/stamp compile

# The stamp file records that `meson setup` has already been run.
builddir/stamp:
	meson setup builddir --buildtype=release
	touch $@

compile: builddir/stamp
	ninja -C builddir

clean: builddir/stamp
	ninja -C builddir clean

distclean:
	rm -rf builddir

# Variant build directories are real directories, so they are not phony:
# each is configured only when it does not exist yet.
builddir-ios:
	meson setup builddir-ios -Dios=true --buildtype=release --cross-file ios-cross.txt

ios: builddir-ios
	ninja -C builddir-ios

builddir-android:
	meson setup builddir-android -Dandroid=true --buildtype=release --cross-file android-cross.txt

android: builddir-android
	ninja -C builddir-android

builddir-aarch64-linux:
	meson setup builddir-aarch64-linux -Dlinux-cross=aarch64 --buildtype=release --cross-file aarch64-linux-cross.txt

aarch64-linux: builddir-aarch64-linux
	ninja -C builddir-aarch64-linux

builddir-gem5:
	meson setup builddir-gem5 -Dgem5=true --buildtype=release

gem5: builddir-gem5
	ninja -C builddir-gem5
import subprocess
from matplotlib import pyplot as plt
import numpy as np

# Reproduce Figure 4 of Half&Half


def mispredict_rate(branch_toggle, target_toggle):
    """Run ./my_branch2.sh2 once and return its averaged counter value / 2000.

    The script is called with the arguments "16", "16", branch_toggle and
    target_toggle. In its output the first non-empty line is the header, whose
    sixth field must be "BrMisCond"; the fifth field of every later non-empty
    line is read as an integer sample.
    """
    report = subprocess.check_output(
        ["./my_branch2.sh2", "16", "16", str(branch_toggle), str(target_toggle)],
        encoding="utf-8",
    )
    samples = []
    header_seen = False
    for raw_line in report.splitlines():
        fields = [token for token in raw_line.strip().split(" ") if token]
        if not fields:
            continue
        if header_seen:
            samples.append(int(fields[4]))
        else:
            assert fields[5] == "BrMisCond"
            header_seen = True
    return np.average(np.array(samples)) / 2000  # 2 branches, 1000 loops


x_data = range(0, 16)
y_data = range(0, 6)
z_data = []
for branch_toggle in x_data:
    rates = []
    for target_toggle in y_data:
        avg = mispredict_rate(branch_toggle, target_toggle)
        print(branch_toggle, target_toggle, f"{avg:.2f}")
        rates.append(avg)
    z_data.append(rates)

# Rows are branch toggle bits, columns are target toggle bits.
plt.imshow(z_data)
plt.xlabel("Target toggle bit")
plt.xticks(y_data)
plt.ylabel("Branch toggle bit")
plt.yticks(x_data)
plt.savefig("my_branch2.png")
; pointer_chasing.inc                                     2021-01-30 Agner Fog

; Test cache access time
; (c) 2013 - 2021 by Agner Fog. GNU General Public License www.gnu.org/licenses

; Parameters:
;
; tcase:   addressing mode
;          1: base pointer only
;          2: base + scaled index
;          3: base + scaled index, with the loaded pointer in the index register
;          4: base + scaled index + offset

; define test data
; four qwords that testinit1 links into a circular chain
%macro testdata 0
pointers dq 0,0,0,0
%endmacro

; initialization of pointers
; Each element is made to hold the address of the next one and the last one
; points back to the first, so repeated "mov rdi,[rdi]" walks the ring.
; The index/base registers are set so that every addressing mode in testcode
; resolves to the same address as plain [rdi].
%macro testinit1 0
   lea  rsi, [pointers]
   lea  rdi, [rsi+8]
   mov  [rsi], rdi            ; pointers[0] = &pointers[1]
   add  rdi, 8
   mov  [rsi+8], rdi          ; pointers[1] = &pointers[2]
   add  rdi, 8
   mov  [rsi+16], rdi         ; pointers[2] = &pointers[3]
   add  rdi, 8
   mov  [rsi+24], rsi         ; pointers[3] = &pointers[0], closing the ring
   mov  rdi, rsi              ; start the chase at pointers[0]
   xor  ebp, ebp              ; rbp = 0: index for tcase 2 (rbp*2 = 0)
   xor  r8d, r8d              ; r8  = 0: base for tcase 3
   mov  r9d, 1                ; r9  = 1: index for tcase 4 (r9*4 - 4 = 0)
%endmacro


; main testcode macro
; one dependent load per expansion; the loaded value is the next address
%macro testcode 0
   %if tcase == 1
      mov rdi, [rdi]                ; base pointer only
   %elif tcase == 2
      mov rdi, [rdi + rbp * 2]      ; base + scaled index
   %elif tcase == 3
      mov rdi, [r8 + rdi * 1]       ; base + scaled index. latency through index
   %elif tcase == 4
      mov rdi, [rdi + r9 * 4 - 4]   ; base + scaled index + offset
   %endif
%endmacro

; loop counts defined by this file
; (presumably these replace the template's default repeat1/repeat2 - the
;  template is not visible here, confirm against TemplateB64.nasm)
%define repeat1 1000
%define repeat2 100
x_data = range(0, 16) 7 | y_data = range(0, 6) 8 | z_data = [] 9 | for branch_toggle in x_data: 10 | temp = [] 11 | for target_toggle in y_data: 12 | output = subprocess.check_output( 13 | ["./my_branch4.sh2", "16", "16", str(branch_toggle), str(target_toggle)], 14 | encoding="utf-8", 15 | ) 16 | heading = False 17 | data = [] 18 | for line in output.splitlines(): 19 | parts = list(filter(lambda s: len(s) > 0, line.strip().split(" "))) 20 | if len(parts) > 0: 21 | if not heading: 22 | assert parts[5] == "BrMisCond" 23 | heading = True 24 | else: 25 | data.append(int(parts[4])) 26 | # remove 500 mis-predictions in dummy branches 27 | avg = (np.average(np.array(data)) - 500) / 2000 # 2 branches, 1000 loops 28 | print(branch_toggle, target_toggle, f"{avg:.2f}") 29 | temp.append(avg) 30 | z_data.append(temp) 31 | 32 | plt.imshow(z_data) 33 | plt.xlabel("Target toggle bit") 34 | plt.xticks(y_data) 35 | plt.ylabel("Branch toggle bit") 36 | plt.yticks(x_data) 37 | plt.savefig("my_branch4.png") 38 | -------------------------------------------------------------------------------- /figures/plot_pht_tag_bits_xor_phr.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from matplotlib import pyplot as plt 3 | import numpy as np 4 | import csv 5 | 6 | # Test branch toggles 7 | x_data = [[], []] 8 | y_data = [[], []] 9 | z_data = [[], []] 10 | 11 | with open("pht_tag_bits_xor_phr.csv", newline="") as f: 12 | r = csv.DictReader(f) 13 | for row in r: 14 | target = int(row["target"]) 15 | x_data[target].append(int(row["first_phr_bit"])) 16 | y_data[target].append(int(row["dummy_branches"])) 17 | z_data[target].append(float(row["min"])) 18 | for i in range(2): 19 | z_data[i] = np.array(z_data[i]) 20 | 21 | fig, axes = plt.subplots(2, figsize=(15, 10)) 22 | for i in range(2): 23 | x_data[i] = list(sorted(set(x_data[i]))) 24 | y_data[i] = list(sorted(set(y_data[i]))) 25 | z_data[i].resize(len(x_data[i]) * len(y_data[i])) 26 | 
import subprocess
from matplotlib import pyplot as plt
import numpy as np
import csv

# Reproduce Table 2 of Half&Half
# Test branch toggles


def plot_one(prefix):
    """Draw the heat map for `{prefix}phr_branch_bits_location.csv`.

    Reads the "toggle", "dummy" and "avg" columns, caps "avg" at 0.5, arranges
    the values as (distinct toggles) x (distinct dummies) in file order, and
    saves the figure as both PNG and PDF under the `plot_{prefix}...` name.
    """
    toggles = []
    dummies = []
    rates = []

    with open(f"{prefix}phr_branch_bits_location.csv", newline="") as csv_file:
        for record in csv.DictReader(csv_file):
            toggles.append(int(record["toggle"]))
            dummies.append(int(record["dummy"]))
            rates.append(min(float(record["avg"]), 0.5))

    # Axis labels are the sorted distinct values of each column.
    x_data = sorted(set(toggles))
    y_data = sorted(set(dummies))
    n_rows, n_cols = len(x_data), len(y_data)
    z_data = np.array(rates).reshape((n_rows, n_cols))

    # A quarter of an inch per cell.
    plt.figure(figsize=(n_cols / 4, n_rows / 4))
    plt.imshow(z_data)
    plt.xlabel("Dummy branches")
    plt.xticks(range(n_cols), y_data, rotation=90)
    plt.ylabel("Branch toggle bit")
    plt.yticks(range(n_rows), x_data)
    bar = plt.colorbar(shrink=0.8, ticks=[0.25, 0.50])
    bar.ax.set_ylabel("Misprediction rate", fontsize=8, rotation=270)
    plt.clim(0.25, 0.50)
    plt.grid()

    output_stem = f"plot_{prefix}phr_branch_bits_location"
    plt.savefig(f"{output_stem}.png")
    plt.savefig(f"{output_stem}.pdf", bbox_inches="tight")
    plt.cla()


for prefix in [""]:
    plot_one(prefix)
-------------------------------------------------------------------------------- /agner/testp/TestScripts/returnstack.inc: -------------------------------------------------------------------------------- 1 | ; returnstack.inc 2012-02-22 AgF 2 | 3 | ; Test return stack buffer 4 | ; (c) 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses 5 | 6 | ; Parameters: 7 | ; 8 | ; nesting: function nesting level (minimum value = 2) 9 | 10 | %ifndef nesting 11 | %define nesting 8 12 | %endif 13 | 14 | ; define function named F%1 calling F%2 15 | %macro definefunc 2 16 | F%1: nop 17 | nop 18 | nop 19 | nop 20 | call F%2 21 | nop 22 | nop 23 | nop 24 | nop 25 | ret 26 | nop 27 | nop 28 | %endmacro 29 | 30 | ; main testcode macro 31 | %macro testcode 0 32 | jmp ZZZ 33 | nop 34 | nop 35 | 36 | align 16 37 | ; define nested functions 38 | %assign n nesting-1 39 | %if n < 1 40 | %assign n 1 41 | %endif 42 | 43 | %assign i 0 44 | %rep n 45 | %assign i i+1 46 | %assign j i+1 47 | %if j > n 48 | %assign j 9999 49 | %endif 50 | definefunc i, j 51 | %endrep 52 | 53 | ; Last function 54 | F9999: 55 | nop 56 | nop 57 | ret 58 | 59 | align 16 60 | ZZZ: nop 61 | nop 62 | call F1 63 | nop 64 | nop 65 | call F1 66 | nop 67 | nop 68 | 69 | %endmacro 70 | 71 | ; disable default test loops 72 | %define repeat1 100 73 | %define repeat2 1 74 | 75 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/out_of_order.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # out_of_order.sh2 2014-08-03 Agner Fog 3 | # 4 | # PMC Test program for measuring out-of-order capacity 5 | # 6 | # (c) 2014 GNU General Public License www.gnu.org/licenses 7 | 8 | # Test cases: 9 | # 1: integer add 10 | # 2: integer mul 11 | # 3: floating point add 12 | # 4: floating point mul 13 | 14 | . 
vars.sh 15 | 16 | echo -e "Test out-of-order execution capacity" > results2/out_of_order.txt 17 | 18 | for tcase in {1..4} 19 | do 20 | 21 | for chainlength in 10 100 1000 1000000 22 | do 23 | 24 | if [ $tcase -eq 1 ]; then echo -e "\n\nCase 1: integer add, chainlength = $chainlength" >> results2/out_of_order.txt ; fi 25 | if [ $tcase -eq 2 ]; then echo -e "\n\nCase 2: integer mul, chainlength = $chainlength" >> results2/out_of_order.txt ; fi 26 | if [ $tcase -eq 3 ]; then echo -e "\n\nCase 3: floating point add, chainlength = $chainlength" >> results2/out_of_order.txt ; fi 27 | if [ $tcase -eq 4 ]; then echo -e "\n\nCase 4: floating point mul, chainlength = $chainlength" >> results2/out_of_order.txt ; fi 28 | 29 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dchainlength=$chainlength -Dnthreads=1 -Pout_of_order.inc TemplateB64.nasm 30 | if [ $? -ne 0 ] ; then exit ; fi 31 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 32 | if [ $? -ne 0 ] ; then exit ; fi 33 | ./x >> results2/out_of_order.txt 34 | 35 | done 36 | done 37 | 38 | echo -e "\n" >> results2/out_of_order.txt 39 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/unaligned_mem.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 2014-01-31 AgF 3 | 4 | # Test unaligned memory read and write throughput 5 | # (c) 2014 by Agner Fog. GNU General Public License www.gnu.org/licenses 6 | 7 | # Parameters: 8 | # 9 | # regsize: size of read or write operand, default = 32 10 | # roffset: offset to aligned boundary, default = 0 11 | # alignment: alignment of boundary to cross 12 | # tmode: R: read, W: write, WR: write, then read (store forwarding) 13 | 14 | 15 | . 
vars.sh 16 | 17 | echo -e "\n\nMeasure unaligned read/write throughput\n" > results2/unaligned_mem.txt 18 | 19 | cachelinesize=`./cpugetinfo cachelinesize` 20 | 21 | for tmode in R W WR 22 | do 23 | 24 | for alignment in $(($cachelinesize/2)) $cachelinesize 4096 25 | do 26 | 27 | for regsize in 32 128 28 | do 29 | 30 | for roffset in 0 $(($regsize/8/4)) $(($regsize/8/2)) 31 | do 32 | 33 | 34 | echo -e "\n\ntmode = $tmode, alignment = $alignment, register size = $regsize, offset = -$roffset \n" >> results2/unaligned_mem.txt 35 | $ass -f elf64 -o b64.o -Dtmode=$tmode -Dalignment=$alignment -Droffset=$roffset -Dregsize=$regsize -Punaligned_mem.inc TemplateB64.nasm 36 | if [ $? -ne 0 ] ; then exit ; fi 37 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 38 | if [ $? -ne 0 ] ; then exit ; fi 39 | ./x >> results2/unaligned_mem.txt 40 | 41 | done 42 | done 43 | done 44 | done 45 | 46 | echo -e "\n" >> results2/unaligned_mem.txt 47 | -------------------------------------------------------------------------------- /figures/plot_pht_index_bits_xor_phr.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from matplotlib import pyplot as plt 3 | import numpy as np 4 | import csv 5 | 6 | # collect phr bits 7 | phr_bits = [] 8 | with open("pht_index_bits_xor_phr.csv", newline="") as f: 9 | r = csv.DictReader(f) 10 | for row in r: 11 | phr_bits.append(int(row["phr"])) 12 | 13 | phr_bits = list(sorted(list(set(phr_bits)))) 14 | fig, axes = plt.subplots(len(phr_bits), figsize=(16, len(phr_bits) * 2.5)) 15 | 16 | for i, phr_bit in enumerate(phr_bits): 17 | x_data = [] 18 | y_data = [] 19 | z_data = [] 20 | 21 | with open("pht_index_bits_xor_phr.csv", newline="") as f: 22 | r = csv.DictReader(f) 23 | for row in r: 24 | if int(row["phr"]) == phr_bit: 25 | x_data.append(int(row["branches"])) 26 | y_data.append(int(row["dummy"])) 27 | z_data.append(min(float(row["min"]), 0.5)) 28 | z_data = np.array(z_data) 29 | 30 | x_data 
= list(sorted(set(x_data))) 31 | y_data = list(sorted(set(y_data))) 32 | z_data = z_data.reshape((len(x_data), len(y_data))) 33 | 34 | axes[i].imshow(z_data) 35 | axes[i].set_ylabel("Predict branches") 36 | axes[i].set_yticks(range(len(x_data)), x_data) 37 | axes[i].set_xlabel(f"PHR bit position injecting PHR[{phr_bit}]") 38 | axes[i].set_xticks(range(len(y_data)), y_data, rotation=90) 39 | plt.subplots_adjust(hspace=1.0) 40 | plt.savefig("plot_pht_index_bits_xor_phr.png") 41 | plt.cla() 42 | -------------------------------------------------------------------------------- /figures/plot_phr_target_bits_location.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from matplotlib import pyplot as plt 3 | import numpy as np 4 | import csv 5 | 6 | # Reproduce Table 2 of Half&Half 7 | # Test target toggles 8 | for prefix in [""]: 9 | x_data = [] 10 | y_data = [] 11 | z_data = [] 12 | 13 | with open(f"{prefix}phr_target_bits_location.csv", newline="") as f: 14 | r = csv.DictReader(f) 15 | for row in r: 16 | x_data.append(int(row["toggle"])) 17 | y_data.append(int(row["dummy"])) 18 | z_data.append(float(row["avg"])) 19 | z_data = np.array(z_data) 20 | 21 | x_data = list(sorted(set(x_data))) 22 | y_data = list(sorted(set(y_data))) 23 | z_data = z_data.reshape((len(x_data), len(y_data))) 24 | 25 | plt.figure(figsize=(len(y_data) / 4, len(x_data) / 4)) 26 | plt.imshow(z_data) 27 | plt.xlabel("Dummy branches") 28 | plt.xticks(range(len(y_data)), y_data, rotation=90) 29 | plt.ylabel("Target toggle bit") 30 | plt.yticks(range(len(x_data)), x_data) 31 | bar = plt.colorbar(shrink=0.8) 32 | bar.ax.set_ylabel("Misprediction rate", fontsize=8, rotation=270, labelpad=9.0) 33 | if np.max(z_data) > 0.7: 34 | # missing conditional branch misses 35 | plt.clim(0.5, 1.0) 36 | else: 37 | plt.clim(0, 0.5) 38 | plt.grid() 39 | plt.savefig(f"plot_{prefix}phr_target_bits_location.png") 40 | 
plt.savefig(f"plot_{prefix}phr_target_bits_location.pdf", bbox_inches="tight") 41 | plt.cla() 42 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/latencycf.inc: -------------------------------------------------------------------------------- 1 | ; latencycf.inc 2016-10-23 Agner Fog 2 | ; Define test code for measuring latency of miscellaneous instructions with carry flag output 3 | ; (c) Copyright 2016 by Agner Fog. GNU General Public License www.gnu.org/licenses 4 | 5 | ; Parameters: 6 | ; 7 | ; instruct: Instruction to test 8 | ; 9 | ; regsize: Register size: 8, 16, 32, 64, 128, 256, 512. 10 | ; 11 | ; regtype: Register type: v = vector register 128 bits and bigger, k = mask register. 12 | ; 13 | ; numop: Number of register operands, not including flags register 14 | ; 15 | ; numimm: Number of immediate operands. Default = 0 16 | ; 17 | ; immvalue: Value of first immediate operand. Default = 0 18 | 19 | 20 | ; initialize eax 21 | %macro testinit2 0 22 | xor eax, eax 23 | %endmacro 24 | 25 | ; Define specific test code for each register type 26 | 27 | %ifidni regtype, v 28 | 29 | %macro testcode 0 30 | %if numop == 2 31 | instruct reg1, reg1 32 | setb al 33 | %if regsize == 128 34 | movd xmm1, eax 35 | %else 36 | vmovd xmm1, eax 37 | %endif 38 | %else 39 | %error unsupported number of operands numop 40 | %endif 41 | %endmacro 42 | 43 | %elifidni regtype, k 44 | 45 | %macro testcode 0 46 | %if numop == 2 47 | instruct k1,k1 48 | setb al 49 | kmovw k1,eax 50 | %else 51 | %error unsupported number of operands numop 52 | %endif 53 | %endmacro 54 | 55 | %else 56 | 57 | %error unknown register type regtype 58 | 59 | %endif 60 | 61 | -------------------------------------------------------------------------------- /agner/testp/PMCTest/make_a_obj.bat: -------------------------------------------------------------------------------- 1 | rem make_a_obj.bat 2014-04-16 Agner Fog 2 | 3 | rem compiles PMCTestA.cpp into 
PMCTestA32.obj and PMCTestA64.obj 4 | 5 | rem System requirements: 6 | rem Windows 2000 or NT or later 7 | rem Microsoft Visual C++ compiler or other C++ compiler 8 | 9 | rem You have to change all paths to the actual paths on your computer 10 | 11 | rem Set path to 32 bit compiler 12 | set VSroot=C:\Program Files (x86)\Microsoft Visual Studio 11.0 13 | set SDKroot=C:\Program Files (x86)\Windows Kits\8.0\ 14 | set path1=%path% 15 | set path=%VSroot%\VC\bin;%VSroot%\Common7\IDE;%path1% 16 | 17 | rem Set path to *.h include files. 18 | set include=%VSroot%\VC\include;%SDKroot%\Include\um;%SDKroot%\Include\shared 19 | 20 | rem Set path to *.lib library files. 21 | set LIB="%VSroot%\VC\lib;%SDKroot%\Lib\win8\um\x86" 22 | 23 | rem compile 32 bit object file 24 | cl /c /O2 /FoPMCTestA32.obj PMCTestA.cpp 25 | if errorlevel 1 pause 26 | 27 | rem compile 32bit exe file 28 | rem cl /O2 /MT /Fepmctest.exe PMCTestA32.obj PMCTestB.cpp "%SDKroot%\Lib\win8\um\x86\uuid.lib" "%VSroot%\VC\lib\libcmt.lib" "%VSroot%\VC\lib\oldnames.lib" 29 | 30 | cl /O2 /MT /Fepmctest.exe PMCTestA.cpp PMCTestB.cpp Advapi32.lib /link /LIBPATH:"%SDKroot%\Lib\win8\um\x86" /LIBPATH:"%VSroot%\VC\lib" 31 | if errorlevel 1 pause 32 | 33 | 34 | 35 | 36 | rem Set path to 64 bit compiler 37 | set path=%VSroot%\VC\bin\x86_amd64;%VSroot%\Common7\IDE;%path1% 38 | 39 | rem Set path to *.lib library files. 
40 | set lib="%VSroot%\VC\lib\amd64" 41 | 42 | rem compile 64 bit version 43 | cl /c /O2 /FoPMCTestA64.obj PMCTestA.cpp 44 | if errorlevel 1 pause 45 | 46 | pause 47 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/my_branch.inc: -------------------------------------------------------------------------------- 1 | ; reproduce Figure 3 of Half&Half 2 | 3 | ; alignment bits of branch instruction address 4 | %ifndef branchalign 5 | %define branchalign 18 6 | %endif 7 | 8 | ; alignment bits of branch target address 9 | %ifndef targetalign 10 | %define targetalign 5 11 | %endif 12 | 13 | %macro testinit3 0 14 | mov rdi, 1000 15 | 16 | ; loop 300 times to clear phr 17 | ; since we only consider branch misprediction of the last two branches 18 | ; we do not have to be accurate here e.g. 93/194 19 | loop_begin: 20 | mov eax, 300 21 | align 64 22 | jmp dummy_target 23 | 24 | align 1<<19 25 | %rep (1<<19)-(1<<8) 26 | nop 27 | %endrep 28 | 29 | ; dummy_target aligned to 1<<8 30 | dummy_target: 31 | %rep (1<<8)-7 32 | nop 33 | %endrep 34 | dec eax ; 2 bytes 35 | ; the last byte of jnz aligned to 1<<19 36 | ; jnz dummy_target 37 | db 0x0f 38 | db 0x85 39 | dd dummy_target - $ - 4 40 | 41 | READ_PMC_START 42 | rdrand eax 43 | and eax, 1 44 | 45 | ; READ_PMC_START: 166 46 | ; rdrand eax: 3 bytes 47 | ; and eax, 1: 3 bytes 48 | ; jnz first_target: 6 bytes 49 | 50 | %rep (1< results1/32bitinstr.txt 15 | 16 | if [ $support32bit -eq 0 ] ; then 17 | echo -e "\nError: 32-bit instructions cannot be compiled on this platform\n" >> results1/32bitinstr.txt 18 | exit 19 | fi 20 | 21 | for instruct in aaa aas daa das aad aam_latency aam_throughput bound into lahf sahf lahf_sahf leave pushad popad salc salc_inc_al 22 | do 23 | 24 | echo -e "\n\ninstruction: $instruct " >> results1/32bitinstr.txt 25 | for cts in $PMClist 26 | do 27 | $ass -f elf32 -o b32.o -Dinstruct=$instruct -Drepeat1=100 -Dcounters=$cts -P32bitinstr.inc 
TemplateB32.nasm 28 | if [ $? -ne 0 ] ; then exit ; fi 29 | g++ -fno-pie -no-pie -m32 -fno-pie -no-pie a32.o b32.o -ox -lpthread 30 | if [ $? -ne 0 ] ; then exit ; fi 31 | ./x >> results1/32bitinstr.txt 32 | done 33 | done 34 | 35 | for immvalue in {0..4} 36 | do 37 | echo -e "\n\ninstruction: enter 4, $immvalue " >> results1/32bitinstr.txt 38 | for cts in $PMClist 39 | do 40 | $ass -f elf32 -o b32.o -Dinstruct=enter -Dimmvalue=$immvalue -Dcounters=$cts -P32bitinstr.inc TemplateB32.nasm 41 | if [ $? -ne 0 ] ; then exit ; fi 42 | g++ -fno-pie -no-pie -m32 -fno-pie -no-pie a32.o b32.o -ox -lpthread 43 | if [ $? -ne 0 ] ; then exit ; fi 44 | ./x >> results1/32bitinstr.txt 45 | done 46 | done 47 | 48 | echo -e "\n" >> results1/32bitinstr.txt 49 | 50 | -------------------------------------------------------------------------------- /src/ras_size_lib.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // ref: 8 | // https://zhuanlan.zhihu.com/p/595585895 9 | 10 | // generated in ras_size_gen.cpp 11 | // args: loop count 12 | typedef void (*gadget)(size_t); 13 | extern "C" { 14 | extern gadget ras_size_gadgets[]; 15 | } 16 | 17 | void ras_size(FILE *fp) { 18 | #ifdef GEM5 19 | int loop_count = 10; 20 | #else 21 | int loop_count = 1000; 22 | #endif 23 | // match gen_ras_test 24 | int min_size = 1; 25 | int max_size = 128; 26 | 27 | bind_to_core(); 28 | setup_time_or_cycles(); 29 | fprintf(fp, "size,min,avg,max\n"); 30 | int gadget_index = 0; 31 | for (int size = min_size; size <= max_size; size++) { 32 | std::vector history; 33 | int iterations = 100; 34 | history.reserve(iterations); 35 | 36 | double sum = 0; 37 | // run several times 38 | for (int i = 0; i < iterations; i++) { 39 | uint64_t begin = get_time_or_cycles(); 40 | ras_size_gadgets[gadget_index](loop_count); 41 | uint64_t elapsed = get_time_or_cycles() - begin; 42 | 43 | // skip warmup 44 
| if (i >= 10) { 45 | double time = (double)elapsed / loop_count / size; 46 | history.push_back(time); 47 | sum += time; 48 | } 49 | } 50 | gadget_index++; 51 | 52 | double min = history[0]; 53 | double max = history[0]; 54 | for (size_t i = 0; i < history.size(); i++) { 55 | if (min > history[i]) { 56 | min = history[i]; 57 | } 58 | if (max < history[i]) { 59 | max = history[i]; 60 | } 61 | } 62 | fprintf(fp, "%d,%.2lf,%.2lf,%.2lf\n", size, min, sum / history.size(), max); 63 | fflush(fp); 64 | } 65 | return; 66 | } 67 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/unaligned_mem.inc: -------------------------------------------------------------------------------- 1 | ; unaligned_mem.inc 2014-01-31 AgF 2 | 3 | ; Test unaligned memory read and write throughput 4 | ; (c) 2014 by Agner Fog. GNU General Public License www.gnu.org/licenses 5 | 6 | ; Parameters: 7 | ; 8 | ; regsize: size of read or write operand, default = 32 9 | ; roffset: offset to aligned boundary, default = 0 10 | ; alignment: alignment of boundary to cross 11 | ; tmode: R: read, W: write, WR: write, then read (store forwarding) 12 | 13 | %ifndef regsize 14 | %define regsize 32 ; default operand size, as documented in the parameter list 15 | %endif 16 | 17 | %ifndef roffset 18 | %define roffset 0 19 | %endif 20 | 21 | %ifndef alignment 22 | %define alignment 64 23 | %endif 24 | 25 | %ifndef tmode 26 | %define tmode R 27 | %endif 28 | 29 | ; define move instruction 30 | %if regsize < 65 31 | %define moveinst mov 32 | %elif regsize == 65 33 | %define moveinst movq 34 | %elif regsize == 128 35 | %define moveinst movdqu 36 | %elif regsize == 256 37 | %define moveinst vmovdqu 38 | %else 39 | %error unknown register size 40 | %endif 41 | 42 | 43 | ; initialization of aligned or misaligned pointer 44 | %macro testinit1 0 45 | lea psi, [UserData + 1000h] 46 | and psi, -1000h 47 | %endmacro 48 | 49 | 50 | ; main testcode macro 51 | %macro testcode 0 52 | %ifidni tmode, R 53 | moveinst reg0, 
[psi+alignment-roffset] 54 | %elifidni tmode, W 55 | moveinst [psi+alignment-roffset], reg0 56 | %elifidni tmode, WR 57 | moveinst [psi+alignment-roffset], reg0 58 | moveinst reg0, [psi-roffset] ; NOTE(review): this load address omits +alignment, so it differs from the store address on the previous line - confirm this is intended for the store-forwarding case 59 | %endif 60 | %endmacro 61 | 62 | 63 | %macro testdata 0 64 | times 10000H DB 0 65 | %endmacro 66 | 67 | 68 | ; test loops 69 | %define repeat1 1000 70 | %define repeat2 100 71 | -------------------------------------------------------------------------------- /agner/testp/PMCTest/MSRDriver.h: -------------------------------------------------------------------------------- 1 | // msrdriver.h 2012-03-02 Agner Fog 2 | 3 | // Device driver for access to Model-specific registers and control registers 4 | // in Windows 2000 and later and Linux (32 and 64 bit x86 platform) 5 | 6 | // (c) Copyright 2005-2012 by Agner Fog. GNU General Public License www.gnu.org/licences 7 | 8 | #pragma once 9 | 10 | // list of input/output data structures for MSR driver 11 | #define MAX_QUE_ENTRIES 32 // maximum number of entries in queue 12 | 13 | // commands for MSR driver. Shared with application program 14 | enum EMSR_COMMAND { 15 | MSR_IGNORE = 0, // do nothing 16 | MSR_STOP = 1, // skip rest of list 17 | MSR_READ = 2, // read model specific register 18 | MSR_WRITE = 3, // write model specific register 19 | CR_READ = 4, // read control register 20 | CR_WRITE = 5, // write control register 21 | PMC_ENABLE = 6, // Enable RDPMC and RDTSC instructions 22 | PMC_DISABLE= 7, // Disable RDPMC instruction (RDTSC remains enabled) 23 | PROC_GET = 8, // Get processor number (In multiprocessor systems. 0-based) 24 | PROC_SET = 9, // Set processor number (In multiprocessor systems. 
0-based) 25 | UNUSED1 = 0x7fffffff // make sure this enum takes 32 bits 26 | }; 27 | 28 | 29 | // input/output data structure for MSR driver 30 | struct SMSRInOut { 31 | enum EMSR_COMMAND msr_command; // command for read or write register 32 | unsigned int register_number; // register number 33 | union { 34 | long long value; // 64 bit value to read or write 35 | unsigned int val[2]; // lower and upper 32 bits 36 | }; 37 | }; 38 | -------------------------------------------------------------------------------- /agner/testp/DriverSrcLinux/MSRDriver.h: -------------------------------------------------------------------------------- 1 | // msrdriver.h 2012-03-02 Agner Fog 2 | 3 | // Device driver for access to Model-specific registers and control registers 4 | // in Windows 2000 and later and Linux (32 and 64 bit x86 platform) 5 | 6 | // (c) Copyright 2005-2012 by Agner Fog. GNU General Public License www.gnu.org/licences 7 | 8 | #pragma once 9 | 10 | // list of input/output data structures for MSR driver 11 | #define MAX_QUE_ENTRIES 32 // maximum number of entries in queue 12 | 13 | // commands for MSR driver. Shared with application program 14 | enum EMSR_COMMAND { 15 | MSR_IGNORE = 0, // do nothing 16 | MSR_STOP = 1, // skip rest of list 17 | MSR_READ = 2, // read model specific register 18 | MSR_WRITE = 3, // write model specific register 19 | CR_READ = 4, // read control register 20 | CR_WRITE = 5, // write control register 21 | PMC_ENABLE = 6, // Enable RDPMC and RDTSC instructions 22 | PMC_DISABLE= 7, // Disable RDPMC instruction (RDTSC remains enabled) 23 | PROC_GET = 8, // Get processor number (In multiprocessor systems. 0-based) 24 | PROC_SET = 9, // Set processor number (In multiprocessor systems. 
0-based) 25 | UNUSED1 = 0x7fffffff // make sure this enum takes 32 bits 26 | }; 27 | 28 | 29 | // input/output data structure for MSR driver 30 | struct SMSRInOut { 31 | enum EMSR_COMMAND msr_command; // command for read or write register 32 | unsigned int register_number; // register number 33 | union { 34 | long long value; // 64 bit value to read or write 35 | unsigned int val[2]; // lower and upper 32 bits 36 | }; 37 | }; 38 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/MSRDriver.h: -------------------------------------------------------------------------------- 1 | // msrdriver.h 2012-03-02 Agner Fog 2 | 3 | // Device driver for access to Model-specific registers and control registers 4 | // in Windows 2000 and later and Linux (32 and 64 bit x86 platform) 5 | 6 | // (c) Copyright 2005-2012 by Agner Fog. GNU General Public License www.gnu.org/licences 7 | 8 | #pragma once 9 | 10 | // list of input/output data structures for MSR driver 11 | #define MAX_QUE_ENTRIES 32 // maximum number of entries in queue 12 | 13 | // commands for MSR driver. Shared with application program 14 | enum EMSR_COMMAND { 15 | MSR_IGNORE = 0, // do nothing 16 | MSR_STOP = 1, // skip rest of list 17 | MSR_READ = 2, // read model specific register 18 | MSR_WRITE = 3, // write model specific register 19 | CR_READ = 4, // read control register 20 | CR_WRITE = 5, // write control register 21 | PMC_ENABLE = 6, // Enable RDPMC and RDTSC instructions 22 | PMC_DISABLE= 7, // Disable RDPMC instruction (RDTSC remains enabled) 23 | PROC_GET = 8, // Get processor number (In multiprocessor systems. 0-based) 24 | PROC_SET = 9, // Set processor number (In multiprocessor systems. 
0-based) 25 | UNUSED1 = 0x7fffffff // make sure this enum takes 32 bits 26 | }; 27 | 28 | 29 | // input/output data structure for MSR driver 30 | struct SMSRInOut { 31 | enum EMSR_COMMAND msr_command; // command for read or write register 32 | unsigned int register_number; // register number 33 | union { 34 | long long value; // 64 bit value to read or write 35 | unsigned int val[2]; // lower and upper 32 bits 36 | }; 37 | }; 38 | -------------------------------------------------------------------------------- /results/rob_size/README.md: -------------------------------------------------------------------------------- 1 | # Measured ROB Size 2 | 3 | - AMD Zen 1: 192 4 | - AMD Zen 2: 224 5 | - Intel Ivy Bridge EP: 168 6 | - Intel Broadwell: 192 7 | - Intel Cascade Lake: 224 8 | - ARM Cortex A72: 40 9 | - ARM Neoverse N1: 128 10 | - IBM Power 8: ? 11 | 12 | References: 13 | 14 | https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(client) 15 | - Intel Broadwell: 192 16 | - Intel Skylake(client): 224 17 | 18 | https://en.wikichip.org/wiki/intel/microarchitectures/cascade_lake 19 | - Intel Cascade Lake: 224 20 | 21 | https://www.anandtech.com/show/14514/examining-intels-ice-lake-microarchitecture-and-sunny-cove/3 22 | - Intel Haswell: 182 23 | - Intel Skylake: 224 24 | - Intel Sunny Cove: 352 25 | 26 | https://en.wikichip.org/wiki/intel/microarchitectures/broadwell_(client) 27 | - Intel Broadwell: 192 28 | 29 | https://en.wikichip.org/wiki/intel/microarchitectures/haswell_(client) 30 | - Intel Haswell: 192 31 | 32 | https://en.wikichip.org/wiki/amd/microarchitectures/zen_2 33 | - AMD Zen 1: 192 34 | - AMD Zen 2: 224 35 | 36 | https://www.anandtech.com/show/16226/apple-silicon-m1-a14-deep-dive/2 37 | - Apple M1: ~630 38 | 39 | https://www.anandtech.com/show/10435/assessing-ibms-power8-part-1/3 40 | - IBM Power 8: 224 41 | - Intel Broadwell: 192 42 | - Intel Skylake: 224 43 | 44 | https://en.wikichip.org/wiki/arm_holdings/microarchitectures/neoverse_n1 45 | - 
ARM Neoverse N1: 128 46 | 47 | https://www.tomshardware.com/reviews/arm-cortex-a72-architecture,4424.html 48 | - ARM Cortex A72: 128 49 | 50 | https://travisdowns.github.io/blog/2019/06/11/speed-limits.html#ooo-table 51 | - Intel Sandy Bridge: 168 52 | - Intel Ivy Bridge: 168 53 | - Intel Haswell: 192 54 | - Intel Broadwell: 192 55 | - Intel Skylake: 224 56 | - Intel Sunny Cove: 352 57 | - AMD Zen: 192 58 | - AMD Zen 2: 224 59 | - AMD Zen 3: 256 60 | - Apple M1 Firestorm: 636 61 | - Apple M1 Icestorm: 111 62 | - Amazon Graviton 2: ~124 -------------------------------------------------------------------------------- /agner/testp/TestScripts/my_branch.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from matplotlib import pyplot as plt 3 | import numpy as np 4 | 5 | # Reproduce Figure 3(a) of Half&Half 6 | x_data = range(1, 20) 7 | y_data = [] 8 | for branch_align in x_data: 9 | output = subprocess.check_output( 10 | ["./my_branch.sh2", str(branch_align), "6"], encoding="utf-8" 11 | ) 12 | heading = False 13 | data = [] 14 | for line in output.splitlines(): 15 | parts = list(filter(lambda s: len(s) > 0, line.strip().split(" "))) 16 | if len(parts) > 0: 17 | if not heading: 18 | assert parts[5] == "BrMisCond" 19 | heading = True 20 | else: 21 | data.append(int(parts[4])) 22 | avg = np.average(np.array(data)) / 2000 # 2 branches, 1000 loops 23 | print(branch_align, 6, f"{avg:.2f}") 24 | y_data.append(avg) 25 | 26 | plt.plot(x_data, y_data) 27 | plt.xlabel("Branch alignment bits") 28 | plt.ylabel("Miss Rate") 29 | plt.yticks([0.25, 0.50]) 30 | plt.savefig("my_branch_1.png") 31 | plt.cla() 32 | 33 | # Reproduce Figure 3(b) of Half&Half 34 | x_data = range(1, 19) 35 | y_data = [] 36 | for target_align in x_data: 37 | output = subprocess.check_output( 38 | ["./my_branch.sh2", "16", str(target_align)], encoding="utf-8" 39 | ) 40 | heading = False 41 | data = [] 42 | for line in output.splitlines(): 43 | parts = 
list(filter(lambda s: len(s) > 0, line.strip().split(" "))) 44 | if len(parts) > 0: 45 | if not heading: 46 | assert parts[5] == "BrMisCond" 47 | heading = True 48 | else: 49 | data.append(int(parts[4])) 50 | avg = np.average(np.array(data)) / 2000 # 2 branches, 1000 loops 51 | print(15, target_align, f"{avg:.2f}") 52 | y_data.append(avg) 53 | 54 | plt.plot(x_data, y_data) 55 | plt.xlabel("Target alignment bits") 56 | plt.ylabel("Miss Rate") 57 | plt.yticks([0.25, 0.50]) 58 | plt.savefig("my_branch_2.png") 59 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/jmp.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 2013-07-19 Agner Fog 3 | 4 | #Test jump instructions performance 5 | 6 | # (c) Copyright 2013 by Agner Fog. GNU General Public License www.gnu.org/licenses 7 | 8 | # Detect CPU specific variables 9 | . vars.sh 10 | 11 | # Performance counters 12 | if [ "$CPUbrand" = "Intel" -a $imodel -ne 28 ] ; then 13 | # This one is for Intel processors with uop cache: 14 | cts="1,9,100,25,26,207" 15 | else 16 | cts=$BranchPMCs 17 | fi 18 | 19 | 20 | echo -e "Test jump instructions" > results2/jmp.txt 21 | 22 | for jmp_per_16b in 1 2 3 4 5 6 8 23 | do 24 | echo -e "\n\njmp ($jmp_per_16b per 16 bytes), 64 bit" >> results2/jmp.txt 25 | $ass -f elf64 -o b64.o -Dinstruct=jmp -Drepeat1=1000 -Dregsize=64 -Djmp_per_16b=$jmp_per_16b -Dcounters=$cts -Pmisc_int.inc TemplateB64.nasm 26 | if [ $? -ne 0 ] ; then exit ; fi 27 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 28 | if [ $? -ne 0 ] ; then exit ; fi 29 | ./x >> results2/jmp.txt 30 | done 31 | 32 | for jmptaken in yes no 33 | do 34 | echo -e "\n\nconditional jump, taken=$jmptaken, 2 per 16 bytes, 64 bit" >> results2/jmp.txt 35 | $ass -f elf64 -o b64.o -Dinstruct=conditional_jmp -Drepeat1=1000 -Dregsize=64 -Djmp_per_16b=2 -Djmptaken=$jmptaken -Dcounters=$cts -Pmisc_int.inc TemplateB64.nasm 36 | if [ $? 
-ne 0 ] ; then exit ; fi 37 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 38 | if [ $? -ne 0 ] ; then exit ; fi 39 | ./x >> results2/jmp.txt 40 | done 41 | 42 | for jmp_per_16b in 1 2 3 4 5 6 8 43 | do 44 | echo -e "\n\nconditional jump, taken=alternate, $jmp_per_16b per 16 bytes, 64 bit" >> results2/jmp.txt 45 | $ass -f elf64 -o b64.o -Dinstruct=conditional_jmp -Drepeat1=1000 -Dregsize=64 -Djmp_per_16b=$jmp_per_16b -Djmptaken=alternate -Drepeat0=10 -Dcounters=$cts -Pmisc_int.inc TemplateB64.nasm 46 | if [ $? -ne 0 ] ; then exit ; fi 47 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 48 | if [ $? -ne 0 ] ; then exit ; fi 49 | ./x >> results2/jmp.txt 50 | done 51 | 52 | echo -e "\n" >> results2/jmp.txt 53 | 54 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/my_branch2.inc: -------------------------------------------------------------------------------- 1 | ; reproduce Figure 4 of Half&Half 2 | 3 | ; alignment bits of branch instruction address 4 | %ifndef branchalign 5 | %define branchalign 18 6 | %endif 7 | 8 | ; alignment bits of branch target address 9 | %ifndef targetalign 10 | %define targetalign 5 11 | %endif 12 | 13 | ; toggle bit of branch address 14 | %ifndef branchtoggle 15 | %define branchtoggle 0 16 | %endif 17 | 18 | ; toggle bit of target address 19 | %ifndef targettoggle 20 | %define targettoggle 0 21 | %endif 22 | 23 | %macro testinit3 0 24 | mov rdi, 1000 25 | 26 | ; loop 300 times to clear phr 27 | ; since we only consider branch misprediction of the last two branches 28 | ; we do not have to be accurate here e.g. 
93/194 29 | loop_begin: 30 | mov eax, 300 31 | align 64 32 | jmp dummy_target 33 | 34 | align 1<<19 35 | %rep (1<<19)-(1<<8) 36 | nop 37 | %endrep 38 | 39 | ; dummy_target aligned to 1<<8 40 | dummy_target: 41 | %rep (1<<8)-7 42 | nop 43 | %endrep 44 | dec eax ; 2 bytes 45 | ; the last byte of jnz aligned to 1<<19 46 | ; jnz dummy_target 47 | db 0x0f 48 | db 0x85 49 | dd dummy_target - $ - 4 50 | 51 | READ_PMC_START 52 | rdrand eax 53 | and eax, 1 54 | 55 | ; READ_PMC_START: 166 56 | ; rdrand eax: 3 bytes 57 | ; and eax, 1: 3 bytes 58 | ; jnz first_target: 6 bytes 59 | 60 | %rep (1< 3 | #include 4 | #include 5 | #include 6 | 7 | // generated in ghr_size_gen.cpp 8 | // args: loop count, buffer 9 | typedef void (*gadget)(size_t, uint32_t *); 10 | extern "C" { 11 | extern gadget ghr_size_gadgets[]; 12 | } 13 | 14 | void ghr_size(FILE *fp) { 15 | int loop_count = 1000; 16 | // match gen_ghr_test 17 | int repeat = 2; 18 | int min_size = 1; 19 | int max_size = 1024; 20 | 21 | bind_to_core(); 22 | #ifdef IOS 23 | // no pmu 24 | #else 25 | setup_perf_branch_misses(); 26 | #endif 27 | assert(fp); 28 | 29 | uint32_t *buffer = new uint32_t[loop_count + 1]; 30 | 31 | fprintf(fp, "size,min,avg,max\n"); 32 | int gadget_index = 0; 33 | for (int size = min_size; size <= max_size; size++) { 34 | std::vector history; 35 | int iterations = 100; 36 | history.reserve(iterations); 37 | 38 | double sum = 0; 39 | // run several times 40 | for (int i = 0; i < iterations; i++) { 41 | for (int i = 0; i <= loop_count; i++) { 42 | buffer[i] = rand() % 2; 43 | } 44 | #ifdef IOS 45 | // fallback 46 | uint64_t begin = get_time(); 47 | #else 48 | uint64_t begin = perf_read_branch_misses(); 49 | #endif 50 | 51 | ghr_size_gadgets[gadget_index](loop_count, buffer); 52 | 53 | #ifdef IOS 54 | // fallback 55 | uint64_t elapsed = get_time() - begin; 56 | #else 57 | uint64_t elapsed = perf_read_branch_misses() - begin; 58 | #endif 59 | 60 | // skip warmup 61 | if (i >= 10) { 62 | double time = 
(double)elapsed / loop_count / repeat; 63 | history.push_back(time); 64 | sum += time; 65 | } 66 | } 67 | gadget_index++; 68 | 69 | double min = history[0]; 70 | double max = history[0]; 71 | for (size_t i = 0; i < history.size(); i++) { 72 | if (min > history[i]) { 73 | min = history[i]; 74 | } 75 | if (max < history[i]) { 76 | max = history[i]; 77 | } 78 | } 79 | fprintf(fp, "%d,%.2lf,%.2lf,%.2lf\n", size, min, sum / history.size(), max); 80 | fflush(fp); 81 | } 82 | delete[] buffer; 83 | } 84 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/my_branch4.inc: -------------------------------------------------------------------------------- 1 | ; reproduce Figure 4 of Half&Half 2 | ; additionally shift PHR by 16 bits to avoid tag collision 3 | 4 | ; alignment bits of branch instruction address 5 | %ifndef branchalign 6 | %define branchalign 18 7 | %endif 8 | 9 | ; alignment bits of branch target address 10 | %ifndef targetalign 11 | %define targetalign 5 12 | %endif 13 | 14 | ; toggle bit of branch address 15 | %ifndef branchtoggle 16 | %define branchtoggle 0 17 | %endif 18 | 19 | ; toggle bit of target address 20 | %ifndef targettoggle 21 | %define targettoggle 0 22 | %endif 23 | 24 | %macro SHIFT_PHR 1 25 | mov eax, %1+1 26 | 27 | align 1<<16 28 | %rep (1<<16)-(1<<6) 29 | nop 30 | %endrep 31 | 32 | ; dummy_target aligned to 1<<6 33 | %%shift_phr_dummy_target: 34 | %rep (1<<6)-7 35 | nop 36 | %endrep 37 | dec eax ; 2 bytes 38 | ; the last byte of jnz aligned to 1<<16 39 | ; jnz shift_phr_dummy_target 40 | db 0x0f 41 | db 0x85 42 | dd %%shift_phr_dummy_target - $ - 4 43 | %endmacro 44 | 45 | %macro testinit3 0 46 | mov rdi, 1000 47 | 48 | loop_begin: 49 | ; loop 300 times to clear phr 50 | SHIFT_PHR 300 51 | 52 | READ_PMC_START 53 | rdrand eax 54 | and eax, 1 55 | 56 | ; READ_PMC_START: 166 57 | ; rdrand eax: 3 bytes 58 | ; and eax, 1: 3 bytes 59 | ; jnz first_target: 6 bytes 60 | 61 | %rep (1< 3 | #include 4 
| #include 5 | #include 6 | 7 | // generated in rob_size_gen.cpp 8 | // args: buffer1, buffer2, loop count 9 | typedef void (*gadget)(char ***, char ***, size_t); 10 | extern "C" { 11 | extern gadget rob_size_gadgets[]; 12 | } 13 | 14 | int main(int argc, char *argv[]) { 15 | #ifdef GEM5 16 | int loop_count = 10; 17 | #else 18 | int loop_count = 1000; 19 | #endif 20 | // match gen_rob_test 21 | int repeat = 20; 22 | int min_size = 1; 23 | int max_size = 1024; 24 | 25 | bind_to_core(); 26 | setup_time_or_cycles(); 27 | FILE *fp = fopen("rob_size.csv", "w"); 28 | assert(fp); 29 | 30 | #ifdef GEM5 31 | size_t buffer_size = 1024 * 1024 * 16; // 16 MB 32 | #else 33 | size_t buffer_size = 1024 * 1024 * 256; // 256 MB 34 | #endif 35 | char **buffer1 = generate_random_pointer_chasing(buffer_size); 36 | char **p1 = buffer1; 37 | char **buffer2 = generate_random_pointer_chasing(buffer_size); 38 | char **p2 = buffer2; 39 | fprintf(fp, "size,min,avg,max\n"); 40 | for (int size = min_size; size <= max_size; size++) { 41 | std::vector history; 42 | int iterations = 100; 43 | history.reserve(iterations); 44 | 45 | double sum = 0; 46 | // run several times 47 | for (int i = 0; i < iterations; i++) { 48 | uint64_t begin = get_time_or_cycles(); 49 | rob_size_gadgets[size - min_size](&p1, &p2, loop_count); 50 | uint64_t elapsed = get_time_or_cycles() - begin; 51 | 52 | // skip warmup 53 | if (i >= 10) { 54 | double time = (double)elapsed / loop_count / repeat; 55 | history.push_back(time); 56 | sum += time; 57 | } 58 | } 59 | 60 | double min = history[0]; 61 | double max = history[0]; 62 | for (size_t i = 0; i < history.size(); i++) { 63 | if (min > history[i]) { 64 | min = history[i]; 65 | } 66 | if (max < history[i]) { 67 | max = history[i]; 68 | } 69 | } 70 | fprintf(fp, "%d,%.2lf,%.2lf,%.2lf\n", size, min, sum / history.size(), max); 71 | fflush(fp); 72 | } 73 | 74 | printf("Results are written to rob_size.csv\n"); 75 | delete[] buffer1; 76 | delete[] buffer2; 77 | return 0; 78 | 
} 79 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/my_branch5.inc: -------------------------------------------------------------------------------- 1 | ; reproduce Figure 5 of Half&Half 2 | 3 | ; alignment bits of branch instruction address 4 | %ifndef branchalign 5 | %define branchalign 18 6 | %endif 7 | 8 | %macro SHIFT_PHR 1 9 | mov eax, %1+1 10 | 11 | align 1<<16 12 | %rep (1<<16)-(1<<6) 13 | nop 14 | %endrep 15 | 16 | ; dummy_target aligned to 1<<6 17 | %%shift_phr_dummy_target: 18 | %rep (1<<6)-7 19 | nop 20 | %endrep 21 | dec eax ; 2 bytes 22 | ; the last byte of jnz aligned to 1<<16 23 | ; jnz shift_phr_dummy_target 24 | db 0x0f 25 | db 0x85 26 | dd %%shift_phr_dummy_target - $ - 4 27 | %endmacro 28 | 29 | %macro testinit3 0 30 | mov rdi, 10000 31 | 32 | loop_begin: 33 | ; k = rand() % 2 34 | rdrand ebx 35 | and ebx, 1 36 | 37 | ; set phr to k0...0 38 | ; set lower two bits to 0bk0 39 | align 1<<5 40 | %rep (1<<4)-1 41 | nop 42 | %endrep 43 | ; B3=0, B4=1 44 | jnz zero_target 45 | 46 | align 1<<5 47 | %rep (1<<5)-1 48 | nop 49 | %endrep 50 | ; B3=0, B4=0 51 | jz zero_target 52 | 53 | ; T0=T1=0 54 | align 1<<6 55 | zero_target: 56 | 57 | ; shift phr by 193 times 58 | SHIFT_PHR 193 59 | 60 | ; first test branch 61 | test ebx, ebx 62 | align 1<<(branchalign+1) 63 | %rep (1< results2/ucache_misprediction.txt 21 | 22 | repeat0=20 23 | 24 | echo -e "\n\nCase 1: Tiny loop. Expect loop counter to be used" >> results2/ucache_misprediction.txt 25 | tcase=1 26 | 27 | for count1 in 10 100 1000 28 | do 29 | echo -e "\n\nLoop count $count1" >> results2/ucache_misprediction.txt 30 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dcount1=$count1 -Dcounters=$cts -Drepeat0=$repeat0 -Pucache_misprediction.inc TemplateB64.nasm 31 | if [ $? -ne 0 ] ; then exit ; fi 32 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 33 | if [ $? 
-ne 0 ] ; then exit ; fi 34 | ./x >> results2/ucache_misprediction.txt 35 | done 36 | 37 | echo -e "\n\nCase 2: Normal loop. Expect uop cache to be used" >> results2/ucache_misprediction.txt 38 | tcase=2 39 | 40 | for count1 in 4 10 100 41 | do 42 | echo -e "\n\nNumber of branches $count1" >> results2/ucache_misprediction.txt 43 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dcount1=$count1 -Dcounters=$cts -Drepeat0=$repeat0 -Pucache_misprediction.inc TemplateB64.nasm 44 | if [ $? -ne 0 ] ; then exit ; fi 45 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 46 | if [ $? -ne 0 ] ; then exit ; fi 47 | ./x >> results2/ucache_misprediction.txt 48 | done 49 | 50 | echo -e "\n\nCase 3: Extremely big loop. Expect only fetch and decode to be used" >> results2/ucache_misprediction.txt 51 | tcase=3 52 | 53 | for count1 in 4 10 100 54 | do 55 | echo -e "\n\nNumber of branches $count1" >> results2/ucache_misprediction.txt 56 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dcount1=$count1 -Dcounters=$cts -Drepeat0=$repeat0 -Pucache_misprediction.inc TemplateB64.nasm 57 | if [ $? -ne 0 ] ; then exit ; fi 58 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 59 | if [ $? 
-ne 0 ] ; then exit ; fi 60 | ./x >> results2/ucache_misprediction.txt 61 | done 62 | 63 | 64 | 65 | echo -e "\n" >> results2/ucache_misprediction.txt 66 | -------------------------------------------------------------------------------- /src/pht_associativity.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // defined in gen_pht_associativity_test() 8 | // args: loop count, buffer 9 | typedef void (*gadget)(size_t, uint32_t *); 10 | extern "C" { 11 | extern gadget pht_associativity_gadgets[]; 12 | } 13 | 14 | int main(int argc, char *argv[]) { 15 | int loop_count = 1000; 16 | // match gen_pht_associativity_test 17 | int min_branches = 1; 18 | int max_branches = 32; 19 | int min_branch_align = 3; 20 | #ifdef __APPLE__ 21 | // alignment cannot surpass page size 22 | int max_branch_align = 8; 23 | #else 24 | int max_branch_align = 19; 25 | #endif 26 | 27 | bind_to_core(); 28 | setup_perf_cond_branch_misses(); 29 | FILE *fp = fopen("pht_associativity.csv", "w"); 30 | assert(fp); 31 | 32 | uint32_t *buffer = new uint32_t[loop_count + 1]; 33 | for (int i = 0; i <= loop_count; i++) { 34 | buffer[i] = rand() % 2; 35 | } 36 | 37 | fprintf(fp, "branches,align,min,avg,max\n"); 38 | int gadget_index = 0; 39 | for (int branches = min_branches; branches <= max_branches; branches++) { 40 | for (int branch_align = min_branch_align; branch_align <= max_branch_align; 41 | branch_align++) { 42 | std::vector history; 43 | int iterations = 100; 44 | history.reserve(iterations); 45 | 46 | double sum = 0; 47 | // run several times 48 | for (int i = 0; i < iterations; i++) { 49 | uint64_t begin = perf_read_cond_branch_misses(); 50 | pht_associativity_gadgets[gadget_index](loop_count, buffer); 51 | uint64_t elapsed = perf_read_cond_branch_misses() - begin; 52 | 53 | // skip warmup 54 | if (i >= 10) { 55 | double time = (double)elapsed / loop_count; 56 | 
history.push_back(time); 57 | sum += time; 58 | } 59 | } 60 | gadget_index++; 61 | 62 | double min = history[0]; 63 | double max = history[0]; 64 | for (size_t i = 0; i < history.size(); i++) { 65 | if (min > history[i]) { 66 | min = history[i]; 67 | } 68 | if (max < history[i]) { 69 | max = history[i]; 70 | } 71 | } 72 | fprintf(fp, "%d,%d,%.2lf,%.2lf,%.2lf\n", branches, branch_align, min, 73 | sum / history.size(), max); 74 | fflush(fp); 75 | } 76 | } 77 | 78 | printf("Results are written to pht_associativity.csv\n"); 79 | delete[] buffer; 80 | return 0; 81 | } 82 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/out_of_order.inc: -------------------------------------------------------------------------------- 1 | ;---------------------------------------------------------------------------- 2 | ; out_of_order.inc 2014-08-03 Agner Fog 3 | ; 4 | ; PMC Test program for measuring out-of-order capacity 5 | ; 6 | ; (c) 2014 GNU General Public License www.gnu.org/licenses 7 | ; 8 | ; Test cases: 9 | ; 1: integer add 10 | ; 2: integer mul 11 | ; 3: floating point add 12 | ; 4: floating point mul 13 | ;----------------------------------------------------------------------------- 14 | ; Define any undefined macros 15 | 16 | ; total number of instructions 17 | %ifndef totalrep 18 | %define totalrep 1000000 19 | %endif 20 | 21 | %if chainlength < totalrep 22 | %define repeat1 (totalrep / chainlength / 2) 23 | %define clength chainlength 24 | %else 25 | %define repeat1 (totalrep / 100) 26 | %define clength 100 27 | %endif 28 | 29 | 30 | ; Define test cases 31 | 32 | %if tcase == 1 ; integer add 33 | 34 | %macro chain1 0 35 | %rep clength 36 | add eax,eax 37 | %endrep 38 | %endmacro 39 | 40 | %macro chain2 0 41 | %rep clength 42 | add ebx,ebx 43 | %endrep 44 | %endmacro 45 | 46 | %elif tcase == 2 ; integer mul 47 | 48 | %macro chain1 0 49 | %rep clength 50 | imul eax,eax 51 | %endrep 52 | %endmacro 53 | 54 | %macro 
chain2 0 55 | %rep clength 56 | imul ebx,ebx 57 | %endrep 58 | %endmacro 59 | 60 | %elif tcase == 3 ; floating point add 61 | 62 | %macro chain1 0 63 | %rep clength 64 | addps xmm1,xmm1 65 | %endrep 66 | %endmacro 67 | 68 | %macro chain2 0 69 | %rep clength 70 | addps xmm2,xmm2 71 | %endrep 72 | %endmacro 73 | 74 | %elif tcase == 4 ; floating point mul 75 | 76 | %macro chain1 0 77 | %rep clength 78 | mulps xmm1,xmm1 79 | %endrep 80 | %endmacro 81 | 82 | %macro chain2 0 83 | %rep clength 84 | mulps xmm2,xmm2 85 | %endrep 86 | %endmacro 87 | 88 | %else 89 | %error unknown test case tcase 90 | %endif 91 | 92 | 93 | %macro testcode 0 94 | %if chainlength < totalrep 95 | chain1 96 | chain2 97 | %else 98 | chain1 99 | %endif 100 | %endmacro 101 | 102 | 103 | ; disable default test loops 104 | %define repeat2 1 105 | 106 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/read_write_bandwidth.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 2021-03-23 Agner Fog 3 | 4 | # Test the maximum number of memory reads and writes per clock cycle 5 | 6 | # (c) 2013-2021 by Agner Fog. GNU General Public License www.gnu.org/licenses 7 | 8 | . 
vars.sh 9 | 10 | pmclist1="$PMClist" 11 | 12 | echo -e "\nTest the maximum number of memory reads and writes per clock cycle" > results2/read_write_bandwidth.txt 13 | echo -e "Test modes:" >> results2/read_write_bandwidth.txt 14 | echo -e "R: read only" >> results2/read_write_bandwidth.txt 15 | echo -e "W: write only" >> results2/read_write_bandwidth.txt 16 | echo -e "RW: one read and one write" >> results2/read_write_bandwidth.txt 17 | echo -e "RRW: two reads and one write" >> results2/read_write_bandwidth.txt 18 | echo -e "RRRW: three reads and one write" >> results2/read_write_bandwidth.txt 19 | echo -e "RWW: one read and two writes" >> results2/read_write_bandwidth.txt 20 | echo -e "RWW2: one read and two writes to different cache lines" >> results2/read_write_bandwidth.txt 21 | 22 | for tmode in R W RW RRW RRRW RWW RWW2 23 | do 24 | 25 | # The modes are: 26 | # R: read only 27 | # W: write only 28 | # RW: one read and one write 29 | # RRW: two reads and one write 30 | # RRRW: three reads and one write 31 | # RWW: one read and two writes 32 | 33 | echo -e "\n\n===========================================\n" >> results2/read_write_bandwidth.txt 34 | echo -e "test mode = $tmode\n" >> results2/read_write_bandwidth.txt 35 | echo -e "===========================================\n\n" >> results2/read_write_bandwidth.txt 36 | 37 | # Check if AVX supported 38 | if [ `grep -c -i "avx" cpuinfo.txt ` -ne 0 ] ; then 39 | reg256=256 40 | else 41 | reg256= 42 | fi 43 | 44 | # Check if AVX512 supported 45 | if [ `grep -c -i "avx512" cpuinfo.txt ` -ne 0 ] ; then 46 | reg512=512 47 | else 48 | reg512= 49 | fi 50 | 51 | 52 | for regsize in 8 16 32 64 128 $reg256 $reg512 53 | do 54 | 55 | echo -e "\n\nRegister size = $regsize bits" >> results2/read_write_bandwidth.txt 56 | 57 | IFS=" " 58 | for pmc in $pmclist1 ; do 59 | 60 | $ass -f elf64 -o b64.o -Dtmode=$tmode -Dregsize=$regsize -Dcounters=$pmc -Pread_write_bandwidth.inc TemplateB64.nasm 61 | if [ $? 
-ne 0 ] ; then exit ; fi 62 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 63 | if [ $? -ne 0 ] ; then exit ; fi 64 | ./x >> results2/read_write_bandwidth.txt 65 | 66 | done 67 | IFS="," 68 | 69 | done 70 | done 71 | 72 | echo -e "\n" >> results2/read_write_bandwidth.txt 73 | -------------------------------------------------------------------------------- /src/ras_size_gen.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | 3 | // https://github.com/ChipsandCheese/Microbenchmarks/blob/master/AsmGen/tests/ReturnStackTest.cs 4 | int main(int argc, char *argv[]) { 5 | FILE *fp = fopen(argv[1], "w"); 6 | assert(fp); 7 | int min_size = 1; 8 | int max_size = 128; 9 | 10 | // args: loop count 11 | fprintf(fp, ".text\n"); 12 | for (int size = min_size; size <= max_size; size++) { 13 | // entry 14 | fprintf(fp, ".global ras_size_%d\n", size); 15 | fprintf(fp, ".balign 64\n"); 16 | fprintf(fp, "ras_size_%d:\n", size); 17 | #ifdef HOST_AARCH64 18 | // save lr 19 | fprintf(fp, "\tsub sp, sp, #0x20\n"); 20 | fprintf(fp, "\tstp x29, x30, [sp, #0x10]\n"); 21 | 22 | fprintf(fp, "\t1:\n"); 23 | // call function 24 | fprintf(fp, "\tbl ras_func_%d\n", size - 1); 25 | fprintf(fp, "\tsubs x0, x0, #1\n"); 26 | fprintf(fp, "\tbne 1b\n"); 27 | 28 | // restore lr 29 | fprintf(fp, "\tldp x29, x30, [sp, #0x10]\n"); 30 | fprintf(fp, "\tadd sp, sp, #0x20\n"); 31 | fprintf(fp, "\tret\n"); 32 | #elif defined(HOST_AMD64) 33 | fprintf(fp, "\t1:\n"); 34 | // call function 35 | fprintf(fp, "\tcall ras_func_%d\n", size - 1); 36 | fprintf(fp, "\tdec %%rdi\n"); 37 | fprintf(fp, "\tjne 1b\n"); 38 | fprintf(fp, "\tret\n"); 39 | #endif 40 | 41 | // inner function 42 | fprintf(fp, ".global ras_func_%d\n", size); 43 | fprintf(fp, ".balign 64\n"); 44 | fprintf(fp, "ras_func_%d:\n", size); 45 | 46 | // TODO: if we don't want BTB to predict target address for ret 47 | // we can use two bl, and alternate between the two using x0 48 | 49 
| #ifdef HOST_AARCH64 50 | // save lr 51 | fprintf(fp, "\tsub sp, sp, #0x20\n"); 52 | fprintf(fp, "\tstp x29, x30, [sp, #0x10]\n"); 53 | 54 | // call lower function 55 | fprintf(fp, "\tbl ras_func_%d\n", size - 1); 56 | 57 | // restore lr 58 | fprintf(fp, "\tldp x29, x30, [sp, #0x10]\n"); 59 | fprintf(fp, "\tadd sp, sp, #0x20\n"); 60 | fprintf(fp, "\tret\n"); 61 | #elif defined(HOST_AMD64) 62 | fprintf(fp, "\tcall ras_func_%d\n", size - 1); 63 | fprintf(fp, "\tret\n"); 64 | #endif 65 | } 66 | 67 | // recursion base 68 | fprintf(fp, ".global ras_func_%d\n", 0); 69 | fprintf(fp, ".balign 32\n"); 70 | fprintf(fp, "ras_func_%d:\n", 0); 71 | fprintf(fp, "\tret\n"); 72 | 73 | define_gadgets_array(fp, "ras_size_gadgets"); 74 | for (int size = min_size; size <= max_size; size++) { 75 | add_gadget(fp, "ras_size_%d", size); 76 | } 77 | return 0; 78 | } 79 | -------------------------------------------------------------------------------- /src/rob_size_gen.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | 3 | int main(int argc, char *argv[]) { 4 | FILE *fp = fopen(argv[1], "w"); 5 | assert(fp); 6 | int repeat = 20; 7 | int min_size = 1; 8 | int max_size = 1024; 9 | // args: buffer1, buffer2, loop count 10 | fprintf(fp, ".text\n"); 11 | for (int size = min_size; size <= max_size; size++) { 12 | fprintf(fp, ".global rob_size_%d\n", size); 13 | fprintf(fp, ".align 4\n"); 14 | fprintf(fp, "rob_size_%d:\n", size); 15 | #ifdef HOST_AARCH64 16 | // int sqrt_count = 8; 17 | fprintf(fp, "\tldr x3, [x0]\n"); 18 | fprintf(fp, "\t1:\n"); 19 | for (int i = 0; i < repeat; i++) { 20 | fprintf(fp, "\tldr x3, [x3]\n"); 21 | // use sqrt if necessary 22 | // for (int j = 0; j < sqrt_count; j++) { 23 | // fprintf(fp, "\tfsqrt d0, d0\n"); 24 | // } 25 | for (int j = 0; j < size - 1; j++) { 26 | fprintf(fp, "\tnop\n"); 27 | } 28 | } 29 | fprintf(fp, "\tsubs x2, x2, #1\n"); 30 | fprintf(fp, "\tbne 1b\n"); 31 | fprintf(fp, "\tstr x3,
[x0]\n"); 32 | fprintf(fp, "\tret\n"); 33 | #elif defined(HOST_AMD64) 34 | fprintf(fp, "\tmovq 0(%%rdi), %%r8\n"); 35 | fprintf(fp, "\tmovq 0(%%rsi), %%r9\n"); 36 | fprintf(fp, "\tmovq %%rdx, %%rax\n"); 37 | fprintf(fp, "\t1:\n"); 38 | for (int i = 0; i < repeat; i++) { 39 | fprintf(fp, "\tmovq (%%r8), %%r8\n"); 40 | for (int j = 0; j < size - 1; j++) { 41 | fprintf(fp, "\tnop\n"); 42 | } 43 | fprintf(fp, "\tmovq (%%r9), %%r9\n"); 44 | // forbid further speculation 45 | fprintf(fp, "\tlfence\n"); 46 | fprintf(fp, "\tmfence\n"); 47 | } 48 | fprintf(fp, "\tsubl $1, %%eax\n"); 49 | fprintf(fp, "\tjne 1b\n"); 50 | fprintf(fp, "\tmovq %%r8, 0(%%rdi)\n"); 51 | fprintf(fp, "\tmovq %%r9, 0(%%rsi)\n"); 52 | fprintf(fp, "\tret\n"); 53 | #elif defined(__loongarch__) 54 | fprintf(fp, "\tld.d $a3, $a0, 0\n"); 55 | fprintf(fp, "\t1:\n"); 56 | for (int i = 0; i < repeat; i++) { 57 | fprintf(fp, "\tld.d $a3, $a3, 0\n"); 58 | for (int j = 0; j < size - 1; j++) { 59 | fprintf(fp, "\tnop\n"); 60 | } 61 | } 62 | fprintf(fp, "\taddi.d $a2, $a2, -1\n"); 63 | fprintf(fp, "\tbne $a2, $zero, 1b\n"); 64 | fprintf(fp, "\tst.d $a3, $a0, 0\n"); 65 | fprintf(fp, "\tret\n"); 66 | #endif 67 | } 68 | 69 | define_gadgets_array(fp, "rob_size_gadgets"); 70 | for (int size = min_size; size <= max_size; size++) { 71 | add_gadget(fp, "rob_size_%d", size); 72 | } 73 | return 0; 74 | } 75 | -------------------------------------------------------------------------------- /src/pht_index_tag_bits.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // defined in gen_pht_index_tag_bits_test() 8 | // args: loop count, buffer 9 | typedef void (*gadget)(size_t, uint32_t *); 10 | extern "C" { 11 | extern gadget pht_index_tag_bits_gadgets[]; 12 | } 13 | 14 | int main(int argc, char *argv[]) { 15 | int loop_count = 1000; 16 | // match gen_pht_index_tag_bits_test 17 | int min_branch_align = 2;
18 | #ifdef __APPLE__ 19 | // cannot surpass page size 20 | int max_branch_align = 13; 21 | #else 22 | int max_branch_align = 19; 23 | #endif 24 | 25 | bind_to_core(); 26 | #ifdef NO_COND_BRANCH_MISSES 27 | setup_perf_branch_misses(); 28 | #else 29 | setup_perf_cond_branch_misses(); 30 | #endif 31 | FILE *fp = fopen("pht_index_tag_bits.csv", "w"); 32 | assert(fp); 33 | 34 | uint32_t *buffer = new uint32_t[loop_count + 1]; 35 | 36 | fprintf(fp, "align,min,avg,max\n"); 37 | int gadget_index = 0; 38 | for (int branch_align = min_branch_align; branch_align <= max_branch_align; 39 | branch_align++) { 40 | std::vector history; 41 | int iterations = 100; 42 | history.reserve(iterations); 43 | 44 | double sum = 0; 45 | // run several times 46 | for (int i = 0; i < iterations; i++) { 47 | for (int i = 0; i <= loop_count; i++) { 48 | buffer[i] = rand() % 2; 49 | } 50 | 51 | #ifdef NO_COND_BRANCH_MISSES 52 | uint64_t begin = perf_read_branch_misses(); 53 | #else 54 | uint64_t begin = perf_read_cond_branch_misses(); 55 | #endif 56 | pht_index_tag_bits_gadgets[gadget_index](loop_count, buffer); 57 | #ifdef NO_COND_BRANCH_MISSES 58 | uint64_t elapsed = perf_read_branch_misses() - begin; 59 | #else 60 | uint64_t elapsed = perf_read_cond_branch_misses() - begin; 61 | #endif 62 | 63 | // skip warmup 64 | if (i >= 10) { 65 | double time = (double)elapsed / loop_count; 66 | history.push_back(time); 67 | sum += time; 68 | } 69 | } 70 | gadget_index++; 71 | 72 | double min = history[0]; 73 | double max = history[0]; 74 | for (size_t i = 0; i < history.size(); i++) { 75 | if (min > history[i]) { 76 | min = history[i]; 77 | } 78 | if (max < history[i]) { 79 | max = history[i]; 80 | } 81 | } 82 | fprintf(fp, "%d,%.2lf,%.2lf,%.2lf\n", branch_align, min, 83 | sum / history.size(), max); 84 | fflush(fp); 85 | } 86 | 87 | printf("Results are written to pht_index_tag_bits.csv\n"); 88 | delete[] buffer; 89 | return 0; 90 | } 91 | 
-------------------------------------------------------------------------------- /agner/testp/TestScripts/my_branch3.inc: -------------------------------------------------------------------------------- 1 | ; reproduce Table 2 of Half&Half 2 | 3 | ; alignment bits of branch instruction address 4 | %ifndef branchalign 5 | %define branchalign 18 6 | %endif 7 | 8 | ; alignment bits of branch target address 9 | %ifndef targetalign 10 | %define targetalign 5 11 | %endif 12 | 13 | ; toggle bit of branch address (-1 means do not toggle) 14 | %ifndef branchtoggle 15 | %define branchtoggle 0 16 | %endif 17 | 18 | ; toggle bit of target address (-1 means do not toggle) 19 | %ifndef targettoggle 20 | %define targettoggle 0 21 | %endif 22 | 23 | ; number of dummy branches 24 | %ifndef dummybranches 25 | %define dummybranches 5 26 | %endif 27 | 28 | %macro SHIFT_PHR 1 29 | mov eax, %1+1 30 | 31 | align 1<<16 32 | %rep (1<<16)-(1<<6) 33 | nop 34 | %endrep 35 | 36 | ; dummy_target aligned to 1<<6 37 | %%shift_phr_dummy_target: 38 | %rep (1<<6)-7 39 | nop 40 | %endrep 41 | dec eax ; 2 bytes 42 | ; the last byte of jnz aligned to 1<<16 43 | ; jnz shift_phr_dummy_target 44 | db 0x0f 45 | db 0x85 46 | dd %%shift_phr_dummy_target - $ - 4 47 | %endmacro 48 | 49 | %macro testinit3 0 50 | mov rdi, 1000 51 | 52 | loop_begin: 53 | 54 | ; loop to clear phr 55 | SHIFT_PHR 200 56 | 57 | ; train branch 58 | READ_PMC_START 59 | rdrand ebx 60 | and ebx, 1 61 | ; READ_PMC_START: 166 bytes 62 | ; rdrand ebx: 3 bytes 63 | ; and ebx, 1: 3 bytes 64 | ; jnz first_target: 6 bytes 65 | %rep (1< results2/fused_branch.txt 14 | 15 | let case=0 16 | 17 | for instr1 in cmp test add sub and or xor 18 | do 19 | for instr2 in jz ja jb jg jl js jo jp 20 | do 21 | 22 | let optype=0 23 | for xoptype in reg,reg reg,imm 24 | do 25 | let optype+=1 26 | 27 | echo -e "\n\n$instr1 $xoptype / $instr2 \n" >> results2/fused_branch.txt 28 | $ass -f elf64 -o b64.o -Dcase=$case -Doptype=$optype -Dtaken=0 -Dinstr1=$instr1 
-Dinstr2=$instr2 -Dcounters=$BranchPMCs -Pfused_branch.inc TemplateB64.nasm 29 | if [ $? -ne 0 ] ; then exit ; fi 30 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 31 | if [ $? -ne 0 ] ; then exit ; fi 32 | ./x >> results2/fused_branch.txt 33 | 34 | done 35 | done 36 | done 37 | 38 | let case=1 39 | 40 | for instr1 in inc dec neg not 41 | do 42 | for instr2 in jz ja jb jg jl js jo 43 | do 44 | 45 | let optype=0 46 | for xoptype in reg,reg reg,imm 47 | do 48 | let optype+=1 49 | 50 | echo -e "\n\n$instr1 $xoptype / $instr2 \n" >> results2/fused_branch.txt 51 | $ass -f elf64 -o b64.o -Dcase=$case -Doptype=$optype -Dtaken=0 -Dinstr1=$instr1 -Dinstr2=$instr2 -Dcounters=$BranchPMCs -Pfused_branch.inc TemplateB64.nasm 52 | if [ $? -ne 0 ] ; then exit ; fi 53 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 54 | if [ $? -ne 0 ] ; then exit ; fi 55 | ./x >> results2/fused_branch.txt 56 | 57 | done 58 | done 59 | done 60 | 61 | 62 | let optype=1 63 | xoptype="reg,reg" 64 | let case=12 65 | 66 | for xcase in ADC+JC/JNC OR+JZ/JNZ NOT+JZ/JNZ SHR+JC/JNC JECXZ Boundary_before_jz Boundary_in_jz 67 | do 68 | let case+=1 69 | let taken=-1 70 | for xtaken in no yes 71 | do 72 | let taken+=1 73 | 74 | echo -e "\n\n$xcase $xoptype, taken: $xtaken\n" >> results2/fused_branch.txt 75 | # $ass -f elf64 -o b64.o -l fusedb$case.lst -Dcase=$case -Doptype=$optype -Dtaken=$taken -Dcounters=$BranchPMCs -Pfused_branch.inc 76 | $ass -f elf64 -o b64.o -Dcase=$case -Doptype=$optype -Dtaken=$taken -Dcounters=$BranchPMCs -Pfused_branch.inc TemplateB64.nasm 77 | if [ $? -ne 0 ] ; then exit ; fi 78 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 79 | if [ $? 
-ne 0 ] ; then exit ; fi 80 | ./x >> results2/fused_branch.txt 81 | done 82 | done 83 | 84 | echo -e "\n" >> results2/fused_branch.txt 85 | -------------------------------------------------------------------------------- /src/elimination_lib.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // generated in elimination_gen.cpp 8 | // args: loop count 9 | typedef char **(*gadget)(size_t); 10 | extern "C" { 11 | extern gadget elimination_gadgets[]; 12 | } 13 | 14 | void elimination(FILE *fp) { 15 | int loop_count = 10000; 16 | // match gen_rob_test 17 | 18 | bind_to_core(); 19 | setup_perf_instructions_per_cycle(); 20 | int num_patterns = 16; 21 | const char *pattern_names[] = {"int dependent add", 22 | "int independent add", 23 | "int dependent mov", 24 | "int independent mov", 25 | "int dependent zero via xor", 26 | "int dependent zero via sub", 27 | "int independent zero via mov", 28 | "int independent one via mov", 29 | "int independent two via mov", 30 | "int independent 1024 via mov", 31 | "vec dependent mov", 32 | "vec independent mov", 33 | "vec dependent zero via xor", 34 | "vec dependent zero via sub", 35 | "vec independent zero via mov", 36 | "nop"}; 37 | 38 | int gadget_index = 0; 39 | fprintf(fp, "pattern,min,avg,max\n"); 40 | for (int pattern = 0; pattern < num_patterns; pattern++) { 41 | std::vector history; 42 | int iterations = 100; 43 | history.reserve(iterations); 44 | 45 | double sum = 0; 46 | // run several times 47 | for (int i = 0; i < iterations; i++) { 48 | perf_begin_instructions_per_cycle(); 49 | elimination_gadgets[gadget_index](loop_count); 50 | counter_per_cycle elapsed = perf_end_instructions_per_cycle(); 51 | 52 | // skip warmup 53 | if (i >= 10) { 54 | double time = (double)elapsed.counter / elapsed.cycles; 55 | history.push_back(time); 56 | sum += time; 57 | } 58 | } 59 | gadget_index++; 60 | 61 | double min = 
history[0]; 62 | double max = history[0]; 63 | for (size_t i = 0; i < history.size(); i++) { 64 | if (min > history[i]) { 65 | min = history[i]; 66 | } 67 | if (max < history[i]) { 68 | max = history[i]; 69 | } 70 | } 71 | fprintf(fp, "%s,%.2lf,%.2lf,%.2lf\n", pattern_names[pattern], min, 72 | sum / history.size(), max); 73 | fflush(fp); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/warmup_fp.inc: -------------------------------------------------------------------------------- 1 | ;---------------------------------------------------------------------------- 2 | ; warmup_fp.inc 2015-12-21 Agner Fog 3 | ; 4 | ; PMC Test program for testing warm up effect of floating point unit 5 | ; 6 | ; Constants to be defined: 7 | ; 8 | ; tcase: 1: integer multiplication 9 | ; 2: x87 floating point multiplication 10 | ; 3: xmm floating point scalar multiplication 11 | ; 4: xmm 128-bit floating point vector multiplication 12 | ; 5: ymm 256-bit floating point vector multiplication 13 | ; 14 | ; (c) Copyright 2013 - 2015 by Agner Fog. GNU General Public License www.gnu.org/licenses 15 | ;----------------------------------------------------------------------------- 16 | ; Define any undefined macros 17 | 18 | %ifndef tcase 19 | %define tcase 1 20 | %endif 21 | 22 | %define WARMUPCOUNT 0 ; don't use warm up in TemplateB64.nasm 23 | 24 | ; Let f.p. 
unit cool down by using integer unit 25 | ; (This is actually the same as in TemplateB64.nasm, but that may change:) 26 | %macro testinit1 0 27 | %if tcase == 2 ; use x87 28 | fld1 29 | fld1 30 | %endif 31 | 32 | %ifdef primingdelay 33 | vxorps ymm0,ymm0,ymm0 34 | mov ecx, primingdelay/20 35 | mov eax, 1 36 | align 16 37 | Wuloop1: 38 | %rep 20 39 | add eax,eax 40 | %endrep 41 | dec ecx 42 | jnz Wuloop1 43 | %endif 44 | %endmacro 45 | 46 | ; define counts in warmup_fp.sh2 47 | ; %define repeat0 20 48 | ; %define repeat1 10 49 | ; %define repeat2 10 50 | 51 | 52 | ; Define test cases 53 | 54 | %if tcase == 1 ; integer multiplication 55 | 56 | %macro testcode 0 57 | imul rax, rbx 58 | %endmacro 59 | 60 | %elif tcase == 2 ; x87 floating point multiplication 61 | 62 | %macro testcode 0 63 | ;fmul st(1),st(0) 64 | fmul st1,st0 65 | %endmacro 66 | 67 | %elif tcase == 3 ; xmm floating point scalar multiplication 68 | 69 | %macro testcode 0 70 | mulsd xmm1,xmm2 71 | %endmacro 72 | 73 | %elif tcase == 4 ; xmm 128-bit floating point vector multiplication 74 | 75 | %macro testcode 0 76 | mulpd xmm1,xmm2 77 | %endmacro 78 | 79 | %elif tcase == 5 ; ymm 256-bit floating point vector multiplication latency 80 | 81 | %macro testcode 0 82 | vmulpd ymm1,ymm1,ymm2 83 | %endmacro 84 | 85 | %elif tcase == 6 ; ymm 256-bit floating point vector multiplication throughput 86 | 87 | %macro testcode 0 88 | vmulpd ymm1,ymm2,ymm2 89 | vmulpd ymm3,ymm4,ymm4 90 | %endmacro 91 | 92 | %else 93 | %error unknown test case tcase 94 | %endif 95 | -------------------------------------------------------------------------------- /src/pht_tag_bits_xor_phr.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // generated in pht_tag_bits_xor_phr_gen.cpp 8 | // args: loop count, buffer 9 | typedef void (*gadget)(size_t, uint32_t *); 10 | extern "C" { 11 | extern gadget 
pht_tag_bits_xor_phr_gadgets[]; 12 | } 13 | 14 | int main(int argc, char *argv[]) { 15 | int loop_count = 5000; 16 | // match gen_pht_tag_bits_xor_phr_test 17 | int min_first_phr_bit = 0; 18 | int max_first_phr_bit = 35; 19 | int min_dummy_branches = 0; 20 | int max_dummy_branches = PHR_BRANCHES; 21 | 22 | bind_to_core(); 23 | setup_perf_cond_branch_misses(); 24 | FILE *fp = fopen("pht_tag_bits_xor_phr.csv", "w"); 25 | assert(fp); 26 | 27 | uint32_t *buffer = new uint32_t[loop_count + 1]; 28 | for (int i = 0; i <= loop_count; i++) { 29 | buffer[i] = rand() % 4; 30 | } 31 | 32 | fprintf(fp, "target,first_phr_bit,dummy_branches,min,avg,max\n"); 33 | int gadget_index = 0; 34 | for (int inject_target = 0; inject_target <= 1; inject_target++) { 35 | for (int first_phr_bit = min_first_phr_bit; 36 | first_phr_bit <= max_first_phr_bit; first_phr_bit++) { 37 | for (int dummy_branches = min_dummy_branches; 38 | dummy_branches <= max_dummy_branches; dummy_branches++) { 39 | std::vector history; 40 | int iterations = 100; 41 | history.reserve(iterations); 42 | 43 | double sum = 0; 44 | // run several times 45 | for (int i = 0; i < iterations; i++) { 46 | uint64_t begin = perf_read_cond_branch_misses(); 47 | pht_tag_bits_xor_phr_gadgets[gadget_index](loop_count, buffer); 48 | uint64_t elapsed = perf_read_cond_branch_misses() - begin; 49 | 50 | // skip warmup 51 | if (i >= 10) { 52 | // 1/8 branches 53 | double time = (double)elapsed / loop_count * 4; 54 | history.push_back(time); 55 | sum += time; 56 | } 57 | } 58 | gadget_index++; 59 | 60 | double min = history[0]; 61 | double max = history[0]; 62 | for (size_t i = 0; i < history.size(); i++) { 63 | if (min > history[i]) { 64 | min = history[i]; 65 | } 66 | if (max < history[i]) { 67 | max = history[i]; 68 | } 69 | } 70 | fprintf(fp, "%d,%d,%d,%.2lf,%.2lf,%.2lf\n", inject_target, 71 | first_phr_bit, dummy_branches, min, sum / history.size(), max); 72 | fflush(fp); 73 | } 74 | } 75 | } 76 | 77 | printf("Results are written to 
pht_tag_bits_xor_phr.csv\n"); 78 | delete[] buffer; 79 | return 0; 80 | } 81 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/ucache_misprediction.inc: -------------------------------------------------------------------------------- 1 | ;---------------------------------------------------------------------------- 2 | ; ucache_misprediction.inc 2013-07-21 Agner Fog 3 | ; 4 | ; PMC Test program for testing branch prediction 5 | ; NASM syntax 6 | ; 7 | ; The following macros can be defined on the command line or in include files: 8 | ; 9 | ; tcase: Test case number. See below for each case 10 | ; 1. Tiny loop. Expect loop counter to be used 11 | ; 2. Normal loop. Expect uop cache to be used 12 | ; 3. Extremely big loop. Expect only fetch and decode to be used 13 | ; 14 | ; count1: Loop count for outer loop 15 | ; 16 | ; count2: Loop count for inner loop 17 | ; 18 | ; 19 | ; (c) Copyright 2013 by Agner Fog. GNU General Public License www.gnu.org/licenses 20 | ;----------------------------------------------------------------------------- 21 | 22 | %ifndef tcase 23 | %define tcase 1 ; default case 1 24 | %endif 25 | 26 | %ifndef count1 27 | %define count1 10 ; default count1 28 | %endif 29 | 30 | %ifndef count2 31 | %define count2 16 ; default count2 32 | %endif 33 | 34 | 35 | ;############################################################################## 36 | ;# 37 | ;# Test code macros 38 | ;# 39 | ;############################################################################## 40 | 41 | ; define long nops 42 | %ifndef noptype 43 | %define noptype 2 44 | %endif 45 | 46 | %include "nops.inc" 47 | 48 | %if tcase == 1 ; Tiny loop. Expect loop counter to be used 49 | 50 | %macro testcode 0 51 | nop 52 | nop 53 | mov ebp, count1 54 | align 16 55 | LL: 56 | test r14b,4 57 | jz L2 58 | nop8 59 | L2: 60 | nop8 61 | dec ebp 62 | jnz LL 63 | %endmacro 64 | 65 | %elif tcase == 2 ; Normal loop.
Expect uop cache to be used 66 | 67 | %macro testcode 0 68 | %rep count1 69 | test r14b,4 70 | jnz $+10 71 | nop8 72 | nop8 73 | %endrep 74 | %endmacro 75 | 76 | %elif tcase == 3 ; Extremely big loop. Expect only fetch and decode to be used 77 | 78 | %macro testinitc 0 79 | %rep 100000 ; lots of code before counters are read, to prevent uop caching 80 | nop 81 | %endrep 82 | %endmacro 83 | 84 | %macro testcode 0 85 | %rep count1 86 | test r14b,4 87 | jnz $+10 88 | nop8 89 | nop8 90 | %endrep 91 | %endmacro 92 | 93 | %else 94 | %error unknown test case tcase 95 | %endif 96 | 97 | ; disable default test loops 98 | %define repeat1 1 99 | %define repeat2 1 100 | 101 | -------------------------------------------------------------------------------- /src/pht_tag_bits_xor.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // generated in pht_tag_bits_xor_gen.cpp 8 | // args: loop count, buffer 9 | typedef void (*gadget)(size_t, uint32_t *); 10 | extern "C" { 11 | extern gadget pht_tag_bits_xor_gadgets[]; 12 | } 13 | 14 | int main(int argc, char *argv[]) { 15 | int loop_count = 5000; 16 | // match gen_pht_tag_bits_xor_test 17 | int min_branch_align = 3; 18 | #ifdef __APPLE__ 19 | int max_branch_align = 13; 20 | #else 21 | int max_branch_align = 18; 22 | #endif 23 | int min_dummy_branches = 0; 24 | int max_dummy_branches = PHR_BRANCHES + 5; 25 | 26 | bind_to_core(); 27 | setup_perf_cond_branch_misses(); 28 | FILE *fp = fopen("pht_tag_bits_xor.csv", "w"); 29 | assert(fp); 30 | 31 | uint32_t *buffer = new uint32_t[loop_count + 1]; 32 | for (int i = 0; i <= loop_count; i++) { 33 | buffer[i] = rand() % 2; 34 | } 35 | 36 | fprintf(fp, "target,align,dummy_branches,min,avg,max\n"); 37 | int gadget_index = 0; 38 | for (int inject_target = 0; inject_target <= 1; inject_target++) { 39 | for (int branch_align = min_branch_align; branch_align <= 
max_branch_align; 40 | branch_align++) { 41 | for (int dummy_branches = min_dummy_branches; 42 | dummy_branches <= max_dummy_branches; dummy_branches++) { 43 | std::vector history; 44 | int iterations = 100; 45 | history.reserve(iterations); 46 | 47 | double sum = 0; 48 | // run several times 49 | for (int i = 0; i < iterations; i++) { 50 | uint64_t begin = perf_read_cond_branch_misses(); 51 | pht_tag_bits_xor_gadgets[gadget_index](loop_count, buffer); 52 | uint64_t elapsed = perf_read_cond_branch_misses() - begin; 53 | 54 | // skip warmup 55 | if (i >= 10) { 56 | // 1/8 branches 57 | double time = (double)elapsed / loop_count * 4; 58 | history.push_back(time); 59 | sum += time; 60 | } 61 | } 62 | gadget_index++; 63 | 64 | double min = history[0]; 65 | double max = history[0]; 66 | for (size_t i = 0; i < history.size(); i++) { 67 | if (min > history[i]) { 68 | min = history[i]; 69 | } 70 | if (max < history[i]) { 71 | max = history[i]; 72 | } 73 | } 74 | fprintf(fp, "%d,%d,%d,%.2lf,%.2lf,%.2lf\n", inject_target, branch_align, 75 | dummy_branches, min, sum / history.size(), max); 76 | fflush(fp); 77 | } 78 | } 79 | } 80 | 81 | printf("Results are written to pht_tag_bits_xor.csv\n"); 82 | delete[] buffer; 83 | return 0; 84 | } 85 | -------------------------------------------------------------------------------- /include/counters_mapping.h: -------------------------------------------------------------------------------- 1 | // make clangd happy 2 | #ifndef DEFINE_COUNTER 3 | #define DEFINE_COUNTER(...) 4 | #endif 5 | 6 | #ifndef DEFINE_COUNTER_RANGE 7 | #define DEFINE_COUNTER_RANGE(...) 8 | #endif 9 | 10 | #ifndef DEFINE_COUNTER_SUBTRACT 11 | #define DEFINE_COUNTER_SUBTRACT(...) 12 | #endif 13 | 14 | #ifndef DEFINE_COMPUTED_COUNTER_RANGE 15 | #define DEFINE_COMPUTED_COUNTER_RANGE(...) 
16 | #endif 17 | 18 | #ifdef __APPLE__ 19 | // macOS/iOS 20 | DEFINE_COUNTER(cycles, FIXED_CYCLES) 21 | DEFINE_COUNTER(instructions, FIXED_INSTRUCTIONS) 22 | DEFINE_COUNTER(branch_misses, BRANCH_MISPRED_NONSPEC) 23 | DEFINE_COUNTER(cond_branch_misses, BRANCH_COND_MISPRED_NONSPEC) 24 | 25 | #else 26 | // Linux 27 | // select pmu based on icestorm/firestorm 28 | // 0xb: firestorm pmu 29 | #define PERF_TYPE_FIRESTORM 0xb 30 | // 0xa: icestorm pmu 31 | #define PERF_TYPE_ICESTORM 0xa 32 | 33 | // 0xa: gracemont pmu 34 | #define PERF_TYPE_GRACEMONT 0xaL 35 | 36 | // firestorm/icestorm 37 | // 0x02: CORE_ACTIVE_CYCLE from 38 | // https://github.com/jiegec/apple-pmu/blob/master/a14.md 39 | DEFINE_COUNTER(cycles, firestorm, PERF_TYPE_FIRESTORM, 0x02) 40 | // 0x8c: INST_ALL from 41 | // https://github.com/jiegec/apple-pmu/blob/master/a14.md 42 | DEFINE_COUNTER(instructions, firestorm, PERF_TYPE_FIRESTORM, 0x8c) 43 | // 0xcb: BRANCH_MISPRED_NONSPEC from 44 | // https://github.com/jiegec/apple-pmu/blob/master/a14.md 45 | DEFINE_COUNTER(branch_misses, firestorm, PERF_TYPE_FIRESTORM, 0xcb) 46 | // 0xc5: BRANCH_COND_MISPRED_NONSPEC from 47 | // https://github.com/jiegec/apple-pmu/blob/master/a14.md 48 | DEFINE_COUNTER(cond_branch_misses, firestorm, PERF_TYPE_FIRESTORM, 0xc5) 49 | 50 | // arm64 general 51 | // ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED in 52 | // linux/include/linux/perf/arm_pmuv3.h PERF_COUNT_HW_BRANCH_MISSES was mapped 53 | // to ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, which counts speculative mis-predictions, 54 | // we want retired mis-predictions 55 | DEFINE_COUNTER_RANGE(branch_misses, arm64, PERF_TYPE_RAW, 0x22) 56 | 57 | // qualcomm oryon 58 | // discovered via find_branch_misses_pmu tool 59 | DEFINE_COUNTER(cond_branch_misses, oryon, PERF_TYPE_RAW, 0x400) 60 | 61 | // fallback counters 62 | 63 | // cycles 64 | DEFINE_COUNTER_RANGE(cycles, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES) 65 | 66 | // instructions retired 67 | DEFINE_COUNTER_RANGE(instructions, 
all, PERF_TYPE_HARDWARE, 68 | PERF_COUNT_HW_INSTRUCTIONS) 69 | 70 | // branch mispredictions 71 | DEFINE_COUNTER_RANGE(branch_misses, all, PERF_TYPE_HARDWARE, 72 | PERF_COUNT_HW_BRANCH_MISSES) 73 | 74 | // counter per cycle 75 | DEFINE_COMPUTED_COUNTER_RANGE(instructions_per_cycle, counter_per_cycle, all, 76 | compute_counter_per_cycle, instructions, cycles) 77 | 78 | #endif 79 | -------------------------------------------------------------------------------- /src/phr_size_lib.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // use with generate_gadget tool 8 | 9 | // defined in gen_phr_test() 10 | // args: loop count, buffer 11 | typedef void (*gadget)(size_t, uint32_t *); 12 | extern "C" { 13 | extern gadget phr_size_gadgets[]; 14 | } 15 | 16 | void phr_size(FILE *fp) { 17 | int loop_count = 1000; 18 | // match gen_phr_test 19 | int min_size = 1; 20 | int max_size = 256; 21 | 22 | bind_to_core(); 23 | #ifdef IOS 24 | // no pmu 25 | #elif defined(NO_COND_BRANCH_MISSES) 26 | // fallback 27 | setup_perf_branch_misses(); 28 | #else 29 | setup_perf_cond_branch_misses(); 30 | #endif 31 | assert(fp); 32 | 33 | uint32_t *buffer = new uint32_t[loop_count + 1]; 34 | 35 | fprintf(fp, "size,min,avg,max\n"); 36 | int gadget_index = 0; 37 | for (int size = min_size; size <= max_size; size++) { 38 | std::vector history; 39 | int iterations = 100; 40 | history.reserve(iterations); 41 | 42 | double sum = 0; 43 | // run several times 44 | for (int i = 0; i < iterations; i++) { 45 | 46 | // random 47 | for (int i = 0; i <= loop_count; i++) { 48 | buffer[i] = rand() % 2; 49 | } 50 | // ensures that ldr w11, [x0, w11, uxtw #2] does not change w11 51 | buffer[0] = 0; 52 | buffer[1] = 1; 53 | 54 | #ifdef IOS 55 | // fallback 56 | uint64_t begin = get_time(); 57 | #elif defined(NO_COND_BRANCH_MISSES) 58 | // fallback 59 | uint64_t begin = 
perf_read_branch_misses(); 60 | #else 61 | uint64_t begin = perf_read_cond_branch_misses(); 62 | #endif 63 | 64 | phr_size_gadgets[gadget_index](loop_count, buffer); 65 | 66 | #ifdef IOS 67 | // fallback 68 | uint64_t elapsed = get_time() - begin; 69 | #elif defined(NO_COND_BRANCH_MISSES) 70 | // fallback 71 | uint64_t elapsed = perf_read_branch_misses() - begin; 72 | #else 73 | uint64_t elapsed = perf_read_cond_branch_misses() - begin; 74 | #endif 75 | 76 | // skip warmup 77 | if (i >= 10) { 78 | double time = (double)elapsed / loop_count; 79 | history.push_back(time); 80 | sum += time; 81 | } 82 | } 83 | gadget_index++; 84 | 85 | double min = history[0]; 86 | double max = history[0]; 87 | for (size_t i = 0; i < history.size(); i++) { 88 | if (min > history[i]) { 89 | min = history[i]; 90 | } 91 | if (max < history[i]) { 92 | max = history[i]; 93 | } 94 | } 95 | fprintf(fp, "%d,%.2lf,%.2lf,%.2lf\n", size, min, sum / history.size(), max); 96 | fflush(fp); 97 | } 98 | delete[] buffer; 99 | } 100 | -------------------------------------------------------------------------------- /src/btb_size_basic_lib.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // generated in btb_size_basic_gen.cpp 10 | // args: loop count 11 | typedef void (*gadget)(size_t); 12 | extern "C" { 13 | extern gadget btb_size_basic_gadgets[]; 14 | } 15 | 16 | void btb_size_basic(FILE *fp) { 17 | int loop_count = 100; 18 | // match gen_btb_test 19 | uint64_t min_size = 0, max_size = 0, max_product = 0, min_stride = 0, 20 | max_stride = 0; 21 | std::vector mults; 22 | int num_patterns = 3; 23 | 24 | min_size = 2; 25 | max_size = 65536; 26 | max_product = 32768; 27 | min_stride = 4; 28 | max_stride = 8192; 29 | mults = {1, 3, 5, 7}; 30 | 31 | bind_to_core(); 32 | setup_perf_cycles(); 33 | fprintf(fp, "pattern,size,stride,min,avg,max\n"); 34 | int 
gadget_index = 0; 35 | for (int pattern = 0; pattern < num_patterns; pattern++) { 36 | for (uint64_t stride = min_stride; stride <= max_stride; stride *= 2) { 37 | std::set sizes; 38 | for (uint64_t size_base = min_size; size_base <= max_product / stride; 39 | size_base *= 2) { 40 | for (uint64_t mult : mults) { 41 | for (uint64_t size = size_base * mult - 1; 42 | size <= size_base * mult + 1 && size * stride <= max_product && 43 | size <= max_size; 44 | size++) { 45 | sizes.insert(size); 46 | } 47 | } 48 | } 49 | 50 | for (uint64_t size : sizes) { 51 | gadget entry = btb_size_basic_gadgets[gadget_index]; 52 | 53 | std::vector history; 54 | int iterations = 30; 55 | history.reserve(iterations); 56 | 57 | double sum = 0; 58 | // run several times 59 | for (int i = 0; i < iterations; i++) { 60 | uint64_t begin = perf_read_cycles(); 61 | entry(loop_count); 62 | uint64_t elapsed = perf_read_cycles() - begin; 63 | 64 | // skip warmup 65 | if (i >= 10) { 66 | double time = (double)elapsed / loop_count / size; 67 | history.push_back(time); 68 | sum += time; 69 | } 70 | } 71 | gadget_index++; 72 | 73 | double min = history[0]; 74 | double max = history[0]; 75 | for (size_t i = 0; i < history.size(); i++) { 76 | if (min > history[i]) { 77 | min = history[i]; 78 | } 79 | if (max < history[i]) { 80 | max = history[i]; 81 | } 82 | } 83 | fprintf(fp, "%d,%llu,%llu,%.2lf,%.2lf,%.2lf\n", pattern, (unsigned long long)size, (unsigned long long)stride, 84 | min, sum / history.size(), max); // casts keep the format portable: uint64_t is not 'long' on every ABI 85 | fflush(fp); 86 | } 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/length_chg_prefix.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # length_chg_prefix.sh2 2021-01-21 Agner Fog 3 | # 4 | # PMC Test program for testing length-changing prefixes 5 | # 6 | # (c) 2013-2021 GNU General Public License www.gnu.org/licenses 7 | # 8 | # Parameters: 9 | # 10 | # tcase: 1: mov register,constant 11 
| # 2: add register,constant 12 | # 3: test register,constant 13 | # 4: neg or not register (bogus length-changing prefix) 14 | # 5: lea with address size prefix (must run in 32-bit mode) 15 | # 16 | # tmode: 1: instructions with length-changing prefix, aligned by 16 17 | # 2: instructions with length-changing prefix, crossing 16-bytes boundary 18 | # 3: similar instructions with non-length-changing prefix, aligned by 16 19 | 20 | . vars.sh 21 | 22 | nthreads=1 23 | 24 | echo -e "Test length-changing prefixes" > results2/length_chg_prefix.txt 25 | 26 | for tcase in {1..5} 27 | do 28 | 29 | if [ $tcase -eq 1 ]; then echo -e "\n\nCase 1: mov register,constant\n" >> results2/length_chg_prefix.txt ; fi 30 | if [ $tcase -eq 2 ]; then echo -e "\n\nCase 2: add register,constant\n" >> results2/length_chg_prefix.txt ; fi 31 | if [ $tcase -eq 3 ]; then echo -e "\n\nCase 3: test register,constant\n" >> results2/length_chg_prefix.txt ; fi 32 | if [ $tcase -eq 4 ]; then echo -e "\n\nCase 4: neg or not register (bogus length-changing prefix)\n" >> results2/length_chg_prefix.txt ; fi 33 | if [ $tcase -eq 5 ]; then echo -e "\n\nCase 5: lea with address size prefix\n" >> results2/length_chg_prefix.txt ; fi 34 | 35 | for tmode in {1..3} 36 | do 37 | if [ $tmode -eq 1 ]; then echo -e "\nA. Instructions with length-changing prefix, aligned by 16" >> results2/length_chg_prefix.txt ; fi 38 | if [ $tmode -eq 2 ]; then echo -e "\nB. Instructions with length-changing prefix, crossing 16-bytes boundary" >> results2/length_chg_prefix.txt ; fi 39 | if [ $tmode -eq 3 ]; then echo -e "\nC. Similar instructions with non-length-changing prefix" >> results2/length_chg_prefix.txt ; fi 40 | 41 | if [[ $support32bit == 1 ]] ; then 42 | $ass -f elf32 -o b32.o -l b32.lst -Dnthreads=$nthreads -Dtcase=$tcase -Dtmode=$tmode -Plength_chg_prefix.inc TemplateB32.nasm 43 | if [ $? -ne 0 ] ; then exit ; fi 44 | g++ -fno-pie -no-pie -m32 -fno-pie -no-pie a32.o b32.o -ox -lpthread 45 | if [ $? 
-ne 0 ] ; then exit ; fi 46 | ./x >> results2/length_chg_prefix.txt 47 | else # must use 64 bits 48 | $ass -f elf64 -o b64.o -l b64.lst -Dnthreads=$nthreads -Dtcase=$tcase -Dtmode=$tmode -Plength_chg_prefix.inc TemplateB64.nasm 49 | if [ $? -ne 0 ] ; then exit ; fi 50 | g++ -fno-pie -no-pie -m64 -fno-pie -no-pie a64.o b64.o -ox -lpthread 51 | if [ $? -ne 0 ] ; then exit ; fi 52 | ./x >> results2/length_chg_prefix.txt 53 | fi 54 | 55 | done 56 | done 57 | 58 | echo -e "\n" >> results2/length_chg_prefix.txt 59 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/32bitinstr.inc: -------------------------------------------------------------------------------- 1 | ; 32bitinstr.inc 2 | ; Define test code for instructions in 32-bit mode 3 | ; (c) 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses 4 | 5 | ; instruction-specific test codes 6 | 7 | %ifidni instruct, aam_latency 8 | %macro testcode 0 9 | aam 10 | %endmacro 11 | 12 | %elifidni instruct, aam_throughput 13 | %macro testcode 0 14 | xor eax,eax ; break dependency 15 | aam 16 | %endmacro 17 | 18 | %elifidni instruct, bound 19 | %macro testinit2 0 20 | mov [esi],esi ; setup bounds to avoid interrupt 21 | lea eax, [esi+100] 22 | mov [esi+4],eax 23 | %endmacro 24 | %macro testcode 0 25 | bound esi, [esi] 26 | %endmacro 27 | 28 | %elifidni instruct, into 29 | %macro testinit2 0 30 | xor eax,eax ; clear overflow flag 31 | %endmacro 32 | 33 | %elifidni instruct, lahf_sahf 34 | %macro testcode 0 35 | lahf ; test combined latency 36 | sahf 37 | %endmacro 38 | 39 | %elifidni instruct, leave 40 | %macro testcode 0 41 | mov esi, 100 ; can't use ebp here 42 | align 16 43 | repeat11loop: 44 | mov edi,esp ; prepare stack frame 45 | push 0 46 | mov ebp,esp 47 | mov [ebp],ebp 48 | %rep 100 49 | leave 50 | %endrep 51 | mov esp,edi ; restore stack 52 | dec esi 53 | jnz repeat11loop ; loop 54 | %endmacro 55 | %define repeat1 0 ; disable default loops 56 | %define 
repeat2 1 57 | 58 | %elifidni instruct, pushad 59 | %macro testcode 0 60 | mov edi,esp 61 | %rep 100 62 | pushad 63 | %endrep 64 | mov esp,edi ; restore stack 65 | %endmacro 66 | %define repeat2 1 67 | 68 | %elifidni instruct, popad 69 | %macro testcode 0 70 | movd xmm0, esp 71 | movd xmm1, ebp 72 | sub esp, 3200 ; prepare stack 73 | %rep 100 74 | popad 75 | %endrep 76 | movd esp, xmm0 ; restore stack 77 | movd ebp, xmm1 ; restore loop pointer 78 | %endmacro 79 | %define repeat2 1 80 | 81 | %elifidni instruct, salc_inc_al 82 | %macro testcode 0 83 | salc ; combined latency 84 | inc al 85 | %endmacro 86 | 87 | %elifidni instruct, enter 88 | %macro testcode 0 89 | mov esi, 100 ; can't use ebp here 90 | align 16 91 | repeat11loop: 92 | mov edi, esp 93 | lea ebp, [UserData+1000h] ; dummy frame 94 | %REP 100 95 | enter 4, immvalue 96 | %ENDREP 97 | mov esp,edi ; restore stack pointer 98 | dec esi 99 | jnz repeat11loop ; loop 100 | %endmacro 101 | %define repeat1 0 ; disable default loops 102 | %define repeat2 1 103 | 104 | 105 | %else 106 | 107 | 108 | %endif 109 | 110 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/my_branch3.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from matplotlib import pyplot as plt 3 | import numpy as np 4 | 5 | # Reproduce Table 2 of Half&Half 6 | # Test branch toggles 7 | x_data = range(0, 16) 8 | y_data = range(185, 195) 9 | z_data = [] 10 | for branch_toggle in x_data: 11 | temp = [] 12 | for dummy_branches in y_data: 13 | output = subprocess.check_output( 14 | [ 15 | "./my_branch3.sh2", 16 | "16", 17 | "16", 18 | str(branch_toggle), 19 | "-1", 20 | str(dummy_branches), 21 | ], 22 | encoding="utf-8", 23 | ) 24 | heading = False 25 | data = [] 26 | for line in output.splitlines(): 27 | parts = list(filter(lambda s: len(s) > 0, line.strip().split(" "))) 28 | if len(parts) > 0: 29 | if not heading: 30 | assert parts[5] 
== "BrMisCond" 31 | heading = True 32 | else: 33 | data.append(int(parts[4])) 34 | # skip misprediction from dummy branches 35 | avg = (np.average(np.array(data)) - 1000) / 2000 # 2 branches, 1000 loops 36 | print(branch_toggle, dummy_branches, f"{avg:.2f}") 37 | temp.append(avg) 38 | z_data.append(temp) 39 | 40 | plt.imshow(z_data) 41 | plt.xlabel("Dummy branches") 42 | plt.xticks(range(len(y_data)), y_data, rotation=90) 43 | plt.ylabel("Branch toggle bit") 44 | plt.yticks(x_data) 45 | plt.savefig("my_branch3_1.png") 46 | plt.cla() 47 | 48 | # Test target toggles 49 | x_data = range(0, 6) 50 | y_data = range(185, 195) 51 | z_data = [] 52 | for target_toggle in x_data: 53 | temp = [] 54 | for dummy_branches in y_data: 55 | output = subprocess.check_output( 56 | [ 57 | "./my_branch3.sh2", 58 | "16", 59 | "16", 60 | "-1", 61 | str(target_toggle), 62 | str(dummy_branches), 63 | ], 64 | encoding="utf-8", 65 | ) 66 | heading = False 67 | data = [] 68 | for line in output.splitlines(): 69 | parts = list(filter(lambda s: len(s) > 0, line.strip().split(" "))) 70 | if len(parts) > 0: 71 | if not heading: 72 | assert parts[5] == "BrMisCond" 73 | heading = True 74 | else: 75 | data.append(int(parts[4])) 76 | # skip misprediction from dummy branches 77 | avg = (np.average(np.array(data)) - 1000) / 2000 # 2 branches, 1000 loops 78 | print(target_toggle, dummy_branches, f"{avg:.2f}") 79 | temp.append(avg) 80 | z_data.append(temp) 81 | 82 | plt.imshow(z_data) 83 | plt.xlabel("Dummy branches") 84 | plt.xticks(range(len(y_data)), y_data, rotation=90) 85 | plt.ylabel("Target toggle bit") 86 | plt.yticks(x_data) 87 | plt.savefig("my_branch3_2.png") 88 | plt.cla() 89 | -------------------------------------------------------------------------------- /src/find_branch_misses_pmu.cpp: -------------------------------------------------------------------------------- 1 | #ifdef __linux__ 2 | #include "include/utils.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 
#include 9 | #include 10 | 11 | // generated in find_branch_misses_pmu_gen.cpp 12 | // args: loop count, buffer 13 | typedef void (*gadget)(size_t, uint32_t *); 14 | extern "C" { 15 | extern gadget find_branch_misses_pmu_gadgets[]; 16 | } 17 | 18 | int main(int argc, char *argv[]) { 19 | int num_patterns = 3; 20 | int loop_count = 1000; 21 | int min_counter = 0x0; 22 | int max_counter = 0x1000; 23 | const char *pattern_names[] = { 24 | "50% cond branch miss", 25 | "50% indirect branch miss", 26 | "50% cond + indirect branch miss", 27 | }; 28 | 29 | bind_to_core(); 30 | FILE *fp = fopen("find_branch_misses_pmu.csv", "w"); 31 | assert(fp); 32 | 33 | uint32_t *buffer = new uint32_t[loop_count + 1]; 34 | for (int i = 0; i <= loop_count; i++) { 35 | buffer[i] = rand() % 2; 36 | } 37 | 38 | fprintf(fp, "pattern,counter,min,avg,max\n"); 39 | int gadget_index = 0; 40 | for (int pattern = 0; pattern < num_patterns; pattern++) { 41 | for (int counter = min_counter; counter <= max_counter; counter++) { 42 | std::vector history; 43 | int iterations = 100; 44 | history.reserve(iterations); 45 | 46 | double sum = 0; 47 | raw_perf_counter perf = 48 | setup_perf_common_failable(PERF_TYPE_RAW, counter); 49 | if (perf.fd < 0) { 50 | continue; 51 | } 52 | 53 | // run several times 54 | for (int i = 0; i < iterations; i++) { 55 | uint64_t begin = perf.read(); 56 | find_branch_misses_pmu_gadgets[gadget_index](loop_count, buffer); 57 | uint64_t elapsed = perf.read() - begin; 58 | 59 | // skip warmup 60 | if (i >= 10) { 61 | double time = (double)elapsed / loop_count; 62 | history.push_back(time); 63 | sum += time; 64 | } 65 | } 66 | close(perf.fd); 67 | if (perf.page) { 68 | munmap(perf.page, getpagesize()); 69 | } 70 | 71 | double min = history[0]; 72 | double max = history[0]; 73 | for (size_t i = 0; i < history.size(); i++) { 74 | if (min > history[i]) { 75 | min = history[i]; 76 | } 77 | if (max < history[i]) { 78 | max = history[i]; 79 | } 80 | } 81 | if (max > 0.0) 82 | 
fprintf(fp, "%s,0x%x,%.2lf,%.2lf,%.2lf\n", pattern_names[pattern], 83 | counter, min, sum / history.size(), max); 84 | fflush(fp); 85 | } 86 | gadget_index++; 87 | } 88 | 89 | printf("Results are written to find_branch_misses_pmu.csv\n"); 90 | delete[] buffer; 91 | return 0; 92 | } 93 | #else 94 | #include 95 | int main(int argc, char *argv[]) { 96 | printf("Not supported\n"); 97 | return 0; 98 | } 99 | #endif 100 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/loop_buffer.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # loop_buffer.sh2 2020-08-26 Agner Fog 3 | # 4 | # PMC Test program for testing any loop buffer or microop cache 5 | # 6 | # (c) 2013-2020 GNU General Public License www.gnu.org/licenses 7 | # 8 | # Parameters: 9 | # 10 | # nopsize: Size of NOP instructions (1 - 15) 11 | # 12 | # noptype: 2: long NOPs (0F 1F ...) 13 | # 3: 66 NOPs (simple NOP with up to 14 operand size prefixes) 14 | # 4: long NOPs up to 11, then other instructions with max 3 prefixes up to 14 (for processors that have penalties for > 3 prefixes) 15 | # 16 | # repeat1: Number of loop repetitions 17 | # 18 | # repeat2: Number of NOPs in loop 19 | # 20 | # nthreads: Number of simultaneous threads 21 | 22 | # (You may change the parameters to focus near the limit of the buffer size) 23 | 24 | . 
vars.sh 25 | 26 | nthreads=1 27 | 28 | repeat1=1000 29 | 30 | nopsize=12 31 | 32 | echo -e "Test loop buffer size" > results2/loop_buffer.txt 33 | 34 | for noptype in 2 3 35 | do 36 | for repeat2 in 2 10 20 30 32 40 50 100 1000 2000 2200 2500 2800 3000 4000 10000 20000 37 | do 38 | 39 | totalsize=$(expr 5 + $repeat2 \* $nopsize ) 40 | 41 | echo -e "\n\nNumber of NOPs = $repeat2, noptype = $noptype, nopsize = $nopsize, total size = $totalsize" >> results2/loop_buffer.txt 42 | 43 | $ass -f elf64 -o b64.o -Dnthreads=$nthreads -Drepeat1=$repeat1 -Drepeat2=$repeat2 -Dnoptype=$noptype -Dnopsize=$nopsize -Ploop_buffer.inc TemplateB64.nasm 44 | if [ $? -ne 0 ] ; then exit ; fi 45 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 46 | if [ $? -ne 0 ] ; then exit ; fi 47 | ./x >> results2/loop_buffer.txt 48 | 49 | done 50 | done 51 | 52 | nopsize=14 53 | noptype=2 54 | 55 | for repeat2 in 10 30 100 1000 2000 2200 2500 2800 3000 4000 10000 20000 56 | do 57 | 58 | totalsize=$(expr 5 + $repeat2 \* $nopsize ) 59 | 60 | echo -e "\n\nNumber of NOPs = $repeat2, noptype = $noptype, nopsize = $nopsize, total size = $totalsize" >> results2/loop_buffer.txt 61 | 62 | $ass -f elf64 -o b64.o -Dnthreads=$nthreads -Drepeat1=$repeat1 -Drepeat2=$repeat2 -Dnoptype=$noptype -Dnopsize=$nopsize -Ploop_buffer.inc TemplateB64.nasm 63 | if [ $? -ne 0 ] ; then exit ; fi 64 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 65 | if [ $? 
-ne 0 ] ; then exit ; fi 66 | ./x >> results2/loop_buffer.txt 67 | 68 | done 69 | 70 | 71 | 72 | 73 | 74 | echo -e "\n\n\nTest with multiple threads" >> results2/loop_buffer.txt 75 | nthreads=3 76 | 77 | for repeat2 in 100 1000 2000 2500 3000 10000 100000 78 | do 79 | for noptype in 2 80 | do 81 | 82 | totalsize=$(expr 5 + $repeat2 \* $nopsize ) 83 | 84 | echo -e "\n\nNumber of NOPs = $repeat2, noptype = $noptype, nopsize = $nopsize, total size = $totalsize" >> results2/loop_buffer.txt 85 | 86 | $ass -f elf64 -o b64.o -Dnthreads=$nthreads -Drepeat1=$repeat1 -Drepeat2=$repeat2 -Dnoptype=$noptype -Dnopsize=$nopsize -Ploop_buffer.inc TemplateB64.nasm 87 | if [ $? -ne 0 ] ; then exit ; fi 88 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 89 | if [ $? -ne 0 ] ; then exit ; fi 90 | ./x >> results2/loop_buffer.txt 91 | 92 | done 93 | done 94 | 95 | echo -e "\n" >> results2/loop_buffer.txt 96 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/mul.sh1: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 2012-01-26 AgF 3 | # Compile and run PMCTest for integer multiplication instructions 4 | # looping through list of instructions 5 | # (c) Copyright 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses 6 | 7 | # Detect CPU specific variables 8 | . vars.sh 9 | 10 | echo -e "Multiplication instructions latency and throughput\n" > results1/mul.txt 11 | 12 | # single operand: 13 | 14 | for i in mul imul 15 | do 16 | 17 | for r in 8 16 32 64 18 | do 19 | 20 | echo -e "\n\nLatency: $i , registersize $r " >> results1/mul.txt 21 | 22 | $ass -f elf64 -o b64.o -Dinstruct=$i -Dnumop=1 -Dregsize=$r -Dtmode=L -Pmul.inc TemplateB64.nasm 23 | if [ $? -ne 0 ] ; then exit ; fi 24 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 25 | if [ $? 
-ne 0 ] ; then exit ; fi 26 | ./x >> results1/mul.txt 27 | 28 | echo -e "\n\nThroughput: $i , registersize $r (subtract 1 uop)" >> results1/mul.txt 29 | for cts in $PMClist 30 | do 31 | $ass -f elf64 -o b64.o -Dinstruct=$i -Dnumop=1 -Dregsize=$r -Dtmode=T -Dcounters=$cts -Pmul.inc TemplateB64.nasm 32 | if [ $? -ne 0 ] ; then exit ; fi 33 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 34 | if [ $? -ne 0 ] ; then exit ; fi 35 | ./x >> results1/mul.txt 36 | done 37 | 38 | echo -e "\n\nThroughput with memory operand: $i , registersize $r (subtract 1 uop)" >> results1/mul.txt 39 | for cts in $PMClist 40 | do 41 | $ass -f elf64 -o b64.o -Dinstruct=$i -Dnumop=1 -Dregsize=$r -Dtmode=M -Dcounters=$cts -Pmul.inc TemplateB64.nasm 42 | if [ $? -ne 0 ] ; then exit ; fi 43 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 44 | if [ $? -ne 0 ] ; then exit ; fi 45 | ./x >> results1/mul.txt 46 | done 47 | done 48 | done 49 | 50 | echo -e "\n" >> results1/mul.txt 51 | 52 | # 2 - 3 operands: 53 | 54 | for i in imul 55 | do 56 | 57 | for n in 2 3 58 | do 59 | 60 | for r in 16 32 64 61 | do 62 | 63 | echo -e "\n\nLatency: $i , regsize $r, numop $n" >> results1/mul.txt 64 | 65 | $ass -f elf64 -o b64.o -Dinstruct=$i -Dnumop=$n -Dtmode=L -Dregsize=$r -Pmul.inc TemplateB64.nasm 66 | if [ $? -ne 0 ] ; then exit ; fi 67 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 68 | if [ $? -ne 0 ] ; then exit ; fi 69 | ./x >> results1/mul.txt 70 | 71 | echo -e "\n\nThroughput: $i , regsize $r, numop $n" >> results1/mul.txt 72 | for cts in $PMClist 73 | do 74 | $ass -f elf64 -o b64.o -Dinstruct=$i -Dnumop=$n -Dtmode=T -Dregsize=$r -Dcounters=$cts -Pmul.inc TemplateB64.nasm 75 | if [ $? -ne 0 ] ; then exit ; fi 76 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 77 | if [ $? 
-ne 0 ] ; then exit ; fi 78 | ./x >> results1/mul.txt 79 | done 80 | 81 | echo -e "\n\nThroughput with memory operand: $i , regsize $r, numop $n" >> results1/mul.txt 82 | for cts in $PMClist 83 | do 84 | $ass -f elf64 -o b64.o -Dinstruct=$i -Dnumop=$n -Dtmode=M -Dregsize=$r -Dcounters=$cts -Pmul.inc TemplateB64.nasm 85 | if [ $? -ne 0 ] ; then exit ; fi 86 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 87 | if [ $? -ne 0 ] ; then exit ; fi 88 | ./x >> results1/mul.txt 89 | done 90 | done 91 | done 92 | done 93 | 94 | echo -e "\n" >> results1/mul.txt 95 | 96 | -------------------------------------------------------------------------------- /src/phr_branch_target_xor.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // use with generate_gadget tool 8 | 9 | // defined in gen_phr_branch_target_xor_test() 10 | // args: loop count, buffer 11 | typedef void (*gadget)(size_t, uint32_t *); 12 | extern "C" { 13 | extern gadget phr_branch_target_xor_gadgets[]; 14 | } 15 | 16 | int main(int argc, char *argv[]) { 17 | int loop_count = 1000; 18 | // match gen_phr_branch_target_xor_test 19 | #if defined(HOST_AMD64) 20 | int min_branch_toggle = 1; 21 | int max_branch_toggle = 12; 22 | int min_target_toggle = 0; 23 | int max_target_toggle = 12; 24 | #else 25 | int min_branch_toggle = 2; 26 | int max_branch_toggle = 18; 27 | int min_target_toggle = 2; 28 | int max_target_toggle = 18; 29 | #endif 30 | 31 | bind_to_core(); 32 | #ifdef NO_COND_BRANCH_MISSES 33 | setup_perf_branch_misses(); 34 | #else 35 | setup_perf_cond_branch_misses(); 36 | #endif 37 | FILE *fp = fopen("phr_branch_target_xor.csv", "w"); 38 | assert(fp); 39 | 40 | uint32_t *buffer = new uint32_t[loop_count + 1]; 41 | 42 | fprintf(fp, "branch,target,min,avg,max\n"); 43 | int gadget_index = 0; 44 | int repeat = 2; // two branches 45 | for (int branch_toggle = min_branch_toggle; 46 | 
branch_toggle <= max_branch_toggle; branch_toggle++) { 47 | for (int target_toggle = min_target_toggle; 48 | target_toggle <= max_target_toggle; target_toggle++) { 49 | std::vector history; 50 | int iterations = 100; 51 | history.reserve(iterations); 52 | 53 | double sum = 0; 54 | // run several times 55 | for (int i = 0; i < iterations; i++) { 56 | for (int i = 0; i <= loop_count; i++) { 57 | buffer[i] = rand() % 2; 58 | } 59 | #ifdef NO_COND_BRANCH_MISSES 60 | uint64_t begin = perf_read_branch_misses(); 61 | #else 62 | uint64_t begin = perf_read_cond_branch_misses(); 63 | #endif 64 | phr_branch_target_xor_gadgets[gadget_index](loop_count, buffer); 65 | #ifdef NO_COND_BRANCH_MISSES 66 | uint64_t elapsed = perf_read_branch_misses() - begin; 67 | #else 68 | uint64_t elapsed = perf_read_cond_branch_misses() - begin; 69 | #endif 70 | 71 | // skip warmup 72 | if (i >= 10) { 73 | double time = (double)elapsed / loop_count / repeat; 74 | history.push_back(time); 75 | sum += time; 76 | } 77 | } 78 | gadget_index++; 79 | 80 | double min = history[0]; 81 | double max = history[0]; 82 | for (size_t i = 0; i < history.size(); i++) { 83 | if (min > history[i]) { 84 | min = history[i]; 85 | } 86 | if (max < history[i]) { 87 | max = history[i]; 88 | } 89 | } 90 | fprintf(fp, "%d,%d,%.2lf,%.2lf,%.2lf\n", branch_toggle, target_toggle, 91 | min, sum / history.size(), max); 92 | fflush(fp); 93 | } 94 | } 95 | 96 | printf("Results are written to phr_branch_target_xor.csv\n"); 97 | delete[] buffer; 98 | return 0; 99 | } 100 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/pushpop.sh1: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 2021-01-25 Agner Fog 3 | # Compile and run PMCTest for push and pop instructions 4 | # looping through list of instructions 5 | # (c) Copyright 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses 6 | 7 | . 
#!/bin/bash
#                                                        2021-01-25 Agner Fog
# Compile and run PMCTest for push and pop instructions
# looping through list of instructions
# (c) Copyright 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses

. vars.sh

results_file=results1/pushpop.txt

# build_and_run BITS DEST NASM_OPTION...
# Assembles pushpop.inc into bBITS.o with the given NASM options, links it
# with the existing aBITS.o into ./x, runs ./x and appends its output to DEST.
# Aborts the whole script with status 1 if assembling or linking fails.
# (A bare "exit" after "[ $? -ne 0 ]" would report success, because it
# returns the status of the test command itself.)
build_and_run() {
   local bits=$1 dest=$2
   shift 2
   $ass -f "elf$bits" -o "b$bits.o" "$@" -Ppushpop.inc "TemplateB$bits.nasm" || exit 1
   g++ -fno-pie -no-pie "-m$bits" "a$bits.o" "b$bits.o" -ox -lpthread || exit 1
   ./x >> "$dest"
}

# File header; this truncates any previous results file
{
   echo -e "push and pop instructions latency and throughput\n64 bit mode"
   echo -e "Operands:"
   echo -e "r = register"
   echo -e "I = immediate constant"
   echo -e "M = memory"
   echo -e "F = flags"
   echo -e "SP = stack pointer\n\n"
} > "$results_file"

# warmup
build_and_run 64 /dev/null -Dinstruct=push -Dregsize=64 -Doper=R -DWARMUPCOUNT=10000000

# 64 bit mode: register, stack pointer, memory and flags operands
for i in push pop
do
   for o in R SP M F
   do
      echo -e "\n\nThroughput: $i , operand: $o " >> "$results_file"
      for cts in $PMClist
      do
         build_and_run 64 "$results_file" -Dinstruct=$i -Dregsize=64 -Doper=$o -Dcounters=$cts
      done
   done
done

# 64 bit mode: immediate operand (push only)
for i in push
do
   for o in I
   do
      echo -e "\n\nThroughput: $i , operand: $o " >> "$results_file"
      for cts in $PMClist
      do
         build_and_run 64 "$results_file" -Dinstruct=$i -Dregsize=64 -Doper=$o -Dcounters=$cts
      done
   done
done

# 32 bit mode: single run per case, no counter list is passed
if [ "${support32bit:-0}" -ne 0 ] ; then

   echo -e "\n\n\npush and pop instructions latency and throughput\n32 bit mode" >> "$results_file"

   for i in push pop
   do
      for o in R SP M F
      do
         echo -e "\n\nThroughput: $i , operand: $o " >> "$results_file"
         build_and_run 32 "$results_file" -Dinstruct=$i -Dregsize=32 -Doper=$o
      done
   done

   for i in push
   do
      for o in I
      do
         echo -e "\n\nThroughput: $i , operand: $o " >> "$results_file"
         build_and_run 32 "$results_file" -Dinstruct=$i -Dregsize=32 -Doper=$o
      done
   done
fi

echo -e "\n" >> "$results_file"
-ne 0 ] ; then exit ; fi 91 | ./x >> results1/pushpop.txt 92 | 93 | done 94 | done 95 | fi 96 | 97 | echo -e "\n" >> results1/pushpop.txt 98 | 99 | -------------------------------------------------------------------------------- /src/detect_uarch.cpp: -------------------------------------------------------------------------------- 1 | #include "include/uarch.h" 2 | #include 3 | #include 4 | 5 | int main() { 6 | enum uarch uarch = get_uarch(); 7 | switch (uarch) { 8 | case firestorm: 9 | printf("-DAPPLE_SILICON\n"); 10 | printf("-DAPPLE_PCORE\n"); 11 | printf("-DAPPLE_M1\n"); 12 | printf("-DAPPLE_M1_FIRESTORM\n"); 13 | break; 14 | case icestorm: 15 | printf("-DAPPLE_SILICON\n"); 16 | printf("-DAPPLE_M1\n"); 17 | printf("-DAPPLE_M1_ICESTORM\n"); 18 | break; 19 | case avalanche: 20 | printf("-DAPPLE_SILICON\n"); 21 | printf("-DAPPLE_PCORE\n"); 22 | printf("-DAPPLE_M2\n"); 23 | printf("-DAPPLE_M2_AVALANCHE\n"); 24 | break; 25 | case blizzard: 26 | printf("-DAPPLE_SILICON\n"); 27 | printf("-DAPPLE_M2\n"); 28 | printf("-DAPPLE_M2_BLIZZARD\n"); 29 | break; 30 | case m4_pcore: 31 | printf("-DAPPLE_SILICON\n"); 32 | printf("-DAPPLE_PCORE\n"); 33 | printf("-DAPPLE_M4\n"); 34 | printf("-DAPPLE_M4_PCORE\n"); 35 | break; 36 | case m4_ecore: 37 | printf("-DAPPLE_SILICON\n"); 38 | printf("-DAPPLE_M4\n"); 39 | printf("-DAPPLE_M4_ECORE\n"); 40 | break; 41 | case oryon: 42 | printf("-DQUALCOMM_ORYON\n"); 43 | break; 44 | case cortex_a78: 45 | printf("-DARM_CORTEX_A78\n"); 46 | break; 47 | case cortex_a77: 48 | printf("-DARM_CORTEX_A77\n"); 49 | break; 50 | case cortex_x1: 51 | printf("-DARM_CORTEX_X1\n"); 52 | break; 53 | case neoverse_n1: 54 | printf("-DNO_FJCVTZS\n"); 55 | printf("-DARM_NEOVERSE_N1\n"); 56 | break; 57 | case neoverse_v1: 58 | printf("-DARM_NEOVERSE_V1\n"); 59 | break; 60 | case neoverse_n2: 61 | printf("-DARM_NEOVERSE_N2\n"); 62 | break; 63 | case neoverse_v2: 64 | printf("-DARM_NEOVERSE_V2\n"); 65 | break; 66 | case tsv110: 67 | printf("-DHISILICON_TSV110\n"); 68 
| break; 69 | case unknown_arm64: 70 | break; 71 | case golden_cove: 72 | printf("-DINTEL\n"); 73 | printf("-DINTEL_AHYBRID\n"); 74 | break; 75 | case gracemont: 76 | printf("-DINTEL\n"); 77 | printf("-DINTEL_AHYBRID\n"); 78 | break; 79 | case sunny_cove: 80 | printf("-DINTEL\n"); 81 | printf("-DINTEL_ICELAKE_SERVER\n"); 82 | break; 83 | case skylake: 84 | printf("-DINTEL\n"); 85 | printf("-DINTEL_SKYLAKE_SERVER\n"); 86 | break; 87 | case broadwell: 88 | printf("-DINTEL\n"); 89 | printf("-DINTEL_BROADWELL\n"); 90 | break; 91 | case zen1: 92 | printf("-DAMD\n"); 93 | printf("-DAMD_ZEN1\n"); 94 | break; 95 | case zen2: 96 | printf("-DAMD\n"); 97 | printf("-DAMD_ZEN2\n"); 98 | break; 99 | case zen3: 100 | printf("-DAMD\n"); 101 | printf("-DAMD_ZEN3\n"); 102 | break; 103 | case zen4: 104 | printf("-DAMD\n"); 105 | printf("-DAMD_ZEN4\n"); 106 | break; 107 | case zen5: 108 | printf("-DAMD\n"); 109 | printf("-DAMD_ZEN5\n"); 110 | break; 111 | case unknown_amd64: 112 | break; 113 | case la464: 114 | printf("-DLA464\n"); 115 | case unknown_loongarch64: 116 | break; 117 | default: 118 | assert(false); 119 | } 120 | return 0; 121 | } 122 | -------------------------------------------------------------------------------- /agner/testp/TestScripts/shift.inc: -------------------------------------------------------------------------------- 1 | ;---------------------------------------------------------------------------- 2 | ; shift.inc 2012-01-26 Agner Fog 3 | ; 4 | ; PMC Test program for shift and rotate instructions 5 | ; YASM syntax 6 | ; 7 | ; The following macros can be defined on the command line or in include files: 8 | ; 9 | ; instruct: The name of a single instruction to test 10 | ; 11 | ; regsize: Register size: 8, 16, 32, 64 12 | ; 13 | ; cntop: Count operand: must be integer constant or cl 14 | ; 15 | ; tmode: L: Latency 16 | ; T: Throughput 17 | ; M: Throughput with memory operand 18 | ; 19 | ; (c) Copyright 2012 by Agner Fog. 
;----------------------------------------------------------------------------
;                       shift.inc                    2012-01-26 Agner Fog
;
; PMC Test program for shift and rotate instructions
; YASM syntax
;
; The following macros can be defined on the command line or in include files:
;
; instruct:   The name of a single instruction to test
;
; regsize:    Register size: 8, 16, 32, 64
;
; cntop:      Count operand: must be integer constant or cl
;
; tmode:      L:  Latency
;             T:  Throughput
;             M:  Throughput with memory operand
;
; (c) Copyright 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
;-----------------------------------------------------------------------------

%ifndef tmode
   %define tmode L              ; latency is measured unless told otherwise
%endif

%ifndef cntop
   %define cntop 1              ; shift/rotate count defaults to 1
%endif


;##############################################################################
;#
;#                 Test code macro
;#
;##############################################################################

; placeholder hook, expands to nothing (currently unused)
%macro blockports 0
%endmacro


; main testcode macro
; Every test mode emits 100 copies of the instruction per loop iteration and
; the loop runs 100 times (ebp counts down from 100).
%macro testcode 0

   ; op2 is the extra source operand (with trailing comma) that only the
   ; double-precision shifts shld/shrd take; it is empty for everything else
   %define op2
   %ifidni instruct, shld
      %define shift_has_op2 1
   %elifidni instruct, shrd
      %define shift_has_op2 1
   %endif
   %ifdef shift_has_op2
      %if modesize == 64
         %define op2 reg7,
      %else
         %define op2 reg6,
      %endif
   %endif

   ; start loop
   mov ecx, 5                   ; presumably the count used when cntop is cl
   mov ebp, 100
   align 32
Testloop1:

   %ifidni tmode, L             ; latency: dependency chain through reg0

      %rep 100
         instruct reg0, op2 cntop
      %endrep

   %elifidni tmode, T           ; throughput, register operands

      %rep 25
         instruct reg0, op2 cntop
         blockports
         instruct reg1, op2 cntop
         blockports
         instruct reg3, op2 cntop   ; avoid ecx
         blockports
         instruct reg4, op2 cntop
         blockports
      %endrep

   %elifidni tmode, M           ; throughput, memory operand

      %rep 25
         instruct sizeptr [rsi], op2 cntop
         blockports
         instruct sizeptr [rsi+regsize], op2 cntop
         blockports
         instruct sizeptr [rsi+regsize*2], op2 cntop
         blockports
         instruct sizeptr [rsi+regsize*3], op2 cntop
         blockports
      %endrep

   %else
      %error unknown testmode
   %endif

   dec ebp
   jnz Testloop1

%endmacro ; testcode

; disable default test loops
%define repeat1 1
%define repeat2 1
;----------------------------------------------------------------------------
;                stack_sync_uops.inc                    2013-07-21 Agner Fog
;
; PMC Test program for testing stack synchronization uops
; (push/pop/call/ret mixed with explicit reads and updates of rsp)
; NASM syntax
;
; The following macros can be defined on the command line or in include files:
;
; tcase:   Test case number. See below for each case
;          1. Push and pop only
;          2. added mov r,[rsp]
;          3. further added mov r,rsp
;          4. call and ret
;          5. call and ret imm
;          6. call and ret and add rsp,const
;
; count1:  Number of iterations of the test loop (default 10)
;
; (c) Copyright 2013 by Agner Fog. GNU General Public License www.gnu.org/licenses
;-----------------------------------------------------------------------------

%ifndef tcase
   %define tcase 1         ; default case 1 (previously defined "case" by
                           ; mistake, so the default never took effect)
%endif

%ifndef count1
   %define count1 10       ; default count1
%endif


;##############################################################################
;#
;#                 Test code macros
;#
;##############################################################################


%if tcase < 4              ; Push and pop only (cases 1-3)

   %macro testcode 0
      nop
      nop
      mov ebp, count1
      align 16
      LL:
      push rax
      push rbx
      push rcx
      %if tcase > 1        ; cases 2 and 3: read memory through rsp
         mov r8,[rsp]
      %endif
      pop rdx
      pop rdx
      pop rdx
      %if tcase > 2        ; case 3 only: read rsp itself
         mov rdi,rsp
      %endif
      dec ebp
      jnz LL
   %endmacro

%elif tcase == 4           ; call and return

   ; test function placed out of line; execution jumps around it
   %macro testinit1 0
      jmp AROUND
      align 16
      TESTFUNC:
      nop
      nop
      nop
      ret
      align 16
      AROUND:
   %endmacro

   %macro testcode 0
      mov ebp, count1
      align 16
      LL:
      push rax
      push rbx
      call TESTFUNC
      pop rdx
      pop rdx
      dec ebp
      jnz LL
   %endmacro

%elif tcase == 5           ; call and ret imm

   %macro testinit1 0
      jmp AROUND
      align 16
      TESTFUNC:
      nop
      nop
      nop
      ret 16               ; also releases the two pushed qwords
      align 16
      AROUND:
   %endmacro

   %macro testcode 0
      mov ebp, count1
      align 16
      LL:
      push rax
      push rbx
      call TESTFUNC
      dec ebp
      jnz LL
   %endmacro

%elif tcase == 6           ; call and ret and add rsp,const

   %macro testinit1 0
      jmp AROUND
      align 16
      TESTFUNC:
      nop
      nop
      nop
      ret
      align 16
      AROUND:
   %endmacro

   %macro testcode 0
      mov ebp, count1
      align 16
      LL:
      push rax
      push rbx
      call TESTFUNC
      add rsp,8            ; drop one pushed qword by adjusting rsp directly
      pop rcx              ; and pop the other
      dec ebp
      jnz LL
   %endmacro

%else
   %error unknown test case tcase
%endif

; disable default test loops
%define repeat1 1
%define repeat2 1
#!/bin/bash
#                                                        2016-11-02 Agner Fog

# Compile and run PMCTest for different implementations of DAXPY algorithm
# (c) 2012-2016 by Agner Fog. GNU General Public License www.gnu.org/licenses

results_file=results2/daxpy.txt

echo -e "Different implementations of DAXPY\n" > "$results_file"

. vars.sh

ndat=2000
repeat1=100
nthreads=1

# run_case TCASE TITLE
# Writes TITLE to the results file, assembles daxpy.inc for test case TCASE,
# links it with a64.o into ./x and appends the program output to the results.
# Aborts the whole script with status 1 if assembling or linking fails.
# (A bare "exit" after "[ $? -ne 0 ]" would report success, because it
# returns the status of the test command itself.)
run_case() {
   local tcase=$1 title=$2
   echo -e "\n\n$title" >> "$results_file"
   $ass -f elf64 -o b64.o -Dtcase=$tcase -Dndat=$ndat -Drepeat1=$repeat1 -Dnthreads=$nthreads -Pdaxpy.inc TemplateB64.nasm || exit 1
   g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread || exit 1
   ./x >> "$results_file"
}

# cpu_has PATTERN
# True if cpuinfo.txt contains a case-insensitive match for PATTERN.
cpu_has() {
   grep -q -i -- "$1" cpuinfo.txt
}

echo -e "\n\n$repeat1 * $ndat double precision elements\n" >> "$results_file"

# The SSE2 case is run unconditionally
run_case 1 "Case 1: SSE2, 128 bit"

if cpu_has "avx" ; then
   run_case 2 "Case 2: AVX, 256 bit"
fi

if cpu_has "fma[ 3,\b]" ; then   # FMA3
   run_case 3 "Case 3: FMA3, 128 bit"
   run_case 4 "Case 4: FMA3, 256 bit"
fi

if cpu_has "fma4" ; then   # FMA4
   run_case 5 "Case 5: FMA4, 128 bit"
   run_case 6 "Case 6: FMA4, 256 bit"
fi

if cpu_has "avx512" ; then   # AVX512
   run_case 7 "Case 7: AVX512, 512 bit"
fi


echo -e "\n" >> "$results_file"
-ne 0 ] ; then exit ; fi 61 | ./x >> results2/daxpy.txt 62 | 63 | tcase=6 64 | echo -e "\n\nCase 6: FMA4, 256 bit" >> results2/daxpy.txt 65 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dndat=$ndat -Drepeat1=$repeat1 -Dnthreads=$nthreads -Pdaxpy.inc TemplateB64.nasm 66 | if [ $? -ne 0 ] ; then exit ; fi 67 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 68 | if [ $? -ne 0 ] ; then exit ; fi 69 | ./x >> results2/daxpy.txt 70 | fi 71 | 72 | if [ `grep -c -i "avx512" cpuinfo.txt ` -gt 0 ] ; then # AVX512 73 | tcase=7 74 | echo -e "\n\nCase 7: AVX512, 512 bit" >> results2/daxpy.txt 75 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dndat=$ndat -Drepeat1=$repeat1 -Dnthreads=$nthreads -Pdaxpy.inc TemplateB64.nasm 76 | if [ $? -ne 0 ] ; then exit ; fi 77 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread 78 | if [ $? -ne 0 ] ; then exit ; fi 79 | ./x >> results2/daxpy.txt 80 | fi 81 | 82 | 83 | echo -e "\n" >> results2/daxpy.txt 84 | 85 | -------------------------------------------------------------------------------- /src/fp_peak.cpp: -------------------------------------------------------------------------------- 1 | #include "include/utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // defined in fp_peak_gen.cpp 10 | // args: loop count 11 | typedef void (*gadget)(size_t); 12 | extern "C" { 13 | extern gadget fp_peak_gadgets[]; 14 | } 15 | 16 | int main(int argc, char *argv[]) { 17 | // match fp_peak_gen.cpp 18 | int repeat = 1000; 19 | int loop_count = 1000; 20 | 21 | #ifdef HOST_AARCH64 22 | int num_patterns = 6; 23 | char patterns[][20] = { 24 | "32-bit SP FMADD", "64-bit DP FMADD", "128-bit SP ASIMD", 25 | "128-bit DP ASIMD", "xxxx-bit SP SVE", "xxxx-bit DP SVE", 26 | }; 27 | 28 | int coef[] = { 29 | 32 / 32 * 2, // 32-bit SP 30 | 64 / 64 * 2, // 64-bit DP 31 | 128 / 32 * 2, // 128-bit SP 32 | 128 / 64 * 2, // 128-bit DP 33 | 0, // ?-bit SP 34 | 0, // ?-bit DP 35 | }; 36 | #else 37 | int num_patterns = 4; 38 | const char 
*patterns[] = { 39 | "256-bit SP FMA", 40 | "256-bit DP FMA", 41 | "512-bit SP AVX512F", 42 | "512-bit DP AVX512F", 43 | }; 44 | 45 | int coef[] = { 46 | 256 / 32 * 2, // 256-bit SP 47 | 256 / 64 * 2, // 256-bit DP 48 | 512 / 32 * 2, // 512-bit SP 49 | 512 / 64 * 2, // 512-bit DP 50 | }; 51 | #endif 52 | 53 | bind_to_core(); 54 | setup_perf_cycles(); 55 | FILE *fp = fopen("fp_peak.csv", "w"); 56 | assert(fp); 57 | 58 | int gadget_index = 0; 59 | fprintf(fp, "pattern,min,avg,max\n"); 60 | for (int pattern = 0; pattern < num_patterns; pattern++) { 61 | std::vector history; 62 | int iterations = 100; 63 | history.reserve(iterations); 64 | 65 | #ifdef HOST_AARCH64 66 | // read sve length in runtime 67 | if (pattern == 4) { 68 | uint64_t len = 0; 69 | asm __volatile__(".arch armv9-a+sve\ncntw %0" : "=r"(len)); 70 | sprintf(patterns[pattern], "%ld-bit SP SVE", len * 32); 71 | coef[pattern] = len * 2; 72 | } else if (pattern == 5) { 73 | uint64_t len = 0; 74 | asm __volatile__(".arch armv9-a+sve\ncntd %0" : "=r"(len)); 75 | sprintf(patterns[pattern], "%ld-bit DP SVE", len * 64); 76 | coef[pattern] = len * 2; 77 | } 78 | #endif 79 | 80 | double sum = 0; 81 | // run several times 82 | for (int i = 0; i < iterations; i++) { 83 | uint64_t begin = perf_read_cycles(); 84 | fp_peak_gadgets[gadget_index](loop_count); 85 | uint64_t elapsed = perf_read_cycles() - begin; 86 | 87 | // skip warmup 88 | if (i >= 10) { 89 | double time = 90 | (double)coef[pattern] / ((double)elapsed / loop_count / repeat); 91 | history.push_back(time); 92 | sum += time; 93 | } 94 | } 95 | 96 | double min = history[0]; 97 | double max = history[0]; 98 | for (size_t i = 0; i < history.size(); i++) { 99 | if (min > history[i]) { 100 | min = history[i]; 101 | } 102 | if (max < history[i]) { 103 | max = history[i]; 104 | } 105 | } 106 | 107 | fprintf(fp, "%s,%.2lf,%.2lf,%.2lf\n", patterns[pattern], min, 108 | sum / history.size(), max); 109 | fflush(fp); 110 | 111 | gadget_index++; 112 | } 113 | 114 | 
;                read_write_bandwidth.inc              2021-03-23 Agner Fog

; Test the maximum number of memory reads and writes per clock cycle
;
; (c) 2013-2021 by Agner Fog. GNU General Public License www.gnu.org/licenses
;
; Parameters:
;
; tmode:   Test mode:
;          R:    read only
;          W:    write only
;          RW:   one read and one write
;          RRW:  two reads and one write
;          RRRW: three reads and one write
;          RWW:  one read and two writes
;          RWW2: one read and two writes to different cache lines
;
; regsize: register size, bits
;          (65 selects the 64 bit mmx registers)

; Pick the move instruction that matches the register size
%if regsize <= 64
   %define mov1 mov
%elif regsize == 65              ; 64 bit mmx registers
   %define mov1 movq
%elif regsize == 128
   %define mov1 movdqa
%elif regsize == 256
   %define mov1 vmovdqa
%elif regsize == 512
   %define mov1 vmovdqa64
%else
   %error unknown register size regsize
%endif


; main testcode macro
; Memory operands are addressed as [psi + k*regsize/8], i.e. in units of one
; register width, except in RWW2 where the writes use fixed 0x40 byte steps.
%macro testcode 0

   %ifidni tmode, R              ; read only

      %rep 25
         mov1 reg0, [psi]
         mov1 reg1, [psi+regsize/8]
         mov1 reg2, [psi+2*regsize/8]
         mov1 reg3, [psi+3*regsize/8]
      %endrep

   %elifidni tmode, W            ; write only

      %rep 25
         mov1 [psi], reg0
         mov1 [psi+regsize/8], reg1
         mov1 [psi+2*regsize/8], reg2
         mov1 [psi+3*regsize/8], reg3
      %endrep

   %elifidni tmode, RW           ; one read and one write

      %rep 50
         mov1 reg0, [psi]
         mov1 [psi+regsize/8], reg1
         mov1 reg2, [psi+2*regsize/8]
         mov1 [psi+3*regsize/8], reg3
      %endrep

   %elifidni tmode, RRW          ; two reads and one write

      %rep 50
         mov1 reg0, [psi]
         mov1 reg1, [psi+regsize/8]
         mov1 [psi+4*regsize/8], reg2
         mov1 reg3, [psi+2*regsize/8]
         mov1 reg4, [psi+3*regsize/8]
         mov1 [psi+5*regsize/8], reg5
      %endrep

   %elifidni tmode, RRRW         ; three reads and one write

      %rep 50
         mov1 reg0, [psi]
         mov1 reg1, [psi+2*regsize/8]
         mov1 reg2, [psi+4*regsize/8]
         ; NOTE(review): the store source reg2 is the register loaded on the
         ; previous line; every other mode stores from a register that is not
         ; loaded in the loop. Confirm whether reg3 was intended.
         mov1 [psi+6*regsize/8], reg2
      %endrep

   %elifidni tmode, RWW          ; one read and two writes

      %rep 50
         mov1 reg0, [psi]
         mov1 [psi+2*regsize/8], reg1
         mov1 [psi+3*regsize/8], reg2
         mov1 reg3, [psi+1*regsize/8]
         mov1 [psi+4*regsize/8], reg4
         mov1 [psi+5*regsize/8], reg5
      %endrep

   %elifidni tmode, RWW2         ; one read and two writes to different cache lines

      %rep 50
         mov1 reg0, [psi]
         mov1 [psi+0x40], reg1
         mov1 [psi+0x80], reg2
         mov1 reg3, [psi+1*regsize/8]
         mov1 [psi+0xC0], reg4
         mov1 [psi+0x100], reg5
      %endrep

   %else

      %error unknown test mode tmode

   %endif

%endmacro

; test loops
%define repeat1 1000
%define repeat2 1
/***************************  timingtest.h   ****************************
* Author:        Agner Fog
* Date created:  2014-04-15
* Last modified: 2014-04-15
* Project:       define functions for timing purposes etc.
* Description:
* Thin, platform-specific wrappers around the cpuid, rdtsc and rdpmc
* instructions:
*   cpuid_(output, functionnumber)  store eax, ebx, ecx, edx in output[0..3]
*   serialize()                     serialize the CPU with cpuid function 0
*   readtsc()                       read the time stamp counter
*   readpmc(n)                      read performance monitor counter n
*
******************************************************************************/

#pragma once
#include <stdint.h>

#if defined(__WINDOWS__) || defined(_WIN32) || defined(_WIN64)
// System-specific definitions for Windows

#if 1  // if intrin.h has __cpuid, __rdtsc and __readpmc

#include <intrin.h>

static inline void cpuid_ (int32_t output[4], int32_t functionnumber) {
    __cpuid(output, functionnumber);
}

// serialize CPU by cpuid function 0
static inline void serialize (void) {
    int dummy[4];
    cpuid_(dummy, 0);
    // Prevent the compiler from optimizing away the whole Serialize function:
    volatile int DontSkip = dummy[0];
    (void)DontSkip;
}

// read time stamp counter
// NOTE(review): this branch returns int64_t while the other branches return
// uint64_t; left unchanged so existing callers keep compiling as before.
static inline int64_t readtsc(void) {
    return __rdtsc();
}

// read performance monitor counter
static inline int64_t readpmc(int32_t nPerfCtr) {
    return __readpmc(nPerfCtr);
}


#else  // intrin.h missing. use inline assembly

// inline MASM syntax

static inline void cpuid_ (int32_t output[4], int32_t functionnumber) {
    __asm {
        mov eax, functionnumber;
        cpuid;
        mov esi, output;
        mov [esi],    eax;
        mov [esi+4],  ebx;
        mov [esi+8],  ecx;
        mov [esi+12], edx;
    }
}

// serialize CPU by cpuid function 0
static inline void serialize (void) {
    __asm {
        xor eax, eax;
        cpuid;
    }
}

// Both functions below leave their result in edx:eax and have no return
// statement, hence warning 4035 is disabled around them.
#pragma warning(disable:4035)
// read time stamp counter
static inline uint64_t readtsc(void) {
    __asm {
        rdtsc
    }
}

// read performance monitor counter number nPerfCtr
static inline uint64_t readpmc(int32_t nPerfCtr) {
    __asm {
        mov ecx, nPerfCtr
        rdpmc
    }
}
#pragma warning(default:4035)

#endif


#elif defined(__unix__) || defined(__linux__)
// System-specific definitions for Linux

#include <cpuid.h>

// Store eax, ebx, ecx, edx of cpuid leaf functionnumber in output[0..3].
// __get_cpuid() writes nothing when the leaf is not supported; in that case
// all four outputs are set to 0 instead of being left uninitialized.
static inline void cpuid_ (int32_t output[4], int32_t functionnumber) {
    uint32_t a = 0, b = 0, c = 0, d = 0;
    if (!__get_cpuid((unsigned int)functionnumber, &a, &b, &c, &d)) {
        a = b = c = d = 0;
    }
    output[0] = (int32_t)a;
    output[1] = (int32_t)b;
    output[2] = (int32_t)c;
    output[3] = (int32_t)d;
}

// serialize CPU by cpuid function 0
static inline void serialize (void) {
    // cpuid overwrites eax as well, so eax must be a read-write operand;
    // an input-only operand may not be modified by the asm statement.
    uint32_t a = 0;
    __asm __volatile__ ("cpuid" : "+a"(a) : : "ebx", "ecx", "edx" );
}

// read time stamp counter
static inline uint64_t readtsc(void) {
    uint32_t lo, hi;
    __asm __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi) : : );
    return lo | (uint64_t)hi << 32;
}

// read performance monitor counter
static inline uint64_t readpmc(int32_t n) {
    uint32_t lo, hi;
    __asm __volatile__ ("rdpmc" : "=a"(lo), "=d"(hi) : "c"(n) : );
    return lo | (uint64_t)hi << 32;
}


#else  // not Windows or Unix

#error Unknown platform

#endif