├── results
    ├── .gitignore
    ├── rob_size
    │   ├── amd_zen_1.png
    │   ├── amd_zen_2.png
    │   ├── ibm_power_8.png
    │   ├── arm_cortex_a72.png
    │   ├── arm_neoverse_n1.png
    │   ├── intel_broadwell.png
    │   ├── intel_cascade_lake.png
    │   ├── intel_ivy_bridge_ep.png
    │   └── README.md
    └── instruction_latency
    │   ├── intel_broadwell.csv
    │   ├── loongson_3a6000.csv
    │   ├── loongson_3c5000.csv
    │   └── huawei_kunpeng920.csv
├── .clangd
├── agner
    ├── testp
    │   ├── TestScripts
    │   │   ├── MSRdrvL.h
    │   │   ├── PMCTestLinux.h
    │   │   ├── PMCTestB32.nasm
    │   │   ├── my_branch5.sh2
    │   │   ├── my_phr_length.sh2
    │   │   ├── my_branch.sh2
    │   │   ├── init64.sh
    │   │   ├── pack_results.sh
    │   │   ├── my_branch2.sh2
    │   │   ├── my_branch4.sh2
    │   │   ├── my_branch3.sh2
    │   │   ├── a64.sh
    │   │   ├── c64.sh
    │   │   ├── a32.sh
    │   │   ├── c32.sh
    │   │   ├── allsh2.sh
    │   │   ├── allcsv.sh
    │   │   ├── my_phr_length.inc
    │   │   ├── testmemcpyalign.sh2
    │   │   ├── allsh1.sh
    │   │   ├── returnstack.sh2
    │   │   ├── instruct_boundaries.inc
    │   │   ├── my_branch5.py
    │   │   ├── my_phr_length.py
    │   │   ├── instruct_boundaries.sh2
    │   │   ├── memcpy.inc
    │   │   ├── alltests.sh
    │   │   ├── cache_banks.inc
    │   │   ├── stack_sync_uops.sh2
    │   │   ├── my_branch2.py
    │   │   ├── pointer_chasing.inc
    │   │   ├── my_branch4.py
    │   │   ├── returnstack.inc
    │   │   ├── out_of_order.sh2
    │   │   ├── unaligned_mem.sh2
    │   │   ├── latencycf.inc
    │   │   ├── my_branch.inc
    │   │   ├── 32bitinstr.sh1
    │   │   ├── unaligned_mem.inc
    │   │   ├── MSRDriver.h
    │   │   ├── my_branch.py
    │   │   ├── jmp.sh2
    │   │   ├── my_branch2.inc
    │   │   ├── my_branch4.inc
    │   │   ├── my_branch5.inc
    │   │   ├── ucache_misprediction.sh2
    │   │   ├── out_of_order.inc
    │   │   ├── read_write_bandwidth.sh2
    │   │   ├── my_branch3.inc
    │   │   ├── fused_branch.sh2
    │   │   ├── warmup_fp.inc
    │   │   ├── ucache_misprediction.inc
    │   │   ├── length_chg_prefix.sh2
    │   │   ├── 32bitinstr.inc
    │   │   ├── my_branch3.py
    │   │   ├── loop_buffer.sh2
    │   │   ├── mul.sh1
    │   │   ├── pushpop.sh1
    │   │   ├── shift.inc
    │   │   ├── stack_sync_uops.inc
    │   │   ├── daxpy.sh2
    │   │   └── read_write_bandwidth.inc
    │   ├── testp.pdf
    │   ├── PMCTest
    │   │   ├── MSRdrvL.h
    │   │   ├── PMCTestA.cpp
    │   │   ├── PMCTestB.cpp
    │   │   ├── PMCTestB32.asm
    │   │   ├── PMCTestB32.nasm
    │   │   ├── PMCTestB64.asm
    │   │   ├── PMCTestLinux.h
    │   │   ├── uninstall.cpp
    │   │   ├── stopcounters.bat
    │   │   ├── startcounters.bat
    │   │   ├── a64.sh
    │   │   ├── c64.sh
    │   │   ├── a32.sh
    │   │   ├── c32.sh
    │   │   ├── make_a_obj.bat
    │   │   ├── MSRDriver.h
    │   │   └── timingtest.h
    │   └── DriverSrcLinux
    │   │   ├── MSRdrv.c
    │   │   ├── uninstall.sh
    │   │   ├── MSRdrv1.c
    │   │   ├── MSRdrvL.h
    │   │   ├── install1.sh
    │   │   ├── install.sh
    │   │   ├── Makefile
    │   │   ├── DriverSrcLinux.txt
    │   │   └── MSRDriver.h
    ├── NOTES.md
    └── .gitignore
├── reports
    └── dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon
    │   ├── .gitignore
    │   ├── plot_pht_tag_bits_xor_oryon.png
    │   ├── plot_phr_target_bits_location.png
    │   ├── plot_pht_associativity_oryon.png
    │   ├── plot_pht_associativity_firestorm.png
    │   ├── plot_pht_tag_bits_xor_firestorm.png
    │   ├── plot_pht_tag_bits_xor_phr_oryon.png
    │   ├── plot_phr_branch_bits_location_oryon.png
    │   ├── plot_pht_tag_bits_xor_phr_firestorm.png
    │   ├── plot_phr_branch_bits_location_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrt_pc_1st_oryon.png
    │   ├── plot_pht_index_bits_xor_phrt_pc_2nd_oryon.png
    │   ├── plot_pht_index_bits_xor_phrb_pc_3rd_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrb_phrb_3rd_oryon.png
    │   ├── plot_pht_index_bits_xor_phrb_phrb_4th_oryon.png
    │   ├── plot_pht_index_bits_xor_phrb_phrt_3rd_oryon.png
    │   ├── plot_pht_index_bits_xor_phrb_phrt_4th_oryon.png
    │   ├── plot_pht_index_bits_xor_phrt_pc_1st_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrt_pc_2nd_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrt_pc_3rd_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrt_pc_3rd_pc8_oryon.png
    │   ├── plot_pht_index_bits_xor_phrt_phrb_1st_oryon.png
    │   ├── plot_pht_index_bits_xor_phrt_phrb_2nd_oryon.png
    │   ├── plot_pht_index_bits_xor_phrt_phrb_3rd_oryon.png
    │   ├── plot_pht_index_bits_xor_phrt_phrb_4th_oryon.png
    │   ├── plot_pht_index_bits_xor_phrt_phrt_1st_oryon.png
    │   ├── plot_pht_index_bits_xor_phrt_phrt_2nd_oryon.png
    │   ├── plot_pht_index_bits_xor_phrt_phrt_3rd_oryon.png
    │   ├── plot_pht_index_bits_xor_phrt_phrt_4th_oryon.png
    │   ├── plot_pht_index_bits_xor_phrb_phrb_3rd_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrb_phrb_4th_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrb_phrb_5th_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrb_phrt_4th_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrb_phrt_5th_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrt_pc_3rd_pc10_oryon.png
    │   ├── plot_pht_index_bits_xor_phrt_phrb_1st_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrt_phrb_2nd_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrt_phrb_3rd_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrt_phrb_4th_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrt_phrb_5th_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrt_phrt_1st_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrt_phrt_2nd_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrt_phrt_3rd_firestorm.png
    │   ├── plot_pht_index_bits_xor_phrt_phrt_4th_firestorm.png
    │   └── plot_pht_index_bits_xor_phrt_phrt_5th_firestorm.png
├── .gitignore
├── src
    ├── itlb_size_gen.cpp
    ├── pht_index_bits_xor_phr_gen.cpp
    ├── pht_index_bits_xor_gen.cpp
    ├── ghr_size.cpp
    ├── phr_size.cpp
    ├── btb_size_basic.cpp
    ├── ras_size.cpp
    ├── elimination.cpp
    ├── instruction_latency.cpp
    ├── itlb_size.cpp
    ├── ras_size_lib.cpp
    ├── ghr_size_lib.cpp
    ├── rob_size.cpp
    ├── pht_associativity.cpp
    ├── ras_size_gen.cpp
    ├── rob_size_gen.cpp
    ├── pht_index_tag_bits.cpp
    ├── ghr_size_gen.cpp
    ├── elimination_lib.cpp
    ├── pht_tag_bits_xor_phr.cpp
    ├── pht_tag_bits_xor.cpp
    ├── phr_size_lib.cpp
    ├── btb_size_basic_lib.cpp
    ├── find_branch_misses_pmu.cpp
    ├── phr_branch_target_xor.cpp
    ├── detect_uarch.cpp
    └── fp_peak.cpp
├── meson_options.txt
├── aarch64-linux-cross.txt
├── android-cross.txt
├── default.nix
├── pyproject.toml
├── include
    ├── counters.h
    ├── uarch.h
    └── counters_mapping.h
├── figures
    ├── plot_dtlb_size.py
    ├── plot_rob_size.py
    ├── plot_ras_size.py
    ├── plot_phr_size.py
    ├── plot_bp_size.py
    ├── plot_pht_index_bits_xor.py
    ├── plot_memory_latency.py
    ├── plot_btb_size.py
    ├── plot_pht_associativity.py
    ├── plot_pht_tag_bits_xor.py
    ├── plot_pht_tag_bits_xor_phr.py
    ├── plot_phr_branch_bits_location.py
    ├── plot_pht_index_bits_xor_phr.py
    └── plot_phr_target_bits_location.py
├── README.md
├── stale.yml
├── ANDROID-PERF.md
├── ios-cross.txt
└── Makefile


/results/.gitignore:
--------------------------------------------------------------------------------
1 | !*.png
2 | !*.csv
3 | 


--------------------------------------------------------------------------------
/.clangd:
--------------------------------------------------------------------------------
1 | CompileFlags:
2 |   CompilationDatabase: builddir/
3 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/MSRdrvL.h:
--------------------------------------------------------------------------------
1 | ../DriverSrcLinux/MSRdrvL.h


--------------------------------------------------------------------------------
/agner/testp/TestScripts/PMCTestLinux.h:
--------------------------------------------------------------------------------
1 | ../PMCTest/PMCTestLinux.h


--------------------------------------------------------------------------------
/agner/NOTES.md:
--------------------------------------------------------------------------------
1 | testp downloaded from https://www.agner.org/optimize/#testp
2 | 


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/.gitignore:
--------------------------------------------------------------------------------
1 | !*.png
2 | 


--------------------------------------------------------------------------------
/agner/testp/testp.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/testp.pdf


--------------------------------------------------------------------------------
/agner/testp/PMCTest/MSRdrvL.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/MSRdrvL.h


--------------------------------------------------------------------------------
/results/rob_size/amd_zen_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/amd_zen_1.png


--------------------------------------------------------------------------------
/results/rob_size/amd_zen_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/amd_zen_2.png


--------------------------------------------------------------------------------
/agner/testp/PMCTest/PMCTestA.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/PMCTestA.cpp


--------------------------------------------------------------------------------
/agner/testp/PMCTest/PMCTestB.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/PMCTestB.cpp


--------------------------------------------------------------------------------
/results/rob_size/ibm_power_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/ibm_power_8.png


--------------------------------------------------------------------------------
/agner/testp/DriverSrcLinux/MSRdrv.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/DriverSrcLinux/MSRdrv.c


--------------------------------------------------------------------------------
/agner/testp/DriverSrcLinux/uninstall.sh:
--------------------------------------------------------------------------------
1 | # Uninstall MSR driver
2 | rm -f /dev/MSRdrv
3 | rmmod MSRdrv
4 | # modprobe -r MSRdrv
5 | 


--------------------------------------------------------------------------------
/agner/testp/PMCTest/PMCTestB32.asm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/PMCTestB32.asm


--------------------------------------------------------------------------------
/agner/testp/PMCTest/PMCTestB32.nasm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/PMCTestB32.nasm


--------------------------------------------------------------------------------
/agner/testp/PMCTest/PMCTestB64.asm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/PMCTestB64.asm


--------------------------------------------------------------------------------
/agner/testp/PMCTest/PMCTestLinux.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/PMCTestLinux.h


--------------------------------------------------------------------------------
/agner/testp/PMCTest/uninstall.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/PMCTest/uninstall.cpp


--------------------------------------------------------------------------------
/results/rob_size/arm_cortex_a72.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/arm_cortex_a72.png


--------------------------------------------------------------------------------
/agner/testp/DriverSrcLinux/MSRdrv1.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/DriverSrcLinux/MSRdrv1.c


--------------------------------------------------------------------------------
/agner/testp/DriverSrcLinux/MSRdrvL.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/DriverSrcLinux/MSRdrvL.h


--------------------------------------------------------------------------------
/results/rob_size/arm_neoverse_n1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/arm_neoverse_n1.png


--------------------------------------------------------------------------------
/results/rob_size/intel_broadwell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/intel_broadwell.png


--------------------------------------------------------------------------------
/agner/testp/TestScripts/PMCTestB32.nasm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/agner/testp/TestScripts/PMCTestB32.nasm


--------------------------------------------------------------------------------
/results/rob_size/intel_cascade_lake.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/intel_cascade_lake.png


--------------------------------------------------------------------------------
/results/rob_size/intel_ivy_bridge_ep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/results/rob_size/intel_ivy_bridge_ep.png


--------------------------------------------------------------------------------
/agner/testp/DriverSrcLinux/install1.sh:
--------------------------------------------------------------------------------
1 | # Install MSR driver
2 | mknod /dev/MSRdrv c 249 0
3 | chmod 666 /dev/MSRdrv
4 | insmod -f MSRdrv.ko
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | builddir
 2 | build
 3 | .vscode
 4 | *.png
 5 | *.pdf
 6 | *.csv
 7 | result
 8 | .cache
 9 | *.swp
10 | *.bin
11 | venv
12 | perf.data
13 | perf.data.old
14 | 


--------------------------------------------------------------------------------
/src/itlb_size_gen.cpp:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | 
// Creates (or truncates) the file named by argv[1] and leaves it empty.
// Nothing is written because this benchmark is "jit only"; the program
// only produces the output file itself.
//
// Returns 0 on success, 1 if the output path is missing or the file
// cannot be created/closed.
int main(int argc, char *argv[]) {
  if (argc < 2) {
    fprintf(stderr, "usage: %s <output-file>\n", argc > 0 ? argv[0] : "gen");
    return 1;
  }

  FILE *fp = fopen(argv[1], "w");
  if (fp == NULL) {
    // fclose(NULL) is undefined behaviour, so bail out before reaching it.
    perror(argv[1]);
    return 1;
  }

  // jit only
  if (fclose(fp) != 0) {
    perror(argv[1]);
    return 1;
  }
  return 0;
}
9 | 


--------------------------------------------------------------------------------
/agner/.gitignore:
--------------------------------------------------------------------------------
 1 | *.zip
 2 | x
 3 | *.o
 4 | *.mod
 5 | *.mod.c
 6 | a.out
 7 | *.lst
 8 | *.ko
 9 | *.order
10 | *.symvers
11 | *.cmd
12 | *.exe
13 | *.sys
14 | *.obj
15 | cpugetinfo
16 | countertypes.inc
17 | cpuinfo.txt
18 | info.txt
19 | 
20 | 


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_phr_target_bits_location.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_phr_target_bits_location.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_associativity_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_associativity_oryon.png


--------------------------------------------------------------------------------
/src/pht_index_bits_xor_phr_gen.cpp:
--------------------------------------------------------------------------------
 1 | #include <cstdio>
 2 | 
 3 | // check index bit conflict xor PHR vs PHR
 4 | int main(int argc, char *argv[]) {
 5 |   FILE *fp = fopen(argv[1], "w");
 6 |   // jit only
 7 |   fclose(fp);
 8 |   return 0;
 9 | }
10 | 


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_associativity_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_associativity_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_phr_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_phr_oryon.png


--------------------------------------------------------------------------------
/meson_options.txt:
--------------------------------------------------------------------------------
1 | option('ios', type : 'boolean', value : false)
2 | option('android', type : 'boolean', value : false)
3 | option('gem5', type : 'boolean', value : false)
4 | option('linux-cross', type : 'combo', choices: ['none', 'aarch64'], value : 'none')
5 | 


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_phr_branch_bits_location_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_phr_branch_bits_location_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_phr_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_tag_bits_xor_phr_firestorm.png


--------------------------------------------------------------------------------
/aarch64-linux-cross.txt:
--------------------------------------------------------------------------------
 1 | [binaries]
 2 | c = ['aarch64-linux-gnu-gcc']
 3 | cpp = ['aarch64-linux-gnu-g++']
 4 | ar = 'ar'
 5 | strip = 'strip'
 6 | 
 7 | [host_machine]
 8 | system = 'linux'
 9 | cpu_family = 'aarch64'
10 | cpu = 'aarch64'
11 | endian = 'little'
12 | 


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_phr_branch_bits_location_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_phr_branch_bits_location_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_1st_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_1st_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_2nd_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_2nd_oryon.png


--------------------------------------------------------------------------------
/android-cross.txt:
--------------------------------------------------------------------------------
 1 | [binaries]
 2 | c = ['aarch64-linux-android31-clang']
 3 | cpp = ['aarch64-linux-android31-clang++']
 4 | ar = 'ar'
 5 | strip = 'strip'
 6 | 
 7 | [host_machine]
 8 | system = 'linux'
 9 | cpu_family = 'aarch64'
10 | cpu = 'aarch64'
11 | endian = 'little'
12 | 


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_pc_3rd_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_pc_3rd_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_3rd_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_3rd_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_4th_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_4th_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_3rd_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_3rd_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_4th_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_4th_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_1st_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_1st_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_2nd_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_2nd_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_3rd_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_3rd_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_3rd_pc8_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_3rd_pc8_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_1st_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_1st_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_2nd_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_2nd_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_3rd_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_3rd_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_4th_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_4th_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_1st_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_1st_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_2nd_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_2nd_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_3rd_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_3rd_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_4th_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_4th_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_3rd_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_3rd_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_4th_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_4th_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_5th_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrb_5th_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_4th_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_4th_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_5th_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrb_phrt_5th_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_3rd_pc10_oryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_pc_3rd_pc10_oryon.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_1st_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_1st_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_2nd_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_2nd_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_3rd_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_3rd_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_4th_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_4th_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_5th_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrb_5th_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_1st_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_1st_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_2nd_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_2nd_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_3rd_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_3rd_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_4th_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_4th_firestorm.png


--------------------------------------------------------------------------------
/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_5th_firestorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu-micro-benchmarks/HEAD/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/plot_pht_index_bits_xor_phrt_phrt_5th_firestorm.png


--------------------------------------------------------------------------------
/src/pht_index_bits_xor_gen.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | 
 3 | // check index bit conflict xor PC vs PHR
 4 | //
 5 | // Creates (or truncates) the file named by argv[1] and leaves it empty,
 6 | // since this test uses jit exclusively. Returns 0 on success and 1 on a
 7 | // missing argument or I/O failure.
 8 | int main(int argc, char *argv[]) {
 9 |   // Without an argument argv[1] is a null pointer; do not pass it to fopen().
10 |   if (argc < 2) {
11 |     fprintf(stderr, "usage: %s <output file>\n",
12 |             argc > 0 ? argv[0] : "pht_index_bits_xor_gen");
13 |     return 1;
14 |   }
15 |   FILE *fp = fopen(argv[1], "w");
16 |   if (fp == NULL) {
17 |     perror(argv[1]);
18 |     return 1;
19 |   }
20 |   // use jit exclusively
21 |   if (fclose(fp) != 0) {
22 |     perror(argv[1]);
23 |     return 1;
24 |   }
25 |   return 0;
26 | }
27 | 


--------------------------------------------------------------------------------
/default.nix:
--------------------------------------------------------------------------------
 1 | with import <nixpkgs> {};
 2 | 
 3 | stdenv.mkDerivation {
 4 |   name = "cpu-micro-benchmarks";
 5 |   version = "1.0";
 6 | 
 7 |   src = ./.;
 8 | 
 9 |   nativeBuildInputs = [
10 |     meson
11 |     ninja
12 |   ];
13 | 
14 |   buildInputs = [
15 |   ];
16 | }
17 | 
18 | 


--------------------------------------------------------------------------------
/results/instruction_latency/intel_broadwell.csv:
--------------------------------------------------------------------------------
 1 | name,latency,throughput
 2 | int_add,1.00,0.00
 3 | int_andn,1.00,0.50
 4 | int_lea_add,1.00,0.50
 5 | sse_addpd,3.00,0.00
 6 | sse_addsd,3.00,0.00
 7 | sse_mulpd,3.00,0.00
 8 | sse_mulsd,3.00,0.00
 9 | sse_subpd,3.00,0.00
10 | sse_subsd,3.00,0.00
11 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_branch5.sh2:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Assemble MyTemplateB64.nasm with my_branch5.inc pre-included, link it with
3 | # a64.o and run ./x. $1 = branchalign.
4 | # "|| exit" stops on failure with the failing tool's status (a bare "exit"
5 | # after a "[ $? -ne 0 ]" test would report the status of that test, i.e. 0).
6 | nasm -f elf64 -l b64.lst -o b64.o -Dbranchalign="$1" -Dcounters=1,9,201,250 -Pmy_branch5.inc MyTemplateB64.nasm || exit
7 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread || exit
8 | ./x
9 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_phr_length.sh2:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Assemble MyTemplateB64.nasm with my_phr_length.inc pre-included, link it
3 | # with a64.o and run ./x. $1 = dummybranches.
4 | # "|| exit" stops on failure with the failing tool's status (a bare "exit"
5 | # after a "[ $? -ne 0 ]" test would report the status of that test, i.e. 0).
6 | nasm -f elf64 -l b64.lst -o b64.o -Ddummybranches="$1" -Dcounters=1,9,201,250 -Pmy_phr_length.inc MyTemplateB64.nasm || exit
7 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread || exit
8 | ./x
9 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_branch.sh2:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Assemble MyTemplateB64.nasm with my_branch.inc pre-included, link it with
3 | # a64.o and run ./x. $1 = branchalign, $2 = targetalign.
4 | # "|| exit" stops on failure with the failing tool's status (a bare "exit"
5 | # after a "[ $? -ne 0 ]" test would report the status of that test, i.e. 0).
6 | nasm -f elf64 -l b64.lst -o b64.o -Dbranchalign="$1" -Dtargetalign="$2" -Dcounters=1,9,201,250 -Pmy_branch.inc MyTemplateB64.nasm || exit
7 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread || exit
8 | ./x
9 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/init64.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # init64.sh                                            2016-09-25 Agner Fog
3 | 
4 | # Initialization of files before running test scripts
5 | # Run 64-bit mode only
6 | # (c) Copyright 2016 by Agner Fog. GNU General Public License www.gnu.org/licenses
7 | 
8 | ./init.sh 64  # hand over to init.sh with the argument 64 (64-bit mode only)
9 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/pack_results.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # pack_results.sh                                        2016-10-27 Agner Fog
3 | # (c) Copyright 2013-2016 by Agner Fog. GNU General Public License www.gnu.org/licenses
4 | 
5 | # pack all results into zipfile
6 | zip -q allresults.zip results/* results1/* results2/*
7 | 


--------------------------------------------------------------------------------
/src/ghr_size.cpp:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | 
 5 | // Benchmark routine defined in another translation unit.
 6 | extern void ghr_size(FILE *fp);
 7 | 
 8 | // Creates ghr_size.csv in the working directory, passes the stream to
 9 | // ghr_size() and prints where the results were written.
10 | // Returns 0 on success, 1 when the output file cannot be opened.
11 | int main(int argc, char *argv[]) {
12 |   FILE *fp = fopen("ghr_size.csv", "w");
13 |   // Check explicitly: assert() is compiled out under NDEBUG, which would
14 |   // hand a null stream to the benchmark.
15 |   if (fp == NULL) {
16 |     perror("ghr_size.csv");
17 |     return 1;
18 |   }
19 |   ghr_size(fp);
20 |   printf("Results are written to ghr_size.csv\n");
21 |   return 0;
22 | }
23 | 


--------------------------------------------------------------------------------
/src/phr_size.cpp:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | 
 5 | // Benchmark routine defined in another translation unit.
 6 | extern void phr_size(FILE *fp);
 7 | 
 8 | // Creates phr_size.csv in the working directory, passes the stream to
 9 | // phr_size() and prints where the results were written.
10 | // Returns 0 on success, 1 when the output file cannot be opened.
11 | int main(int argc, char *argv[]) {
12 |   FILE *fp = fopen("phr_size.csv", "w");
13 |   // Check explicitly: assert() is compiled out under NDEBUG, which would
14 |   // hand a null stream to the benchmark.
15 |   if (fp == NULL) {
16 |     perror("phr_size.csv");
17 |     return 1;
18 |   }
19 |   phr_size(fp);
20 |   printf("Results are written to phr_size.csv\n");
21 |   return 0;
22 | }
23 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_branch2.sh2:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Assemble MyTemplateB64.nasm with my_branch2.inc pre-included, link it with
3 | # a64.o and run ./x. $1..$4 = branchalign, targetalign, branchtoggle, targettoggle.
4 | # "|| exit" stops on failure with the failing tool's status (a bare "exit"
5 | # after a "[ $? -ne 0 ]" test would report the status of that test, i.e. 0).
6 | nasm -f elf64 -l b64.lst -o b64.o -Dbranchalign="$1" -Dtargetalign="$2" -Dbranchtoggle="$3" -Dtargettoggle="$4" -Dcounters=1,9,201,250 -Pmy_branch2.inc MyTemplateB64.nasm || exit
7 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread || exit
8 | ./x
9 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_branch4.sh2:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Assemble MyTemplateB64.nasm with my_branch4.inc pre-included, link it with
3 | # a64.o and run ./x. $1..$4 = branchalign, targetalign, branchtoggle, targettoggle.
4 | # "|| exit" stops on failure with the failing tool's status (a bare "exit"
5 | # after a "[ $? -ne 0 ]" test would report the status of that test, i.e. 0).
6 | nasm -f elf64 -l b64.lst -o b64.o -Dbranchalign="$1" -Dtargetalign="$2" -Dbranchtoggle="$3" -Dtargettoggle="$4" -Dcounters=1,9,201,250 -Pmy_branch4.inc MyTemplateB64.nasm || exit
7 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread || exit
8 | ./x
9 | 


--------------------------------------------------------------------------------
/agner/testp/PMCTest/stopcounters.bat:
--------------------------------------------------------------------------------
 1 | rem  Example .bat script to stop PMC counters.
 2 | rem  Must run as administrator
 3 | 
 4 | rem  set to current path
 5 | @setlocal enableextensions
 6 | @cd /d "%~dp0"
 7 | 
 8 | rem  Stop counters. The numbers don't have to match the values used for starting.
 9 | 
10 | pmctest.exe stopcounters 1 9 100 311
11 | 
12 | pause


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_branch3.sh2:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Assemble MyTemplateB64.nasm with my_branch3.inc pre-included, link it with
3 | # a64.o and run ./x. $1..$5 = branchalign, targetalign, branchtoggle, targettoggle, dummybranches.
4 | # "|| exit" stops on failure with the failing tool's status (a bare "exit"
5 | # after a "[ $? -ne 0 ]" test would report the status of that test, i.e. 0).
6 | nasm -f elf64 -l b64.lst -o b64.o -Dbranchalign="$1" -Dtargetalign="$2" -Dbranchtoggle="$3" -Dtargettoggle="$4" -Ddummybranches="$5" -Dcounters=1,9,201,250 -Pmy_branch3.inc MyTemplateB64.nasm || exit
7 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread || exit
8 | ./x
9 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "cpu-micro-benchmarks"
 3 | version = "0.1.0"
 4 | description = ""
 5 | authors = ["Jiajie Chen <c@jia.je>"]
 6 | 
 7 | [tool.poetry.dependencies]
 8 | python = "^3.8"
 9 | matplotlib = "^3.6.0"
10 | 
11 | [tool.poetry.dev-dependencies]
12 | 
13 | [build-system]
14 | requires = ["poetry-core>=1.0.0"]
15 | build-backend = "poetry.core.masonry.api"
16 | 


--------------------------------------------------------------------------------
/src/btb_size_basic.cpp:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include <unistd.h>
 5 | 
 6 | // Benchmark routine defined in another translation unit.
 7 | extern void btb_size_basic(FILE *fp);
 8 | 
 9 | // Creates btb_size_basic.csv in the working directory, passes the stream to
10 | // btb_size_basic() and prints where the results were written.
11 | // Returns 0 on success, 1 when the output file cannot be opened.
12 | int main(int argc, char *argv[]) {
13 |   FILE *fp = fopen("btb_size_basic.csv", "w");
14 |   // Check explicitly: assert() is compiled out under NDEBUG, which would
15 |   // hand a null stream to the benchmark.
16 |   if (fp == NULL) {
17 |     perror("btb_size_basic.csv");
18 |     return 1;
19 |   }
20 |   btb_size_basic(fp);
21 |   printf("Results are written to btb_size_basic.csv\n");
22 |   return 0;
23 | }
24 | 


--------------------------------------------------------------------------------
/src/ras_size.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | #include <assert.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <vector>
 6 | 
 7 | // Benchmark routine defined in another translation unit.
 8 | extern void ras_size(FILE *fp);
 9 | 
10 | // Creates ras_size.csv in the working directory, passes the stream to
11 | // ras_size() and prints where the results were written.
12 | // Returns 0 on success, 1 when the output file cannot be opened.
13 | int main(int argc, char *argv[]) {
14 |   FILE *fp = fopen("ras_size.csv", "w");
15 |   // Check explicitly: assert() is compiled out under NDEBUG, which would
16 |   // hand a null stream to the benchmark.
17 |   if (fp == NULL) {
18 |     perror("ras_size.csv");
19 |     return 1;
20 |   }
21 |   ras_size(fp);
22 |   printf("Results are written to ras_size.csv\n");
23 |   return 0;
24 | }
25 | 


--------------------------------------------------------------------------------
/agner/testp/DriverSrcLinux/install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Install MSR driver, alternative method:
 3 | # build MSRdrv.ko, create its device node, copy the module below
 4 | # /lib/modules and load it through modprobe.
 5 | make clean
 6 | make || exit 1   # nothing to install when the build fails
 7 | 
 8 | # Create the character device (major 249, minor 0) unless it already exists,
 9 | # so that the script can be run again.
10 | [ -c /dev/MSRdrv ] || mknod /dev/MSRdrv c 249 0
11 | chmod 666 /dev/MSRdrv
12 | #insmod -f MSRdrv.ko
13 | #instead of insmod:
14 | KERNELDIR="/lib/modules/$(uname -r)"
15 | # -p: do not fail when the extra directory is already present
16 | mkdir -p "$KERNELDIR/extra"
17 | cp MSRdrv.ko "$KERNELDIR/extra" || exit 1
18 | depmod -ae
19 | modprobe MSRdrv
20 | #modprobe --force-vermagic MSRdrv
21 | 


--------------------------------------------------------------------------------
/agner/testp/PMCTest/startcounters.bat:
--------------------------------------------------------------------------------
 1 | rem  Example .bat script to start PMC counters.
 2 | rem  Must run as administrator
 3 | 
 4 | rem  set to current path
 5 | @setlocal enableextensions
 6 | @cd /d "%~dp0"
 7 | 
 8 | rem  Set counters. Modify the numbers to fit your purpose.
 9 | rem  See the end of PMCTestA.cpp for possible numbers
10 | 
11 | pmctest.exe startcounters 1 9 100 311
12 | 
13 | pause


--------------------------------------------------------------------------------
/src/elimination.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | #include <assert.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <vector>
 6 | 
 7 | // Benchmark routine defined in another translation unit.
 8 | extern void elimination(FILE *fp);
 9 | 
10 | // Creates elimination.csv in the working directory, passes the stream to
11 | // elimination() and prints where the results were written.
12 | // Returns 0 on success, 1 when the output file cannot be opened.
13 | int main(int argc, char *argv[]) {
14 |   FILE *fp = fopen("elimination.csv", "w");
15 |   // Check explicitly: assert() is compiled out under NDEBUG, which would
16 |   // hand a null stream to the benchmark.
17 |   if (fp == NULL) {
18 |     perror("elimination.csv");
19 |     return 1;
20 |   }
21 |   elimination(fp);
22 | 
23 |   printf("Results are written to elimination.csv\n");
24 |   return 0;
25 | }
26 | 


--------------------------------------------------------------------------------
/include/counters.h:
--------------------------------------------------------------------------------
 1 | // declare all raw counters
 2 | #include "include/utils.h"
 3 | DECLARE_RAW_COUNTER(cycles)
 4 | DECLARE_RAW_COUNTER(instructions)
 5 | DECLARE_RAW_COUNTER(branch_misses)
 6 | DECLARE_RAW_COUNTER(cond_branch_misses)
 7 | DECLARE_RAW_COUNTER(llc_misses)
 8 | DECLARE_RAW_COUNTER(llc_loads)
 9 | 
10 | // declare computed counters
11 | DECLARE_COMPUTED_COUNTER(counter_per_cycle, instructions_per_cycle)


--------------------------------------------------------------------------------
/agner/testp/PMCTest/a64.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #compile and run PMCTest in 64 bit mode with yasm assembly syntax
 3 | 
 4 | # Compile A file if modified
 5 | if [ PMCTestA.cpp -nt a64.o ] ; then
 6 |   g++ -O2 -c -m64 -oa64.o PMCTestA.cpp || exit
 7 | fi
 8 | 
 9 | # Assemble B file. "|| exit" returns the failing tool's status; a bare "exit"
10 | # after a "[ $? -ne 0 ]" test would return the status of that test, i.e. 0.
11 | nasm -f elf64 -l b64.lst -o b64.o -DWINDOWS=0 PMCTestB64.nasm || exit
12 | 
13 | # Link and run
14 | g++ a64.o b64.o -ox -lpthread -z noexecstack || exit
15 | ./x
16 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/a64.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #compile and run PMCTest in 64 bit mode with yasm assembly syntax
 3 | 
 4 | # Compile A file if modified
 5 | if [ PMCTestA.cpp -nt a64.o ] ; then
 6 |   g++ -O2 -c -m64 -oa64.o PMCTestA.cpp || exit
 7 | fi
 8 | 
 9 | # Assemble B file. "|| exit" returns the failing tool's status; a bare "exit"
10 | # after a "[ $? -ne 0 ]" test would return the status of that test, i.e. 0.
11 | nasm -f elf64 -l b64.lst -o b64.o -DWINDOWS=0 PMCTestB64.nasm || exit
12 | 
13 | # Link and run
14 | g++ a64.o b64.o -ox -lpthread -z noexecstack || exit
15 | ./x
16 | 


--------------------------------------------------------------------------------
/agner/testp/PMCTest/c64.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Compile and run PMCTest in 64 bit mode using C++
 3 | # (c) 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
 4 | 
 5 | # Compile A file if modified
 6 | if [ PMCTestA.cpp -nt a64.o ] ; then
 7 |   g++ -O2 -c -m64 -oa64.o PMCTestA.cpp || exit
 8 | fi
 9 | 
10 | # Compile B file and link. "|| exit" returns g++'s failing status; a bare
11 | # "exit" after a "[ $? -ne 0 ]" test would return that test's status, i.e. 0.
12 | g++ -O2 -m64 a64.o PMCTestB.cpp -lpthread || exit
13 | ./a.out
14 | 
15 | # read -p "Press [Enter]"
16 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/c64.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Compile and run PMCTest in 64 bit mode using C++
 3 | # (c) 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
 4 | 
 5 | # Compile A file if modified
 6 | if [ PMCTestA.cpp -nt a64.o ] ; then
 7 |   g++ -O2 -c -m64 -oa64.o PMCTestA.cpp || exit
 8 | fi
 9 | 
10 | # Compile B file and link. "|| exit" returns g++'s failing status; a bare
11 | # "exit" after a "[ $? -ne 0 ]" test would return that test's status, i.e. 0.
12 | g++ -O2 -m64 a64.o PMCTestB.cpp -lpthread || exit
13 | ./a.out
14 | 
15 | # read -p "Press [Enter]"
16 | 


--------------------------------------------------------------------------------
/figures/plot_dtlb_size.py:
--------------------------------------------------------------------------------
 1 | from matplotlib import pyplot as plt
 2 | import csv
 3 | 
 4 | 
 5 | size_data = []
 6 | cycles_data = []
 7 | 
 8 | with open("dtlb_size.csv", newline="") as f:
 9 |     r = csv.DictReader(f)
10 |     for row in r:
11 |         size_data.append(float(row["pages"]))
12 |         cycles_data.append(float(row["cycles"]))
13 | 
14 | plt.figure(figsize=(7, 6))
15 | plt.plot(size_data, cycles_data)
16 | plt.ylabel("Cycles")
17 | plt.xlabel("Pages")
18 | plt.grid()
19 | plt.savefig("plot_dtlb_size.png")
20 | plt.cla()
21 | 


--------------------------------------------------------------------------------
/agner/testp/PMCTest/a32.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Compile and run PMCTest in 32 bit mode using yasm assembler syntax
 3 | # In 32-bit Linux: Remove -m32 flag on g++ commands
 4 | # In 64-bit Linux: Must install g++-multilib first
 5 | 
 6 | # Compile A file if modified
 7 | if [ PMCTestA.cpp -nt a32.o ] ; then
 8 |   g++ -O2 -c -m32 -oa32.o PMCTestA.cpp || exit
 9 | fi
10 | 
11 | # Assemble B file. "|| exit" returns the failing tool's status; a bare "exit"
12 | # after a "[ $? -ne 0 ]" test would return the status of that test, i.e. 0.
13 | nasm -f elf32 -l b32.lst -o b32.o PMCTestB32.nasm || exit
14 | 
15 | # Link and run
16 | g++  -m32 a32.o b32.o -ox -lpthread -z noexecstack || exit
17 | ./x


--------------------------------------------------------------------------------
/agner/testp/TestScripts/a32.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Compile and run PMCTest in 32 bit mode using yasm assembler syntax
 3 | # In 32-bit Linux: Remove -m32 flag on g++ commands
 4 | # In 64-bit Linux: Must install g++-multilib first
 5 | 
 6 | # Compile A file if modified
 7 | if [ PMCTestA.cpp -nt a32.o ] ; then
 8 |   g++ -O2 -c -m32 -oa32.o PMCTestA.cpp || exit
 9 | fi
10 | 
11 | # Assemble B file. "|| exit" returns the failing tool's status; a bare "exit"
12 | # after a "[ $? -ne 0 ]" test would return the status of that test, i.e. 0.
13 | nasm -f elf32 -l b32.lst -o b32.o PMCTestB32.nasm || exit
14 | 
15 | # Link and run
16 | g++  -m32 a32.o b32.o -ox -lpthread -z noexecstack || exit
17 | ./x


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # cpu-micro-benchmarks
 2 | 
 3 | Inspired by:
 4 | 
 5 | - https://github.com/travisdowns/robsize
 6 | - https://github.com/Veedrac/microarchitecturometer
 7 | - https://github.com/ChipsandCheese/Microbenchmarks
 8 | 
 9 | ## Paper
10 | 
11 | This repo contains code and results for the paper [Dissecting Conditional Branch Predictors of Apple Firestorm and Qualcomm Oryon for Software Optimization and Architectural Analysis](https://arxiv.org/abs/2411.13900). Please refer to [the report](./reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/README.md) for details.
12 | 


--------------------------------------------------------------------------------
/agner/testp/PMCTest/c32.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Compile and run PMCTest in 32 bit mode using C++
 3 | # (c) 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
 4 | 
 5 | # In 32-bit Linux: Remove -m32 flag on g++ commands
 6 | # In 64-bit Linux: Must install g++-multilib first
 7 | 
 8 | # Compile A file if modified
 9 | if [ PMCTestA.cpp -nt a32.o ] ; then
10 |   g++ -O2 -c -m32 -oa32.o PMCTestA.cpp || exit
11 | fi
12 | 
13 | # Compile B file and link. "|| exit" returns g++'s failing status; a bare
14 | # "exit" after a "[ $? -ne 0 ]" test would return that test's status, i.e. 0.
15 | g++ -O2 -m32 a32.o PMCTestB.cpp -lpthread || exit
16 | ./a.out
17 | 
18 | # read -p "Press [Enter]"
19 | 


--------------------------------------------------------------------------------
/figures/plot_rob_size.py:
--------------------------------------------------------------------------------
 1 | from matplotlib import pyplot as plt
 2 | import csv
 3 | 
 4 | size_data = []
 5 | min_data = []
 6 | avg_data = []
 7 | 
 8 | with open('rob_size.csv', newline='') as f:
 9 | 	r = csv.DictReader(f)
10 | 	for row in r:
11 | 		size_data.append(float(row["size"]))
12 | 		min_data.append(float(row["min"]))
13 | 		avg_data.append(float(row["avg"]))
14 | 
15 | plt.plot(size_data, min_data, label="min")
16 | plt.plot(size_data, avg_data, label="avg")
17 | plt.ylabel('Time')
18 | plt.xlabel('Instruction Block Size')
19 | plt.legend()
20 | plt.savefig('plot_rob_size.png')
21 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/c32.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Compile and run PMCTest in 32 bit mode using C++
 3 | # (c) 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
 4 | 
 5 | # In 32-bit Linux: Remove -m32 flag on g++ commands
 6 | # In 64-bit Linux: Must install g++-multilib first
 7 | 
 8 | # Compile A file if modified
 9 | if [ PMCTestA.cpp -nt a32.o ] ; then
10 |   g++ -O2 -c -m32 -oa32.o PMCTestA.cpp || exit
11 | fi
12 | 
13 | # Compile B file and link. "|| exit" returns g++'s failing status; a bare
14 | # "exit" after a "[ $? -ne 0 ]" test would return that test's status, i.e. 0.
15 | g++ -O2 -m32 a32.o PMCTestB.cpp -lpthread || exit
16 | ./a.out
17 | 
18 | # read -p "Press [Enter]"
19 | 


--------------------------------------------------------------------------------
/figures/plot_ras_size.py:
--------------------------------------------------------------------------------
 1 | from matplotlib import pyplot as plt
 2 | import csv
 3 | 
 4 | size_data = []
 5 | min_data = []
 6 | avg_data = []
 7 | 
 8 | with open('ras_size.csv', newline='') as f:
 9 | 	r = csv.DictReader(f)
10 | 	for row in r:
11 | 		size_data.append(float(row["size"]))
12 | 		min_data.append(float(row["min"]))
13 | 		avg_data.append(float(row["avg"]))
14 | 
15 | plt.plot(size_data, min_data, label="min")
16 | plt.plot(size_data, avg_data, label="avg")
17 | plt.ylabel('Time')
18 | plt.xlabel('Call Depth')
19 | plt.legend()
20 | plt.grid()
21 | plt.savefig('plot_ras_size.png')
22 | 


--------------------------------------------------------------------------------
/figures/plot_phr_size.py:
--------------------------------------------------------------------------------
 1 | from matplotlib import pyplot as plt
 2 | import csv
 3 | 
 4 | size_data = []
 5 | min_data = []
 6 | 
 7 | with open("phr_size.csv", newline="") as f:
 8 |     r = csv.DictReader(f)
 9 |     for row in r:
10 |         size_data.append(float(row["size"]))
11 |         min_data.append(float(row["avg"]) * 100)
12 | 
13 | plt.figure(figsize=(5, 2))
14 | plt.plot(size_data, min_data)
15 | plt.yticks([0, 50])
16 | plt.ylabel("Misprediction rate (%)")
17 | plt.xlabel("# Branches before the last conditional branch")
18 | plt.savefig("plot_phr_size.png")
19 | plt.savefig("plot_phr_size.pdf", bbox_inches="tight")
20 | 


--------------------------------------------------------------------------------
/stale.yml:
--------------------------------------------------------------------------------
 1 | name: 'Close stale issues and PRs'
 2 | on:
 3 |   schedule:
 4 |     - cron: '0 0 * * *'
 5 | 
 6 | jobs:
 7 |   stale:
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |       - uses: actions/stale@v9
11 |         with:
12 |           stale-issue-message: 'This issue is stale because it has been open 60 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
13 |           stale-pr-message: 'This pull request is stale because it has been open 60 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
14 |           days-before-stale: 60
15 |           days-before-close: 7
16 | 


--------------------------------------------------------------------------------
/agner/testp/DriverSrcLinux/Makefile:
--------------------------------------------------------------------------------
 1 | # Make MSR driver for Linux
 2 | # Last modified: 2020-08-17 Agner Fog
 3 | # See https://www.kernel.org/doc/html/latest/kbuild/modules.html
 4 | 
 5 | 
 6 | # The backticks are expanded by the shell when a recipe runs.
 7 | KERNELDIR := /lib/modules/`uname -r`/build
 8 | obj-m := MSRdrv.o
 9 | PWD := $(shell pwd)
10 | 
11 | # These targets do not create files of the same name; declaring them phony
12 | # keeps a stray file called "clean" or "install" from disabling them.
13 | .PHONY: default clean install
14 | 
15 | default:
16 | 	$(MAKE) -C $(KERNELDIR) M=$(PWD) modules
17 | 
18 | #$(MAKE) -C $(KERNELDIR) SUBDIRS=$(PWD) modules
19 | 
20 | clean:
21 | 	rm -f .MSRdrv.*
22 | 	rm -f -r .tmp_versions
23 | 	rm -f *~
24 | 	rm -f MSRdrv.ko
25 | 	rm -f MSRdrv.o
26 | 	rm -f MSRdrv.mod.*
27 | 	rm -f linux.mdl
28 | 	rm -f Modules.symvers
29 | 
30 | install:
31 | 	./install.sh
32 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/allsh2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                                                           2016-10-27 Agner Fog
 3 | # Compile and run PMCTest with various scripts
 4 | # looping through scripts with extension .sh2
 5 | 
 6 | # (c) 2012-2016 by Agner Fog. GNU General Public License www.gnu.org/licenses
 7 | 
 8 | # various initializations (only necessary first time):
 9 | 
10 | # mkdir results2
11 | 
12 | . vars.sh
13 | 
14 | # warm up processor to max clock frequency
15 | echo -e "\nwarmup\n"
16 | ./warmup_fp.sh2
17 | 
18 | # run all test scripts; an unmatched glob stays literal, so skip missing names
19 | for xscript in *.sh2
20 | do
21 |   [ -e "$xscript" ] || continue
22 |   echo -e "\n$xscript"
23 |   "./$xscript"
24 | done
25 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/allcsv.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                                                           2016-10-27 Agner Fog
 3 | # Compile and run PMCTest with various instructions defined in comma-separated lists
 4 | # looping through all lists with extension .csv
 5 | 
 6 | # (c) 2016 by Agner Fog. GNU General Public License www.gnu.org/licenses
 7 | 
 8 | # various initializations (only necessary first time):
 9 | 
10 | # mkdir results
11 | 
12 | . vars.sh
13 | 
14 | export outdir=results
15 | 
16 | # warm up processor to max clock frequency
17 | # echo -e "\nwarmup\n"
18 | # ./warmup_fp.sh2
19 | 
20 | # run all test scripts; an unmatched glob stays literal, so skip missing names
21 | for xscript in *.csv
22 | do
23 |   [ -e "$xscript" ] || continue
24 |   echo -e "\n$xscript"
25 |   ./runlist.sh "$xscript"
26 | done
27 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_phr_length.inc:
--------------------------------------------------------------------------------
 1 | ; reproduce Figure 2 of Half&Half
 2 | 
 3 | ; number of dummy branches
 4 | %ifndef dummybranches
 5 |     %define dummybranches 200
 6 | %endif
 7 | 
 8 | %macro testinit3 0
 9 |     mov rdi, 1000                   ; loop counter: 1000 iterations
10 |     READ_PMC_START
11 | 
12 | loop_begin:
13 | 
14 |     ; train branch
15 |     rdrand eax                      ; hardware random number into eax
16 |     and eax, 1                      ; keep only the low bit
17 |     jnz first_target                ; direction follows the random bit; the target is the next instruction
18 | first_target:
19 | 
20 |     ; dummy branches
21 |     %assign i 0
22 |     %rep dummybranches
23 |     jmp dummy_branch_%+ i           ; unconditional jump to the label right after it
24 | dummy_branch_%+ i:
25 |     %assign i i+1
26 |     %endrep
27 | 
28 |     ; test branch
29 |     test eax, eax                   ; eax still holds the bit computed for the train branch
30 |     jnz second_target               ; same direction as the train branch
31 | second_target:
32 | 
33 |     dec rdi
34 |     jnz loop_begin                  ; repeat until rdi reaches zero
35 | 
36 |     READ_PMC_END
37 | 
38 | 
39 | %endmacro


--------------------------------------------------------------------------------
/results/instruction_latency/loongson_3a6000.csv:
--------------------------------------------------------------------------------
 1 | name,latency,throughput
 2 | fp_fadd_d,3.00,0.25
 3 | fp_fadd_s,3.00,0.25
 4 | fp_fmadd_d,5.00,0.50
 5 | fp_fmadd_s,5.00,0.50
 6 | fp_fmul_d,5.00,0.50
 7 | fp_fmul_s,5.00,0.50
 8 | int_add32,1.00,0.25
 9 | int_add64,1.00,0.25
10 | int_crc_w_b_w,1.00,0.50
11 | int_crc_w_d_w,1.00,0.50
12 | int_crc_w_h_w,1.00,0.50
13 | int_crc_w_w_w,1.00,0.50
14 | int_crcc_w_d_w,1.00,0.00
15 | int_crcc_w_w_w,1.00,0.00
16 | int_div64,19.00,0.00
17 | int_mul32,4.00,0.50
18 | int_mul64,4.00,0.50
19 | lasx_fp_xvfadd_d,3.00,0.25
20 | lasx_fp_xvfmadd_d,5.00,0.50
21 | lasx_fp_xvfmul_d,5.00,0.50
22 | lasx_int_xvadd_d,1.00,0.25
23 | lasx_int_xvmul_d,4.00,0.50
24 | lsx_fp_vfadd_d,3.00,0.25
25 | lsx_fp_vfmadd_d,5.00,0.50
26 | lsx_fp_vfmul_d,5.00,0.50
27 | lsx_int_vadd_d,1.00,0.25
28 | lsx_int_vmul_d,4.00,0.50
29 | 


--------------------------------------------------------------------------------
/results/instruction_latency/loongson_3c5000.csv:
--------------------------------------------------------------------------------
 1 | name,latency,throughput
 2 | fp_fadd_d,5.00,0.50
 3 | fp_fadd_s,5.00,0.50
 4 | fp_fmadd_d,5.00,0.50
 5 | fp_fmadd_s,5.00,0.50
 6 | fp_fmul_d,5.00,0.50
 7 | fp_fmul_s,5.00,0.50
 8 | int_add32,1.00,0.25
 9 | int_add64,1.00,0.25
10 | int_crc_w_b_w,5.00,3.50
11 | int_crc_w_d_w,19.00,10.50
12 | int_crc_w_h_w,7.00,4.50
13 | int_crc_w_w_w,11.00,6.50
14 | int_crcc_w_d_w,19.00,0.00
15 | int_crcc_w_w_w,11.00,0.00
16 | int_div64,4.00,0.00
17 | int_mul32,4.00,0.50
18 | int_mul64,4.00,0.50
19 | lasx_fp_xvfadd_d,5.00,0.50
20 | lasx_fp_xvfmadd_d,5.00,0.50
21 | lasx_fp_xvfmul_d,5.00,0.50
22 | lasx_int_xvadd_d,1.00,0.50
23 | lasx_int_xvmul_d,4.00,0.50
24 | lsx_fp_vfadd_d,5.00,0.50
25 | lsx_fp_vfmadd_d,5.00,0.50
26 | lsx_fp_vfmul_d,5.00,0.50
27 | lsx_int_vadd_d,1.00,0.50
28 | lsx_int_vmul_d,4.00,0.50
29 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/testmemcpyalign.sh2:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                                                  2013-08-07 Agner Fog
 3 | 
 4 | # Compile and run PMCTest for different implementations of memcpy
 5 | # (c) 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
 6 | 
 7 | . vars.sh
 8 | # NOTE(review): $ass is presumably the assembler command set by vars.sh - confirm
 9 | # Assemble testmemcpyal.nasm file
10 | $ass -f elf64 -o t64.o testmemcpyal.nasm
11 | if [ $? -ne 0 ] ; then exit 1 ; fi
12 | # (a bare `exit` would return the status of the `[` test, i.e. 0, hiding the failure)
13 | # Compile cpp file and link
14 | g++ -O2 -m64 testmemcpyalign.cpp t64.o -ox.exe
15 | if [ $? -ne 0 ] ; then exit 1 ; fi
16 | 
17 | echo -e "Test if memory copying has penalty for false dependence between source and destination addresses\n" > results2/testmemcpyalign.txt
18 | 
19 | # Run test, appending its output to the report started above
20 | ./x.exe  >> results2/testmemcpyalign.txt
21 | 
22 | echo -e "\n"  >> results2/testmemcpyalign.txt
23 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/allsh1.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                                                           2016-10-27 Agner Fog
 3 | # Compile and run PMCTest with various scripts
 4 | # looping through scripts with extension .sh1
 5 | # (c) Copyright 2012-2016 by Agner Fog. GNU General Public License www.gnu.org/licenses
 6 | 
 7 | # various initializations (only necessary first time):
 8 | . vars.sh
 9 | 
10 | # warm up processor to max clock frequency
11 | echo -e "\nwarmup\n"
12 | $ass -f elf64 -o b64.o -Dinstruct=nop -DWARMUPCOUNT=10000000 -Dnthreads=1 TemplateB64.nasm
13 | # exit 1: a bare `exit` would return the status of the `[` test (0) and hide the failure
14 | if [ $? -ne 0 ] ; then exit 1 ; fi
15 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread -z noexecstack
16 | if [ $? -ne 0 ] ; then exit 1 ; fi
17 | ./x >> /dev/null
18 | 
19 | # run all test scripts
20 | for xscript in  *.sh1
21 | do
22 |   # when no .sh1 file exists the pattern stays literal; skip it
23 |   [ -e "$xscript" ] || continue
24 |   echo -e "\n$xscript"
25 |   "./$xscript"
26 | done
27 | 


--------------------------------------------------------------------------------
/results/instruction_latency/huawei_kunpeng920.csv:
--------------------------------------------------------------------------------
 1 | name,latency,throughput
 2 | asimd_fp_fadd_double,4.00,1.00
 3 | asimd_fp_fadd_single,5.00,0.50
 4 | asimd_fp_fmla_single,4.00/5.00,0.00
 5 | asimd_fp_fmul_double,5.00,1.00
 6 | asimd_fp_fmul_single,5.00,0.50
 7 | asimd_int_add_double,2.00,0.50
 8 | asimd_int_add_half,2.00,0.50
 9 | asimd_int_add_single,2.00,0.50
10 | asimd_int_mul_half,7.00,0.00
11 | asimd_int_mul_single,7.00,2.83
12 | fp_fadd_double,4.00,0.50
13 | fp_fadd_single,5.00,0.50
14 | fp_fdiv_double,6.00,0.00
15 | fp_fdiv_single,6.00,0.00
16 | fp_fmadd_double,5.00/7.00,0.50
17 | fp_fmadd_single,4.00/5.00,0.50
18 | fp_fmul_double,5.00,0.50
19 | fp_fmul_single,5.00,0.50
20 | fp_fsqrt_double,9.00,0.00
21 | fp_fsqrt_single,7.00,0.00
22 | int_add,1.00,0.33
23 | int_madd,1.00/4.00,1.00
24 | int_mul,4.00,1.00
25 | int_sdiv,6.00,5.00
26 | int_smull,3.00,1.00
27 | int_udiv,6.00,5.00
28 | 


--------------------------------------------------------------------------------
/ANDROID-PERF.md:
--------------------------------------------------------------------------------
 1 | # Use perf counters on Android
 2 | 
 3 | There are different ways to access perf counters on Android:
 4 | 
 5 | 1. On a rooted device, you can access the PMU as the root user
 6 | 2. Execute microbenchmarks using `adb shell`
 7 | 
 8 | You need to copy executables to Android using `adb push`. But beware that some partitions are mounted as noexec e.g. `/storage/emulated/0/`.
 9 | 
10 | If you find it hard to find a target directory for `adb push`, you can:
11 | 
12 | 1. Run `sshd` in Termux to launch a SSH server
13 | 2. Use `scp` to copy the program to the home directory under Termux
14 | 3. Run `run-as com.termux` to enter Termux data directory under `adb shell`
15 | 4. Run `cd files/home` and run programs there using perf counters
16 | 
17 | You can use `simpleperf` from the NDK via `adb shell` in the same way. You can also run `usr/bin/sshd` from Termux in `adb shell` instead of running it in the Termux app.
18 | 


--------------------------------------------------------------------------------
/figures/plot_bp_size.py:
--------------------------------------------------------------------------------
 1 | from matplotlib import pyplot as plt
 2 | import csv
 3 | import numpy as np
 4 | 
 5 | size_data = []
 6 | history_data = []
 7 | min_data = []
 8 | avg_data = []
 9 | 
10 | with open('bp_size.csv', newline='') as f:
11 | 	r = csv.DictReader(f)
12 | 	for row in r:
13 | 		size_data.append(float(row["size"]))
14 | 		history_data.append(float(row["history"]))
15 | 		min_data.append(float(row["min"]))
16 | 		avg_data.append(float(row["avg"]))
17 | 
18 | plt.imshow(np.array(avg_data).reshape((11, 17)), vmax=10)
19 | plt.colorbar()
20 | 
21 | xticks = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,65536]
22 | plt.xticks(range(len(xticks)), xticks, rotation='vertical')
23 | 
24 | yticks = [1,2,4,8,16,32,64,128,256,512,1024]
25 | plt.yticks(range(len(yticks)), yticks)
26 | 
27 | plt.xlabel('Pattern Length')
28 | plt.ylabel('Branch Num')
29 | plt.savefig('plot_bp_size.png')
30 | 


--------------------------------------------------------------------------------
/ios-cross.txt:
--------------------------------------------------------------------------------
 1 | # from https://github.com/mesonbuild/meson/blob/master/cross/iphone.txt
 2 | [binaries]
 3 | c = ['clang', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk']
 4 | cpp = ['clang++', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk']
 5 | objc = ['clang', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk']
 6 | objcpp = ['clang++', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk']
 7 | ar = 'ar'
 8 | strip = 'strip'
 9 | 
10 | [host_machine]
11 | system = 'darwin'
12 | subsystem = 'ios'
13 | kernel = 'xnu'
14 | cpu_family = 'aarch64'
15 | cpu = 'aarch64'
16 | endian = 'little'


--------------------------------------------------------------------------------
/figures/plot_pht_index_bits_xor.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from matplotlib import pyplot as plt
 3 | import numpy as np
 4 | import csv
 5 | 
 6 | # Test branch toggles
 7 | x_data = []
 8 | y_data = []
 9 | z_data = []
10 | 
11 | with open("pht_index_bits_xor.csv", newline="") as f:
12 |     r = csv.DictReader(f)
13 |     for row in r:
14 |         x_data.append(int(row["branches"]))
15 |         y_data.append(int(row["dummy"]))
16 |         z_data.append(min(float(row["min"]), 0.5))
17 |     z_data = np.array(z_data)
18 | 
19 | x_data = list(sorted(set(x_data)))
20 | y_data = list(sorted(set(y_data)))
21 | z_data = z_data.reshape((len(x_data), len(y_data)))
22 | 
23 | plt.figure(figsize=(20, 6))
24 | plt.imshow(z_data)
25 | plt.ylabel("Predict branches")
26 | plt.yticks(range(len(x_data)), x_data)
27 | plt.xlabel("PHR bit position")
28 | plt.xticks(range(len(y_data)), y_data, rotation=90)
29 | plt.savefig("plot_pht_index_bits_xor.png")
30 | plt.cla()
31 | 


--------------------------------------------------------------------------------
/src/instruction_latency.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | #include <assert.h>
 3 | #include <set>
 4 | #include <stdio.h>
 5 | #include <unistd.h>
 6 | #include <utility>
 7 | #include <vector>
 8 | 
 9 | extern void instruction_latency(FILE *fp);
10 | extern bool instruction_latency_use_perf;
11 | extern int instruction_latency_loop_count;
12 | // Program entry point. Parses the command line, then calls
13 | // instruction_latency() with its output directed to instruction_latency.csv
14 | // in the current working directory.
15 | //
16 | // Options:
17 | //   -n loop_count  integer parsed into instruction_latency_loop_count
18 | //   -p             sets instruction_latency_use_perf to true
19 | //
20 | // Exits with EXIT_FAILURE on an unknown option, a non-numeric -n argument,
21 | // or when the output file cannot be created.
22 | int main(int argc, char *argv[]) {
23 |   int opt;
24 |   while ((opt = getopt(argc, argv, "n:p")) != -1) {
25 |     switch (opt) {
26 |     case 'n':
27 |       // sscanf leaves the variable untouched on a failed match; report it
28 |       // instead of silently running with the previous value
29 |       if (sscanf(optarg, "%d", &instruction_latency_loop_count) != 1) {
30 |         fprintf(stderr, "Invalid loop count: %s\n", optarg);
31 |         exit(EXIT_FAILURE);
32 |       }
33 |       break;
34 |     case 'p':
35 |       instruction_latency_use_perf = true;
36 |       break;
37 |     default:
38 |       fprintf(stderr, "Usage: %s [-n loop_count] [-p]\n", argv[0]);
39 |       exit(EXIT_FAILURE);
40 |     }
41 |   }
42 | 
43 |   FILE *fp = fopen("instruction_latency.csv", "w");
44 |   if (!fp) {
45 |     // explicit check: assert() is compiled out when NDEBUG is defined
46 |     perror("instruction_latency.csv");
47 |     exit(EXIT_FAILURE);
48 |   }
49 |   instruction_latency(fp);
50 |   printf("Result written to instruction_latency.csv\n");
51 |   return 0;
52 | }
53 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/returnstack.sh2:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                                                                        2012-02-19 AgF
 3 | 
 4 | # Compile and run PMCTest for detecting return stack buffer size
 5 | # (c) 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
 6 | 
 7 | echo -e "Test of return stack buffer size\n"  > results2/returnstack.txt
 8 | echo -e "The size of the return stack buffer is the last level that has few or no mispredictions\n\n"  >> results2/returnstack.txt
 9 | 
10 | . vars.sh
11 | # assemble, link and run once per nesting level, appending to the report
12 | for nesting in {2..66}
13 | do
14 | # exit 1 on a failed build: a bare `exit` would return the status of `[` (0)
15 | echo -e "\n\nNesting level $nesting"  >> results2/returnstack.txt
16 | $ass -f elf64 -o b64.o -lxx.lst -Dnesting=$nesting -Dcounters=$BranchPMCs -Preturnstack.inc TemplateB64.nasm
17 | if [ $? -ne 0 ] ; then exit 1 ; fi
18 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
19 | if [ $? -ne 0 ] ; then exit 1 ; fi
20 | ./x >> results2/returnstack.txt
21 | 
22 | done
23 | 
24 | echo -e "\n"  >> results2/returnstack.txt
25 | 
26 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/instruct_boundaries.inc:
--------------------------------------------------------------------------------
 1 | ;----------------------------------------------------------------------------
 2 | ;                instruct_boundaries.inc                2013-07-11 Agner Fog
 3 | ;
 4 | ; PMC Test program for testing if instruction boundaries are marked in instruction cache
 5 | ;
 6 | ; (c) Copyright 2013 by Agner Fog. GNU General Public License www.gnu.org/licenses
 7 | ;-----------------------------------------------------------------------------
 8 | 
 9 | ; loops in template
10 | %define repeat0 16
11 | %define repeat1  1
12 | %define repeat2  1
13 | 
14 | %ifndef repeatu
15 |    %define repeatu 100
16 | %endif
17 | 
18 | %macro testcode 0
19 |    ; bit 1 of r14d selects between falling through and jumping to $+4
20 |    test r14d, 2  ; loop counter
21 |    jnz $+4       ; jump past previous instruction boundary
22 |    ; NOTE(review): $+4 presumably lands inside the first mov below - confirm in the listing
23 |    %rep repeatu
24 |       mov eax,0xB8B8B8B8
25 |       mov ebx,0xBBBBBBBB
26 |    %endrep 
27 |    ; repeatu pairs of mov instructions precede this point
28 |    times 4 nop   ; absorb last partial instruction  
29 | 
30 | %endmacro
31 | 


--------------------------------------------------------------------------------
/figures/plot_memory_latency.py:
--------------------------------------------------------------------------------
 1 | from matplotlib import pyplot as plt
 2 | import csv
 3 | 
 4 | size_data = []
 5 | time_data = []
 6 | llc_miss_data = []
 7 | llc_load_data = []
 8 | 
 9 | with open('memory_latency.csv', newline='') as f:
10 | 	r = csv.reader(f)
11 | 	for row in r:
12 | 		if row[0] == "size":
13 | 			continue
14 | 		if len(row) < 5:
15 | 			continue
16 | 		size_data.append(float(row[0]))
17 | 		time_data.append(float(row[1]))
18 | 		llc_miss_data.append(float(row[3]))
19 | 		llc_load_data.append(float(row[4]))
20 | 
21 | fig, ax = plt.subplots()
22 | 
23 | ax.plot(size_data, time_data)
24 | ax.set_xscale('log')
25 | ax.set_ylabel('Time (ns)')
26 | ax.set_xlabel('Memory Block Size (B)')
27 | 
28 | ax2 = ax.twinx()
29 | 
30 | ax2.plot(size_data, llc_load_data, 'r.-', label='LLC Loads')
31 | ax2.plot(size_data, llc_miss_data, 'g.-', label='LLC Misses')
32 | ax2.legend()
33 | ax2.set_xscale('log')
34 | ax2.set_ylabel('LLC Load/Miss per Access')
35 | ax2.set_xlabel('Memory Block Size (B)')
36 | 
37 | plt.savefig('plot_memory_latency.png')
38 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_branch5.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from matplotlib import pyplot as plt
 3 | import numpy as np
 4 | 
 5 | # Reproduce Figure 5 of Half&Half
 6 | x_data = range(1, 20)
 7 | y_data = []
 8 | for branch_align in x_data:
 9 |     output = subprocess.check_output(
10 |         ["./my_branch5.sh2", str(branch_align)], encoding="utf-8"
11 |     )
12 |     heading = False
13 |     data = []
14 |     for line in output.splitlines():
15 |         parts = list(filter(lambda s: len(s) > 0, line.strip().split(" ")))
16 |         if len(parts) > 0:
17 |             if not heading:
18 |                 assert parts[5] == "BrMisCond"
19 |                 heading = True
20 |             else:
21 |                 data.append(int(parts[4]))
22 |     avg = np.average(np.array(data)) / 10000  # 1 branches, 10000 loops
23 |     print(branch_align, f"{avg:.2f}")
24 |     y_data.append(avg)
25 | 
26 | plt.plot(x_data, y_data)
27 | plt.xlabel("Branch alignment bits")
28 | plt.ylabel("Miss Rate")
29 | plt.xticks(x_data)
30 | plt.savefig("my_branch5.png")
31 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_phr_length.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from matplotlib import pyplot as plt
 3 | import numpy as np
 4 | 
 5 | # Reproduce Figure 2(a) of Half&Half
 6 | x_data = range(200)
 7 | y_data = []
 8 | for dummy_branches in x_data:
 9 |     output = subprocess.check_output(
10 |         ["./my_phr_length.sh2", str(dummy_branches)], encoding="utf-8"
11 |     )
12 |     heading = False
13 |     data = []
14 |     for line in output.splitlines():
15 |         parts = list(filter(lambda s: len(s) > 0, line.strip().split(" ")))
16 |         if len(parts) > 0:
17 |             if not heading:
18 |                 assert parts[5] == "BrMisCond"
19 |                 heading = True
20 |             else:
21 |                 data.append(int(parts[4]))
22 |     avg = np.average(np.array(data)) / 2000  # 2 branches, 1000 loops
23 |     print(dummy_branches, f"{avg:.2f}")
24 |     y_data.append(avg)
25 | 
26 | plt.plot(x_data, y_data)
27 | plt.xlabel("Num. of Dummy Branches")
28 | plt.ylabel("Miss Rate")
29 | plt.yticks([0.25, 0.50])
30 | plt.savefig("my_phr_length.png")
31 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/instruct_boundaries.sh2:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                instruct_boundaries.sh2             2013-07-11 Agner Fog
 3 | #
 4 | # PMC Test program for testing if instruction boundaries are marked in
 5 | # instruction cache by making jump past previous instruction boundary
 6 | #
 7 | # (c) 2013 GNU General Public License www.gnu.org/licenses
 8 | #
 9 | # repeatu:  number of instructions in sequence
10 | 
11 | . vars.sh
12 | 
13 | nthreads=1
14 | 
15 | echo -e "Test if instruction boundaries are marked in instruction cache"  > results2/instruct_boundaries.txt
16 | # assemble, link and run once per sequence length, appending to the report
17 | for repeatu in 10 100 1000
18 | do
19 | echo -e "\n$repeatu instructions" >> results2/instruct_boundaries.txt
20 | # exit 1 on a failed build: a bare `exit` would return the status of `[` (0)
21 | $ass -f elf64 -o b64.o -Drepeatu=$repeatu -Dcounters=$BranchPMCs -Dnthreads=$nthreads -Pinstruct_boundaries.inc TemplateB64.nasm
22 | if [ $? -ne 0 ] ; then exit 1 ; fi
23 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
24 | if [ $? -ne 0 ] ; then exit 1 ; fi
25 | ./x >> results2/instruct_boundaries.txt
26 | done
27 | 
28 | echo -e "\n"  >> results2/instruct_boundaries.txt
29 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/memcpy.inc:
--------------------------------------------------------------------------------
 1 | ; memcpy.inc                                                       2016-11-02
 2 | 
 3 | ; Run PMCTest for different implementations of memcpy
 4 | 
 5 | ; (c) 2012-2016 by Agner Fog. GNU General Public License www.gnu.org/licenses
 6 | 
 7 | ; Parameters:
 8 | ;
 9 | ; mversion:   version of memcpy function
10 | ;
11 | ; dsize:      Size of memory block to move
12 | ; srcoff:     Source offset relative to cache line (64B)
13 | ; dstoff:     Destination offset relative to cache line (64B)
14 | 
15 | ; define external memcpy function
16 | extern mversion
17 | 
18 | ; data size for each thread
19 | %define threaddatasize ((dsize*2+10FFh) & (-100h))
20 | 
21 | ; allocate data for all threads
22 | %macro testdata 0
23 |    times (threaddatasize * nthreads)  DB 0
24 | %endmacro
25 | 
26 | ; main testcode macro
27 | %macro testcode 0
28 | 
29 |    imul rax, r15, threaddatasize
30 |    lea rsi, [UserData + rax]
31 |    lea rdi, [rsi + ((dsize+7FH)&(-80H))+100H+dstoff]
32 |    mov edx, dsize
33 |    call mversion
34 | 
35 | %endmacro
36 | 
37 | ; default test loops
38 | %define repeat1 100
39 | %define repeat2 1
40 | 


--------------------------------------------------------------------------------
/src/itlb_size.cpp:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include <unistd.h>
 5 | 
 6 | extern void itlb_size(FILE *fp);
 7 | extern bool avoid_hugepage_merging;
 8 | extern int stride;
 9 | extern int fake_page_size;
10 | // Program entry point. Parses the command line, then calls itlb_size() with
11 | // its output directed to itlb_size.csv in the current working directory.
12 | //
13 | // Options:
14 | //   -h            sets avoid_hugepage_merging to true
15 | //   -s stride     integer parsed into the global `stride`
16 | //   -f page_size  integer parsed into the global `fake_page_size`
17 | //
18 | // Exits with EXIT_FAILURE on an unknown option, a non-numeric -s/-f
19 | // argument, or when the output file cannot be created.
20 | int main(int argc, char *argv[]) {
21 |   int opt;
22 |   while ((opt = getopt(argc, argv, "hs:f:")) != -1) {
23 |     switch (opt) {
24 |     case 'h':
25 |       avoid_hugepage_merging = true;
26 |       break;
27 |     case 's':
28 |       // sscanf leaves the variable untouched on a failed match; report it
29 |       // instead of silently running with the previous value
30 |       if (sscanf(optarg, "%d", &stride) != 1) {
31 |         fprintf(stderr, "Invalid stride: %s\n", optarg);
32 |         exit(EXIT_FAILURE);
33 |       }
34 |       break;
35 |     case 'f':
36 |       if (sscanf(optarg, "%d", &fake_page_size) != 1) {
37 |         fprintf(stderr, "Invalid page size: %s\n", optarg);
38 |         exit(EXIT_FAILURE);
39 |       }
40 |       break;
41 |     default:
42 |       fprintf(stderr, "Usage: %s [-h] [-s stride] [-f page_size]\n", argv[0]);
43 |       fprintf(stderr, "\t-h: avoid huge page merging\n");
44 |       fprintf(stderr, "\t-s stride: set branch address stride\n");
45 |       fprintf(stderr, "\t-f page_size: fake page size\n");
46 |       exit(EXIT_FAILURE);
47 |     }
48 |   }
49 | 
50 |   FILE *fp = fopen("itlb_size.csv", "w");
51 |   if (!fp) {
52 |     // explicit check: assert() is compiled out when NDEBUG is defined
53 |     perror("itlb_size.csv");
54 |     exit(EXIT_FAILURE);
55 |   }
56 |   itlb_size(fp);
57 |   printf("Results are written to itlb_size.csv\n");
58 |   return 0;
59 | }
60 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/alltests.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # alltests.sh                                            2016-10-28 Agner Fog
 3 | # (c) Copyright 2013-2016 by Agner Fog. GNU General Public License www.gnu.org/licenses
 4 | 
 5 | 
 6 | # initialization
 7 | ./init.sh $1
 8 | 
 9 | echo -e "Running all tests\n$(date)\n$(./cpugetinfo brand).\nFamily $(./cpugetinfo family hex), model $(./cpugetinfo model hex)"  >> results2/statistics.txt
10 | 
11 | Starttime=$(date +%s)
12 | 
13 | # instruction latencies and throughputs, driven by the lists in .csv files
14 | ./allcsv.sh
15 | 
16 | # instruction latencies and throughputs, .sh1 scripts
17 | ./allsh1.sh
18 | 
19 | # other microarchitecture tests
20 | ./allsh2.sh
21 | 
22 | Endtime=$(date +%s)
23 | Elapsedtime=$(($Endtime - $Starttime))
24 | Minutes=$(($Elapsedtime / 60))
25 | Seconds=$(($Elapsedtime % 60))
26 | Numscripts=$(ls *.sh1 *.sh2 *.csv | wc -w)
27 | 
28 | echo Executed $Numscripts scripts. Elapsed time $Minutes m, $Seconds s
29 | echo -e "\nExecuted $Numscripts scripts. Elapsed time $Minutes m, $Seconds s\n\n"  >> results2/statistics.txt
30 | 
31 | # pack all results into zipfile
32 | ./pack_results.sh
33 | 


--------------------------------------------------------------------------------
/figures/plot_btb_size.py:
--------------------------------------------------------------------------------
 1 | from matplotlib import pyplot as plt
 2 | import csv
 3 | 
 4 | # Mimic https://chipsandcheese.com/2023/10/27/cortex-x2-arm-aims-high/
 5 | 
 6 | size_data = []
 7 | stride_data = []
 8 | min_data = []
 9 | avg_data = []
10 | 
11 | with open('btb_size.csv', newline='') as f:
12 | 	r = csv.DictReader(f)
13 | 	for row in r:
14 | 		size_data.append(float(row["size"]))
15 | 		stride_data.append(float(row["stride"]))
16 | 		min_data.append(float(row["min"]))
17 | 		avg_data.append(float(row["avg"]))
18 | 
19 | for stride in [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]:
20 | 	x_data = []
21 | 	y_data = []
22 | 	for i in range(len(stride_data)):
23 | 		if stride_data[i] == stride and size_data[i] <= 8192:
24 | 			x_data.append(size_data[i])
25 | 			y_data.append(min_data[i])
26 | 	plt.plot(x_data, y_data, label=f"Branch Per {stride}B")
27 | plt.xscale('log')
28 | ticks = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
29 | plt.xticks(ticks, ticks)
30 | plt.yticks(range(10))
31 | plt.ylim((0, 10))
32 | plt.grid()
33 | plt.xlabel('Branches in loop')
34 | plt.ylabel('Cycles Per Branch')
35 | plt.legend()
36 | plt.savefig('plot_btb_size.png')
37 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/cache_banks.inc:
--------------------------------------------------------------------------------
 1 | ; cache_banks.inc                                       2015-12-20 Agner Fog
 2 | 
 3 | ; Measure cache bank conflicts
 4 | ; (c) 2014 - 2015 by Agner Fog. GNU General Public License www.gnu.org/licenses
 5 | ;
 6 | ; Parameters:
 7 | ;
 8 | ; stride1:     Bigger than all banks
 9 | ;
10 | ; stride2:     Bank size
11 | ;
12 | ; tmode:       Test mode:
13 | ;              1:    read from two addresses spaced by stride1
14 | ;              2:    read from two addresses spaced by stride1 + stride2
15 | ;              3:    read and write from two addresses spaced by stride1
16 | ;              4:    read and write from two addresses spaced by stride1 + stride2
17 | 
18 | %macro testinit1 0
19 | 
20 | %endmacro
21 | 
22 | 
23 | ; main testcode macro
24 | 
25 | %macro testcode 0
26 | %if tmode == 1    ; two reads, spaced by stride1
27 |     mov eax,[rsi]
28 |     mov ebx,[rsi + stride1]
29 | %elif tmode == 2    ; two reads, spaced by stride1 + stride2
30 |     mov eax,[rsi]
31 |     mov ebx,[rsi + stride1 + stride2]
32 | %elif tmode == 3    ; read, then write the loaded value at offset stride1
33 |     mov eax,[rsi]
34 |     mov [rsi + stride1],eax
35 | %elif tmode == 4    ; read, then write the loaded value at offset stride1 + stride2
36 |     mov eax,[rsi]
37 |     mov [rsi + stride1 + stride2],eax
38 | %endif
39 | %endmacro
40 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/stack_sync_uops.sh2:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                                                                    2013-07-19 Agner Fog
 3 | # Compile and run PMCTest for testing stack synchronization micro-ops
 4 | # in push/pop and call/ret sequences (see the case list below)
 5 | 
 6 | # (c) Copyright 2013 by Agner Fog. GNU General Public License www.gnu.org/licenses
 7 | 
 8 | # Detect CPU specific variables
 9 | . vars.sh
10 | 
11 | 
12 | echo -e "Stack synchronization micro-ops"  > results2/stack_sync_uops.txt
13 | 
14 | repeat0=10
15 | count1=10
16 | 
17 | let tcase=0
18 | # case names are labels for the report; the glob-like one is quoted so it stays literal
19 | for xcase in  Push_and_pop_only  'added_mov_r,[rsp]'  further_added_mov_r,rsp  \
20 |               call_and_ret  call_and_ret_imm  call_and_ret_and_add_rsp,const
21 | do
22 | let tcase+=1
23 | 
24 | echo -e "\n\nCase $tcase: $xcase"  >> results2/stack_sync_uops.txt
25 | # exit 1 on a failed build: a bare `exit` would return the status of `[` (0)
26 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dcount1=$count1 -Drepeat0=$repeat0 -Pstack_sync_uops.inc TemplateB64.nasm
27 | if [ $? -ne 0 ] ; then exit 1 ; fi
28 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
29 | if [ $? -ne 0 ] ; then exit 1 ; fi
30 | ./x >> results2/stack_sync_uops.txt
31 | done
32 | 
33 | 
34 | echo -e "\n"  >> results2/stack_sync_uops.txt
35 | 


--------------------------------------------------------------------------------
/include/uarch.h:
--------------------------------------------------------------------------------
 1 | #ifndef __UARCH_H__
 2 | #define __UARCH_H__
 3 | 
 4 | enum uarch {
 5 |   // special
 6 |   unknown, // first enumerator (value 0); lies before all_begin
 7 | 
 8 |   // arm64
 9 |   // apple
10 |   // m1
11 |   firestorm,
12 |   icestorm,
13 |   // m2
14 |   avalanche,
15 |   blizzard,
16 |   // m4
17 |   m4_pcore,
18 |   m4_ecore,
19 |   // qualcomm
20 |   oryon,
21 |   // arm
22 |   cortex_a77,
23 |   cortex_a78,
24 |   cortex_x1,
25 |   neoverse_n1,
26 |   neoverse_v1,
27 |   neoverse_n2,
28 |   neoverse_v2,
29 |   // hisilicon
30 |   tsv110,
31 | 
32 |   unknown_arm64, // placed after all named arm64 cores
33 |   arm64_begin = firestorm, // alias of the first arm64 enumerator
34 |   arm64_end = unknown_arm64, // alias of the last arm64 enumerator
35 | 
36 |   // loongarch
37 |   la464, // numbering continues at arm64_end + 1
38 |   la664,
39 |   unknown_loongarch64, // placed after all named loongarch64 cores
40 |   loongarch64_begin = la464, // alias of the first loongarch64 enumerator
41 |   loongarch64_end = unknown_loongarch64, // alias of the last loongarch64 enumerator
42 | 
43 |   // intel
44 |   golden_cove, // numbering continues at loongarch64_end + 1
45 |   gracemont,
46 |   sunny_cove,
47 |   skylake,
48 |   broadwell,
49 |   // amd
50 |   zen1,
51 |   zen2,
52 |   zen3,
53 |   zen4,
54 |   zen5,
55 | 
56 |   unknown_amd64, // no amd64_begin/amd64_end aliases are defined for this group
57 | 
58 |   // valid range
59 |   all_begin = firestorm, // alias of the first enumerator after `unknown`
60 |   all_end = unknown_amd64, // alias of the last enumerator
61 | };
62 | 
63 | // detect uarch
64 | enum uarch get_uarch();
65 | // which core to bind
66 | int get_bind_core();
67 | 
68 | #endif
69 | 


--------------------------------------------------------------------------------
/figures/plot_pht_associativity.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from matplotlib import pyplot as plt
 3 | import numpy as np
 4 | import csv
 5 | 
 6 | # Test branch toggles
 7 | for prefix in [""]:
 8 |     x_data = []
 9 |     y_data = []
10 |     z_data = []
11 | 
12 |     with open(f"{prefix}pht_associativity.csv", newline="") as f:
13 |         r = csv.DictReader(f)
14 |         for row in r:
15 |             x_data.append(int(row["branches"]))
16 |             y_data.append(int(row["align"]))
17 |             z_data.append(min(float(row["min"]), 0.5))
18 |         z_data = np.array(z_data)
19 | 
20 |     x_data = list(sorted(set(x_data)))
21 |     y_data = list(sorted(set(y_data)))
22 |     z_data = z_data.reshape((len(x_data), len(y_data)))
23 | 
24 |     plt.imshow(z_data.transpose())
25 |     plt.xlabel("# Conditional branches")
26 |     plt.xticks(range(len(x_data)), x_data, rotation=90)
27 |     plt.ylabel("Log2 branch base address")
28 |     plt.yticks(range(len(y_data)), y_data)
29 |     bar = plt.colorbar(shrink=0.5)
30 |     bar.ax.set_ylabel("Misprediction rate", fontsize=8, rotation=270, labelpad=9.0)
31 |     plt.savefig(f"plot_{prefix}pht_associativity.png")
32 |     plt.savefig(f"plot_{prefix}pht_associativity.pdf", bbox_inches="tight")
33 |     plt.cla()
34 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Targets that name commands rather than files; declaring them phony makes
 2 | # them run even if a file or directory with the same name exists.
 3 | .PHONY: all compile clean distclean ios android aarch64-linux gem5
 4 | 
 5 | # Abort early when the build tools are not on PATH.
 6 | ifeq ($(shell which meson),)
 7 |     $(error Please install meson first!)
 8 | endif
 9 | 
10 | ifeq ($(shell which ninja),)
11 |     $(error Please install ninja first!)
12 | endif
13 | 
14 | all: builddir/stamp compile
15 | 
16 | # Configure the native build; the stamp file records that this was done.
17 | builddir/stamp:
18 | 	meson setup builddir --buildtype=release
19 | 	touch $@
20 | 
21 | compile: builddir/stamp
22 | 	ninja -C builddir
23 | 
24 | clean: builddir/stamp
25 | 	ninja -C builddir clean
26 | 
27 | distclean:
28 | 	rm -rf builddir
29 | 
30 | # Each builddir-* target below configures a separate build directory; its
31 | # recipe runs only while that directory does not exist yet.
32 | builddir-ios:
33 | 	meson setup builddir-ios -Dios=true --buildtype=release --cross-file ios-cross.txt
34 | 
35 | ios: builddir-ios
36 | 	ninja -C builddir-ios
37 | 
38 | builddir-android:
39 | 	meson setup builddir-android -Dandroid=true --buildtype=release --cross-file android-cross.txt
40 | 
41 | android: builddir-android
42 | 	ninja -C builddir-android
43 | 
44 | builddir-aarch64-linux:
45 | 	meson setup builddir-aarch64-linux -Dlinux-cross=aarch64 --buildtype=release --cross-file aarch64-linux-cross.txt
46 | 
47 | aarch64-linux: builddir-aarch64-linux
48 | 	ninja -C builddir-aarch64-linux
49 | 
50 | builddir-gem5:
51 | 	meson setup builddir-gem5 -Dgem5=true --buildtype=release
52 | 
53 | gem5: builddir-gem5
54 | 	ninja -C builddir-gem5
55 | 


--------------------------------------------------------------------------------
/agner/testp/DriverSrcLinux/DriverSrcLinux.txt:
--------------------------------------------------------------------------------
 1 | Instructions for Linux driver                     2011-06-19 Agner Fog
 2 | 
 3 | To install the Linux driver for PMCTest under Linux, 32 or 64 bit, 
 4 | unzip DriverSrcLinux.zip, make and install the driver according to 
 5 | the following commands. Must reinstall after reboot.
 6 | The driver has only been tested in Ubuntu.
 7 | 
 8 | 
 9 | make
10 | chmod 744 *.sh
11 | sudo ./install.sh
12 | 
13 | 
14 | In some older systems you may need to replace MSRdrv.c with MSRdrv1.c if
15 | compilation gives errors.
16 | 
17 | If build directory is missing:
18 | 
19 | sudo ln -s /usr/src/linux-headers-`uname -r` /lib/modules/`uname -r`/build
20 | 
21 | Or if the target doesn't exist, e.g.:
22 | 
23 | sudo ln -s /usr/src/linux-headers-2.6.24-23-server /lib/modules/`uname -r`/build
24 | 
25 | In Red Hat/Fedora you may need the following:
26 | rpm -q kernel kernel-source
27 | or
28 | yum -y install kernel-devel kernel-headers
29 | If it installs a wrong version, run:
30 | yum distro-sync
31 | reboot
32 | ./install2.sh
33 | 
34 | 
35 | install.sh:
36 | 
37 | 	mknod /dev/MSRdrv c 222 0
38 | 	chmod 666 /dev/MSRdrv
39 | 	insmod -f MSRdrv.ko
40 | 	
41 | uninstall.sh:
42 | 
43 | 	rm -f /dev/MSRdrv
44 | 	rmmod MSRdrv
45 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_branch2.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from matplotlib import pyplot as plt
 3 | import numpy as np
 4 | 
 5 | # Reproduce Figure 4 of Half&Half
 6 | x_data = range(0, 16)
 7 | y_data = range(0, 6)
 8 | z_data = []
 9 | for branch_toggle in x_data:
10 |     temp = []
11 |     for target_toggle in y_data:
12 |         output = subprocess.check_output(
13 |             ["./my_branch2.sh2", "16", "16", str(branch_toggle), str(target_toggle)],
14 |             encoding="utf-8",
15 |         )
16 |         heading = False
17 |         data = []
18 |         for line in output.splitlines():
19 |             parts = list(filter(lambda s: len(s) > 0, line.strip().split(" ")))
20 |             if len(parts) > 0:
21 |                 if not heading:
22 |                     assert parts[5] == "BrMisCond"
23 |                     heading = True
24 |                 else:
25 |                     data.append(int(parts[4]))
26 |         avg = np.average(np.array(data)) / 2000  # 2 branches, 1000 loops
27 |         print(branch_toggle, target_toggle, f"{avg:.2f}")
28 |         temp.append(avg)
29 |     z_data.append(temp)
30 | 
31 | plt.imshow(z_data)
32 | plt.xlabel("Target toggle bit")
33 | plt.xticks(y_data)
34 | plt.ylabel("Branch toggle bit")
35 | plt.yticks(x_data)
36 | plt.savefig("my_branch2.png")
37 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/pointer_chasing.inc:
--------------------------------------------------------------------------------
 1 | ; pointer_chasing.inc                                                        2021-01-30 Agner Fog
 2 | 
 3 | ; Test cache access time
 4 | ; (c) 2013 - 2021 by Agner Fog. GNU General Public License www.gnu.org/licenses
 5 | 
 6 | ; Parameters:
 7 | ;
 8 | ; tcase: addressing mode
 9 | 
10 | ; define test data
11 | %macro testdata 0
12 | pointers dq 0,0,0,0
13 | %endmacro
14 | 
15 | ; initialization of pointers
16 | %macro testinit1 0
17 |     lea rsi, [pointers]
18 |     lea rdi, [rsi+8]
19 |     mov [rsi], rdi
20 |     add rdi, 8
21 |     mov [rsi+8], rdi
22 |     add rdi, 8
23 |     mov [rsi+16], rdi
24 |     add rdi, 8
25 |     mov [rsi+24], rsi
26 |     mov rdi, rsi
27 |     xor ebp, ebp
28 |     xor r8d, r8d
29 |     mov r9d, 1
30 | %endmacro
31 | 
32 | 
33 | ; main testcode macro
34 | %macro testcode 0
35 |     %if tcase == 1
36 |         mov rdi, [rdi]            ; base pointer only
37 |     %elif tcase == 2
38 |         mov rdi, [rdi + rbp * 2]  ; base + scaled index
39 |     %elif tcase == 3
40 |         mov rdi, [r8 + rdi * 1]   ; base + scaled index. latency through index
41 |     %elif tcase == 4
42 |         mov rdi, [rdi + r9 * 4 - 4]  ; base + scaled index + offset
43 |     %endif
44 | %endmacro
45 | 
46 | ; disable default test loops
47 | %define repeat1 1000
48 | %define repeat2 100
49 | 


--------------------------------------------------------------------------------
/figures/plot_pht_tag_bits_xor.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from matplotlib import pyplot as plt
 3 | import numpy as np
 4 | import csv
 5 | 
 6 | # Test branch toggles
 7 | x_data = [[], []]
 8 | y_data = [[], []]
 9 | z_data = [[], []]
10 | 
11 | with open("pht_tag_bits_xor.csv", newline="") as f:
12 |     r = csv.DictReader(f)
13 |     for row in r:
14 |         target = int(row["target"])
15 |         x_data[target].append(int(row["align"]))
16 |         y_data[target].append(int(row["dummy_branches"]))
17 |         z_data[target].append(float(row["min"]))
18 |     for i in range(2):
19 |         z_data[i] = np.array(z_data[i])
20 | 
21 | fig, axes = plt.subplots(2, figsize=(20, 10))
22 | for i in range(2):
23 |     x_data[i] = list(sorted(set(x_data[i])))
24 |     y_data[i] = list(sorted(set(y_data[i])))
25 |     z_data[i].resize(len(x_data[i]) * len(y_data[i]))
26 |     z_data[i] = z_data[i].reshape((len(x_data[i]), len(y_data[i])))
27 | 
28 |     axes[i].imshow(z_data[i])
29 |     axes[i].set_ylabel("Branch alignment")
30 |     axes[i].set_yticks(range(len(x_data[i])), x_data[i])
31 |     if i == 0:
32 |         axes[i].set_xlabel("PHRB bits")
33 |     else:
34 |         axes[i].set_xlabel("PHRT bits")
35 |     axes[i].set_xticks(range(len(y_data[i])), y_data[i], rotation=90)
36 |     axes[i].grid()
37 | plt.savefig("plot_pht_tag_bits_xor.png")
38 | plt.cla()
39 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_branch4.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from matplotlib import pyplot as plt
 3 | import numpy as np
 4 | 
 5 | # Reproduce Figure 4 of Half&Half
 6 | x_data = range(0, 16)
 7 | y_data = range(0, 6)
 8 | z_data = []
 9 | for branch_toggle in x_data:
10 |     temp = []
11 |     for target_toggle in y_data:
12 |         output = subprocess.check_output(
13 |             ["./my_branch4.sh2", "16", "16", str(branch_toggle), str(target_toggle)],
14 |             encoding="utf-8",
15 |         )
16 |         heading = False
17 |         data = []
18 |         for line in output.splitlines():
19 |             parts = list(filter(lambda s: len(s) > 0, line.strip().split(" ")))
20 |             if len(parts) > 0:
21 |                 if not heading:
22 |                     assert parts[5] == "BrMisCond"
23 |                     heading = True
24 |                 else:
25 |                     data.append(int(parts[4]))
26 |         # remove 500 mis-predictions in dummy branches
27 |         avg = (np.average(np.array(data)) - 500) / 2000  # 2 branches, 1000 loops
28 |         print(branch_toggle, target_toggle, f"{avg:.2f}")
29 |         temp.append(avg)
30 |     z_data.append(temp)
31 | 
32 | plt.imshow(z_data)
33 | plt.xlabel("Target toggle bit")
34 | plt.xticks(y_data)
35 | plt.ylabel("Branch toggle bit")
36 | plt.yticks(x_data)
37 | plt.savefig("my_branch4.png")
38 | 


--------------------------------------------------------------------------------
/figures/plot_pht_tag_bits_xor_phr.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from matplotlib import pyplot as plt
 3 | import numpy as np
 4 | import csv
 5 | 
 6 | # Test branch toggles
 7 | x_data = [[], []]
 8 | y_data = [[], []]
 9 | z_data = [[], []]
10 | 
11 | with open("pht_tag_bits_xor_phr.csv", newline="") as f:
12 |     r = csv.DictReader(f)
13 |     for row in r:
14 |         target = int(row["target"])
15 |         x_data[target].append(int(row["first_phr_bit"]))
16 |         y_data[target].append(int(row["dummy_branches"]))
17 |         z_data[target].append(float(row["min"]))
18 |     for i in range(2):
19 |         z_data[i] = np.array(z_data[i])
20 | 
21 | fig, axes = plt.subplots(2, figsize=(15, 10))
22 | for i in range(2):
23 |     x_data[i] = list(sorted(set(x_data[i])))
24 |     y_data[i] = list(sorted(set(y_data[i])))
25 |     z_data[i].resize(len(x_data[i]) * len(y_data[i]))
26 |     z_data[i] = z_data[i].reshape((len(x_data[i]), len(y_data[i])))
27 | 
28 |     axes[i].imshow(z_data[i])
29 |     if i == 0:
30 |         axes[i].set_ylabel("PHRB bits")
31 |     else:
32 |         axes[i].set_ylabel("PHRT bits")
33 |     axes[i].set_yticks(range(len(x_data[i])), x_data[i])
34 |     axes[i].set_xlabel("PHRT bits")
35 |     axes[i].set_xticks(range(len(y_data[i])), y_data[i], rotation=90)
36 |     axes[i].grid()
37 | 
38 | plt.savefig("plot_pht_tag_bits_xor_phr.png")
39 | plt.cla()
40 | 


--------------------------------------------------------------------------------
/figures/plot_phr_branch_bits_location.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from matplotlib import pyplot as plt
 3 | import numpy as np
 4 | import csv
 5 | 
 6 | # Reproduce Table 2 of Half&Half
 7 | # Test branch toggles
 8 | for prefix in [""]:
 9 |     x_data = []
10 |     y_data = []
11 |     z_data = []
12 | 
13 |     with open(f"{prefix}phr_branch_bits_location.csv", newline="") as f:
14 |         r = csv.DictReader(f)
15 |         for row in r:
16 |             x_data.append(int(row["toggle"]))
17 |             y_data.append(int(row["dummy"]))
18 |             z_data.append(min(float(row["avg"]), 0.5))
19 |         z_data = np.array(z_data)
20 | 
21 |     x_data = list(sorted(set(x_data)))
22 |     y_data = list(sorted(set(y_data)))
23 |     z_data = z_data.reshape((len(x_data), len(y_data)))
24 | 
25 |     plt.figure(figsize=(len(y_data) / 4, len(x_data) / 4))
26 |     plt.imshow(z_data)
27 |     plt.xlabel("Dummy branches")
28 |     plt.xticks(range(len(y_data)), y_data, rotation=90)
29 |     plt.ylabel("Branch toggle bit")
30 |     plt.yticks(range(len(x_data)), x_data)
31 |     bar = plt.colorbar(shrink=0.8, ticks=[0.25, 0.50])
32 |     bar.ax.set_ylabel("Misprediction rate", fontsize=8, rotation=270)
33 |     plt.clim(0.25, 0.50)
34 |     plt.grid()
35 |     plt.savefig(f"plot_{prefix}phr_branch_bits_location.png")
36 |     plt.savefig(f"plot_{prefix}phr_branch_bits_location.pdf", bbox_inches="tight")
37 |     plt.cla()
38 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/returnstack.inc:
--------------------------------------------------------------------------------
 1 | ; returnstack.inc                                                                 2012-02-22 AgF
 2 | 
 3 | ; Test return stack buffer
 4 | ; (c) 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
 5 | 
 6 | ; Parameters:
 7 | ;
 8 | ; nesting: function nesting level (minimum value = 2)
 9 | 
10 | %ifndef nesting
11 |    %define nesting 8
12 | %endif
13 | 
14 | ; define function named F%1 calling F%2
15 | %macro definefunc 2
16 |    F%1: nop
17 |    nop
18 |    nop
19 |    nop
20 |    call F%2
21 |    nop
22 |    nop
23 |    nop
24 |    nop
25 |    ret
26 |    nop
27 |    nop
28 | %endmacro
29 | 
30 | ; main testcode macro
31 | %macro testcode 0
32 |    jmp ZZZ
33 |    nop
34 |    nop
35 | 
36 |    align 16
37 |    ; define nested functions
38 |    %assign n nesting-1
39 |    %if n < 1
40 |       %assign n 1
41 |    %endif
42 | 
43 |    %assign i 0
44 |    %rep n
45 |       %assign i i+1
46 |       %assign j i+1
47 |       %if j > n
48 |          %assign j 9999
49 |       %endif
50 |       definefunc i, j
51 |    %endrep
52 | 
53 |    ; Last function
54 |    F9999:
55 |       nop
56 |       nop
57 |       ret
58 | 
59 |    align 16
60 |    ZZZ: nop
61 |       nop
62 |       call F1
63 |       nop
64 |       nop
65 |       call F1
66 |       nop
67 |       nop
68 | 
69 | %endmacro
70 | 
71 | ; disable default test loops
72 | %define repeat1 100
73 | %define repeat2 1
74 | 
75 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/out_of_order.sh2:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                out_of_order.sh2                          2014-08-03 Agner Fog
 3 | #
 4 | # PMC Test program for measuring out-of-order capacity
 5 | #
 6 | # (c) 2014 GNU General Public License www.gnu.org/licenses
 7 | 
 8 | # Test cases:
 9 | # 1: integer add
10 | # 2: integer mul
11 | # 3: floating point add
12 | # 4: floating point mul
13 | 
14 | . vars.sh
15 | 
16 | echo -e "Test out-of-order execution capacity"  > results2/out_of_order.txt
17 | 
18 | for tcase in {1..4}
19 | do
20 | 
21 | for chainlength in 10 100 1000 1000000
22 | do
23 | 
24 | if [ $tcase -eq 1 ]; then echo -e "\n\nCase 1: integer add, chainlength = $chainlength" >> results2/out_of_order.txt ; fi
25 | if [ $tcase -eq 2 ]; then echo -e "\n\nCase 2: integer mul, chainlength = $chainlength" >> results2/out_of_order.txt ; fi
26 | if [ $tcase -eq 3 ]; then echo -e "\n\nCase 3: floating point add, chainlength = $chainlength" >> results2/out_of_order.txt ; fi
27 | if [ $tcase -eq 4 ]; then echo -e "\n\nCase 4: floating point mul, chainlength = $chainlength" >> results2/out_of_order.txt ; fi
28 | 
29 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dchainlength=$chainlength -Dnthreads=1 -Pout_of_order.inc TemplateB64.nasm
30 | if [ $? -ne 0 ] ; then exit ; fi
31 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
32 | if [ $? -ne 0 ] ; then exit ; fi
33 | ./x >> results2/out_of_order.txt
34 | 
35 | done
36 | done
37 | 
38 | echo -e "\n"  >> results2/out_of_order.txt
39 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/unaligned_mem.sh2:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                                                                               2014-01-31 AgF
 3 | 
 4 | # Test unaligned memory read and write throughput
 5 | # (c) 2014 by Agner Fog. GNU General Public License www.gnu.org/licenses
 6 | 
 7 | # Parameters:
 8 | #
 9 | # regsize:   size of read or write operand, default = 32
10 | # roffset:   offset to aligned boundary, default = 0
11 | # alignment: alignment of boundary to cross
12 | # tmode:     R: read, W: write, WR: write, then read (store forwarding)
13 | 
14 | 
15 | . vars.sh
16 | 
17 | echo -e "\n\nMeasure unaligned read/write throughput\n"  > results2/unaligned_mem.txt
18 | 
19 | cachelinesize=`./cpugetinfo cachelinesize`
20 | 
21 | for tmode in R W WR
22 | do
23 | 
24 | for alignment in $(($cachelinesize/2)) $cachelinesize 4096
25 | do
26 | 
27 | for regsize in 32 128
28 | do
29 | 
30 | for roffset in 0 $(($regsize/8/4)) $(($regsize/8/2))
31 | do
32 | 
33 | 
34 | echo -e "\n\ntmode = $tmode, alignment = $alignment, register size = $regsize, offset = -$roffset \n"  >> results2/unaligned_mem.txt
35 | $ass -f elf64 -o b64.o -Dtmode=$tmode -Dalignment=$alignment -Droffset=$roffset -Dregsize=$regsize -Punaligned_mem.inc TemplateB64.nasm
36 | if [ $? -ne 0 ] ; then exit ; fi
37 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
38 | if [ $? -ne 0 ] ; then exit ; fi
39 | ./x >> results2/unaligned_mem.txt
40 | 
41 | done
42 | done
43 | done
44 | done
45 | 
46 | echo -e "\n"  >> results2/unaligned_mem.txt
47 | 


--------------------------------------------------------------------------------
/figures/plot_pht_index_bits_xor_phr.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from matplotlib import pyplot as plt
 3 | import numpy as np
 4 | import csv
 5 | 
 6 | # collect phr bits
 7 | phr_bits = []
 8 | with open("pht_index_bits_xor_phr.csv", newline="") as f:
 9 |     r = csv.DictReader(f)
10 |     for row in r:
11 |         phr_bits.append(int(row["phr"]))
12 | 
13 | phr_bits = list(sorted(list(set(phr_bits))))
14 | fig, axes = plt.subplots(len(phr_bits), figsize=(16, len(phr_bits) * 2.5))
15 | 
16 | for i, phr_bit in enumerate(phr_bits):
17 |     x_data = []
18 |     y_data = []
19 |     z_data = []
20 | 
21 |     with open("pht_index_bits_xor_phr.csv", newline="") as f:
22 |         r = csv.DictReader(f)
23 |         for row in r:
24 |             if int(row["phr"]) == phr_bit:
25 |                 x_data.append(int(row["branches"]))
26 |                 y_data.append(int(row["dummy"]))
27 |                 z_data.append(min(float(row["min"]), 0.5))
28 |         z_data = np.array(z_data)
29 | 
30 |     x_data = list(sorted(set(x_data)))
31 |     y_data = list(sorted(set(y_data)))
32 |     z_data = z_data.reshape((len(x_data), len(y_data)))
33 | 
34 |     axes[i].imshow(z_data)
35 |     axes[i].set_ylabel("Predict branches")
36 |     axes[i].set_yticks(range(len(x_data)), x_data)
37 |     axes[i].set_xlabel(f"PHR bit position injecting PHR[{phr_bit}]")
38 |     axes[i].set_xticks(range(len(y_data)), y_data, rotation=90)
39 | plt.subplots_adjust(hspace=1.0)
40 | plt.savefig("plot_pht_index_bits_xor_phr.png")
41 | plt.cla()
42 | 


--------------------------------------------------------------------------------
/figures/plot_phr_target_bits_location.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from matplotlib import pyplot as plt
 3 | import numpy as np
 4 | import csv
 5 | 
 6 | # Reproduce Table 2 of Half&Half
 7 | # Test target toggles
 8 | for prefix in [""]:
 9 |     x_data = []
10 |     y_data = []
11 |     z_data = []
12 | 
13 |     with open(f"{prefix}phr_target_bits_location.csv", newline="") as f:
14 |         r = csv.DictReader(f)
15 |         for row in r:
16 |             x_data.append(int(row["toggle"]))
17 |             y_data.append(int(row["dummy"]))
18 |             z_data.append(float(row["avg"]))
19 |         z_data = np.array(z_data)
20 | 
21 |     x_data = list(sorted(set(x_data)))
22 |     y_data = list(sorted(set(y_data)))
23 |     z_data = z_data.reshape((len(x_data), len(y_data)))
24 | 
25 |     plt.figure(figsize=(len(y_data) / 4, len(x_data) / 4))
26 |     plt.imshow(z_data)
27 |     plt.xlabel("Dummy branches")
28 |     plt.xticks(range(len(y_data)), y_data, rotation=90)
29 |     plt.ylabel("Target toggle bit")
30 |     plt.yticks(range(len(x_data)), x_data)
31 |     bar = plt.colorbar(shrink=0.8)
32 |     bar.ax.set_ylabel("Misprediction rate", fontsize=8, rotation=270, labelpad=9.0)
33 |     if np.max(z_data) > 0.7:
34 |         # missing conditional branch misses
35 |         plt.clim(0.5, 1.0)
36 |     else:
37 |         plt.clim(0, 0.5)
38 |     plt.grid()
39 |     plt.savefig(f"plot_{prefix}phr_target_bits_location.png")
40 |     plt.savefig(f"plot_{prefix}phr_target_bits_location.pdf", bbox_inches="tight")
41 |     plt.cla()
42 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/latencycf.inc:
--------------------------------------------------------------------------------
 1 | ; latencycf.inc                                              2016-10-23 Agner Fog
 2 | ; Define test code for measuring latency of miscellaneous instructions with carry flag output
 3 | ; (c) Copyright 2016 by Agner Fog. GNU General Public License www.gnu.org/licenses
 4 | 
 5 | ; Parameters:
 6 | ;
 7 | ; instruct:   Instruction to test
 8 | ;
 9 | ; regsize:    Register size: 8, 16, 32, 64, 128, 256, 512.
10 | ;
11 | ; regtype:    Register type: v = vector register 128 bits and bigger, k = mask register. 
12 | ;
13 | ; numop:      Number of register operands, not including flags register
14 | ;
15 | ; numimm:     Number of immediate operands. Default = 0
16 | ;
17 | ; immvalue:   Value of first immediate operand. Default = 0
18 | 
19 | 
20 | ; initialize eax
21 | %macro testinit2 0
22 |    xor eax, eax
23 | %endmacro
24 | 
25 | ; Define specific test code for each register type
26 | 
27 | %ifidni regtype, v
28 | 
29 |    %macro testcode 0
30 |       %if numop == 2
31 |          instruct reg1, reg1
32 |          setb al
33 |          %if regsize == 128
34 |             movd xmm1, eax
35 |          %else
36 |             vmovd xmm1, eax
37 |          %endif
38 |       %else
39 |          %error unsupported number of operands numop
40 |       %endif
41 |    %endmacro
42 | 
43 | %elifidni regtype, k
44 | 
45 |    %macro testcode 0
46 |       %if numop == 2
47 |          instruct k1,k1
48 |          setb al
49 |          kmovw k1,eax
50 |       %else
51 |          %error unsupported number of operands numop
52 |       %endif
53 |    %endmacro
54 | 
55 | %else
56 | 
57 |    %error unknown register type regtype
58 | 
59 | %endif
60 | 
61 | 


--------------------------------------------------------------------------------
/agner/testp/PMCTest/make_a_obj.bat:
--------------------------------------------------------------------------------
 1 | rem              make_a_obj.bat                     2014-04-16 Agner Fog
 2 | 
 3 | rem  compiles PMCTestA.cpp into PMCTestA32.obj and PMCTestA64.obj
 4 | 
 5 | rem  System requirements:
 6 | rem  Windows 2000 or NT or later
 7 | rem  Microsoft Visual C++ compiler or other C++ compiler
 8 | 
 9 | rem  You have to change all paths to the actual paths on your computer
10 | 
11 | rem  Set path to 32 bit compiler
12 | set VSroot=C:\Program Files (x86)\Microsoft Visual Studio 11.0
13 | set SDKroot=C:\Program Files (x86)\Windows Kits\8.0\
14 | set path1=%path%
15 | set path=%VSroot%\VC\bin;%VSroot%\Common7\IDE;%path1%
16 | 
17 | rem  Set path to *.h include files.
18 | set include=%VSroot%\VC\include;%SDKroot%\Include\um;%SDKroot%\Include\shared
19 | 
20 | rem  Set path to *.lib library files. 
21 | set LIB="%VSroot%\VC\lib;%SDKroot%\Lib\win8\um\x86"
22 | 
23 | rem compile 32 bit object file
24 | cl /c /O2 /FoPMCTestA32.obj PMCTestA.cpp
25 | if errorlevel 1 pause
26 | 
27 | rem compile 32bit exe file
28 | rem cl /O2 /MT /Fepmctest.exe PMCTestA32.obj PMCTestB.cpp "%SDKroot%\Lib\win8\um\x86\uuid.lib" "%VSroot%\VC\lib\libcmt.lib" "%VSroot%\VC\lib\oldnames.lib"
29 | 
30 | cl /O2 /MT /Fepmctest.exe PMCTestA.cpp PMCTestB.cpp Advapi32.lib /link /LIBPATH:"%SDKroot%\Lib\win8\um\x86" /LIBPATH:"%VSroot%\VC\lib"
31 | if errorlevel 1 pause
32 | 
33 | 
34 | 
35 | 
36 | rem  Set path to 64 bit compiler
37 | set path=%VSroot%\VC\bin\x86_amd64;%VSroot%\Common7\IDE;%path1%
38 | 
39 | rem  Set path to *.lib library files. 
40 | set lib="%VSroot%\VC\lib\amd64"
41 | 
42 | rem compile 64 bit version
43 | cl /c /O2 /FoPMCTestA64.obj PMCTestA.cpp
44 | if errorlevel 1 pause
45 | 
46 | pause
47 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_branch.inc:
--------------------------------------------------------------------------------
 1 | ; reproduce Figure 3 of Half&Half
 2 | 
 3 | ; alignment bits of branch instruction address
 4 | %ifndef branchalign
 5 |     %define branchalign 18
 6 | %endif
 7 | 
 8 | ; alignment bits of branch target address
 9 | %ifndef targetalign
10 |     %define targetalign 5
11 | %endif
12 | 
13 | %macro testinit3 0
14 |     mov rdi, 1000
15 | 
16 |     ; loop 300 times to clear phr
17 |     ; since we only consider branch misprediction of the last two branches
18 |     ; we do not have to be accurate here e.g. 93/194
19 | loop_begin:
20 |     mov eax, 300
21 |     align 64
22 |     jmp dummy_target
23 | 
24 |     align 1<<19
25 |     %rep (1<<19)-(1<<8)
26 |         nop
27 |     %endrep
28 | 
29 |     ; dummy_target aligned to 1<<8
30 | dummy_target:
31 |     %rep (1<<8)-7
32 |         nop
33 |     %endrep
34 |     dec eax ; 2 bytes
35 |     ; the last byte of jnz aligned to 1<<19
36 |     ; jnz dummy_target
37 |     db 0x0f
38 |     db 0x85
39 |     dd dummy_target - $ - 4
40 | 
41 |     READ_PMC_START
42 |     rdrand eax
43 |     and eax, 1
44 | 
45 |     ; READ_PMC_START: 166
46 |     ; rdrand eax: 3 bytes
47 |     ; and eax, 1: 3 bytes
48 |     ; jnz first_target: 6 bytes
49 | 
50 |     %rep (1<<branchalign)-166-6-6
51 |         nop
52 |     %endrep
53 | 
54 |     ; the last byte of jnz aligned to 1<<branchalign
55 |     ; jnz first_target
56 |     db 0x0f
57 |     db 0x85
58 |     dd first_target - $ - 4
59 | 
60 |     %rep (1<<targetalign)-1
61 |         nop
62 |     %endrep
63 |     ; target aligned to 1<<targetalign
64 | first_target:
65 | 
66 |     align 64
67 |     jnz second_target
68 | second_target:
69 | 
70 |     READ_PMC_END
71 | 
72 |     align 64
73 |     dec rdi
74 |     jnz loop_begin
75 | %endmacro


--------------------------------------------------------------------------------
/agner/testp/TestScripts/32bitinstr.sh1:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                                                                        2021-01-25 Agner Fog
 3 | #Test instructions that are only defined in 32 bit mode
 4 | # (c) Copyright 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
 5 | 
 6 | # Skip if 32 bit not supported
 7 | if   [ `grep -c -i "no32bit" cpuinfo.txt ` -ne 0 ] ; then
 8 | exit
 9 | fi
10 | 
11 | # Detect CPU specific variables
12 | . vars.sh
13 | 
14 | echo -e "\n"  > results1/32bitinstr.txt
15 | 
16 | if [ $support32bit -eq 0 ] ; then
17 | echo -e "\nError: 32-bit instructions cannot be compiled on this platform\n"  >> results1/32bitinstr.txt
18 | exit
19 | fi
20 | 
21 | for instruct in aaa aas daa das aad aam_latency aam_throughput bound into lahf sahf lahf_sahf leave pushad popad salc salc_inc_al
22 | do
23 | 
24 | echo -e "\n\ninstruction: $instruct "  >> results1/32bitinstr.txt
25 | for cts in $PMClist
26 | do
27 | $ass -f elf32 -o b32.o -Dinstruct=$instruct -Drepeat1=100 -Dcounters=$cts -P32bitinstr.inc TemplateB32.nasm
28 | if [ $? -ne 0 ] ; then exit ; fi
29 | g++ -fno-pie -no-pie -m32 -fno-pie -no-pie a32.o b32.o -ox -lpthread
30 | if [ $? -ne 0 ] ; then exit ; fi
31 | ./x  >> results1/32bitinstr.txt
32 | done
33 | done
34 | 
35 | for immvalue in {0..4}
36 | do 
37 | echo -e "\n\ninstruction: enter 4, $immvalue "  >> results1/32bitinstr.txt
38 | for cts in $PMClist
39 | do 
40 | $ass -f elf32 -o b32.o -Dinstruct=enter -Dimmvalue=$immvalue -Dcounters=$cts -P32bitinstr.inc TemplateB32.nasm
41 | if [ $? -ne 0 ] ; then exit ; fi
42 | g++ -fno-pie -no-pie -m32 -fno-pie -no-pie a32.o b32.o -ox -lpthread
43 | if [ $? -ne 0 ] ; then exit ; fi
44 | ./x  >> results1/32bitinstr.txt
45 | done
46 | done
47 | 
48 | echo -e "\n"  >> results1/32bitinstr.txt
49 | 
50 | 


--------------------------------------------------------------------------------
/src/ras_size_lib.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | #include <assert.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <vector>
 6 | 
 7 | // ref:
 8 | // https://zhuanlan.zhihu.com/p/595585895
 9 | 
10 | // generated in ras_size_gen.cpp
11 | // args: loop count
12 | typedef void (*gadget)(size_t);
13 | extern "C" {
14 | extern gadget ras_size_gadgets[];
15 | }
16 | 
17 | void ras_size(FILE *fp) { // times each ras_size_gadgets entry and writes one CSV row per size to fp
18 | #ifdef GEM5
19 |   int loop_count = 10; // smaller loop count when built with GEM5 defined
20 | #else
21 |   int loop_count = 1000;
22 | #endif
23 |   // match gen_ras_test: presumably one gadget was generated per size in [min_size, max_size] -- confirm against the generator
24 |   int min_size = 1;
25 |   int max_size = 128;
26 | 
27 |   bind_to_core();
28 |   setup_time_or_cycles();
29 |   fprintf(fp, "size,min,avg,max\n"); // CSV header
30 |   int gadget_index = 0; // advances in lockstep with size, starting at entry 0
31 |   for (int size = min_size; size <= max_size; size++) {
32 |     std::vector<double> history; // per-sample normalized timings, warmup excluded
33 |     int iterations = 100;
34 |     history.reserve(iterations);
35 | 
36 |     double sum = 0; // running total of the samples kept in history
37 |     // run the same gadget `iterations` times and time each call
38 |     for (int i = 0; i < iterations; i++) {
39 |       uint64_t begin = get_time_or_cycles();
40 |       ras_size_gadgets[gadget_index](loop_count); // the gadget receives the loop count as its only argument
41 |       uint64_t elapsed = get_time_or_cycles() - begin;
42 | 
43 |       // skip warmup: the first 10 samples are not recorded
44 |       if (i >= 10) {
45 |         double time = (double)elapsed / loop_count / size; // elapsed normalized by loop count and by size
46 |         history.push_back(time);
47 |         sum += time;
48 |       }
49 |     }
50 |     gadget_index++; // next size uses the next gadget
51 | 
52 |     double min = history[0]; // safe: iterations (100) exceeds the 10 warmup runs, so history is non-empty
53 |     double max = history[0];
54 |     for (size_t i = 0; i < history.size(); i++) {
55 |       if (min > history[i]) {
56 |         min = history[i];
57 |       }
58 |       if (max < history[i]) {
59 |         max = history[i];
60 |       }
61 |     }
62 |     fprintf(fp, "%d,%.2lf,%.2lf,%.2lf\n", size, min, sum / history.size(), max); // columns: size, min, avg, max
63 |     fflush(fp); // flush after every row
64 |   }
65 |   return;
66 | }
67 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/unaligned_mem.inc:
--------------------------------------------------------------------------------
 1 | ; unaligned_mem.inc                                                        2014-01-31 AgF
 2 | 
 3 | ; Test unaligned memory read and write throughput
 4 | ; (c) 2014 by Agner Fog. GNU General Public License www.gnu.org/licenses
 5 | 
 6 | ; Parameters:
 7 | ;
 8 | ; regsize:   size of read or write operand, default = 32
 9 | ; roffset:   offset to aligned boundary, default = 0
10 | ; alignment: alignment of boundary to cross
11 | ; tmode:     R: read, W: write, WR: write, then read (store forwarding)
12 | 
13 | %ifndef regsize
14 |    %define regsize 32          ; default operand size in bits (see "Parameters" above)
15 | %endif
16 | 
17 | %ifndef roffset
18 |    %define roffset 0
19 | %endif
20 | 
21 | %ifndef alignment
22 |    %define alignment 64
23 | %endif
24 | 
25 | %ifndef tmode
26 |    %define tmode R
27 | %endif
28 | 
29 | ; define move instruction
30 | %if regsize < 65  
31 |    %define moveinst  mov
32 | %elif regsize == 65  
33 |    %define moveinst  movq
34 | %elif regsize == 128  
35 |    %define moveinst  movdqu
36 | %elif regsize == 256  
37 |    %define moveinst  vmovdqu
38 | %else
39 |    %error unknown register size
40 | %endif
41 | 
42 | 
43 | ; initialization of aligned or misaligned pointer
44 | %macro testinit1 0
45 |     lea psi, [UserData + 1000h]
46 |     and psi, -1000h
47 | %endmacro
48 | 
49 | 
50 | ; main testcode macro: one access at roffset bytes below the boundary at psi+alignment
51 | %macro testcode 0
52 |    %ifidni tmode, R
53 |       moveinst reg0, [psi+alignment-roffset]   ; read only
54 |    %elifidni tmode, W
55 |       moveinst [psi+alignment-roffset], reg0   ; write only
56 |    %elifidni tmode, WR
57 |       moveinst [psi+alignment-roffset], reg0   ; write ...
58 |       moveinst reg0, [psi+alignment-roffset]   ; ... then read back the same bytes (store forwarding)
59 |    %endif
60 | %endmacro
61 | 
62 | 
63 | %macro testdata 0
64 |         times 10000H  DB 0
65 | %endmacro
66 | 
67 | 
68 | ; test loops
69 | %define repeat1 1000
70 | %define repeat2 100
71 | 


--------------------------------------------------------------------------------
/agner/testp/PMCTest/MSRDriver.h:
--------------------------------------------------------------------------------
 1 | //                       msrdriver.h                     2012-03-02 Agner Fog
 2 | 
 3 | // Device driver for access to Model-specific registers and control registers
 4 | // in Windows 2000 and later and Linux (32 and 64 bit x86 platform) 
 5 | 
 6 | // (c) Copyright 2005-2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
 7 | 
 8 | #pragma once
 9 | 
10 | // list of input/output data structures for MSR driver
11 | #define MAX_QUE_ENTRIES 32                  // maximum number of entries in queue
12 | 
13 | // commands for MSR driver. Shared with application program
14 | enum EMSR_COMMAND {
15 |     MSR_IGNORE = 0,                // do nothing
16 |     MSR_STOP   = 1,                // skip rest of list
17 |     MSR_READ   = 2,                // read model specific register
18 |     MSR_WRITE  = 3,                // write model specific register
19 |     CR_READ    = 4,                // read control register
20 |     CR_WRITE   = 5,                // write control register
21 |     PMC_ENABLE = 6,                // Enable RDPMC and RDTSC instructions
22 |     PMC_DISABLE= 7,                // Disable RDPMC instruction (RDTSC remains enabled)
23 |     PROC_GET   = 8,                // Get processor number (In multiprocessor systems. 0-based)
24 |     PROC_SET   = 9,                // Set processor number (In multiprocessor systems. 0-based)
25 |     UNUSED1    = 0x7fffffff        // make sure this enum takes 32 bits
26 | };
27 | 
28 | 
29 | // input/output data structure for MSR driver
30 | struct SMSRInOut {
31 |     enum EMSR_COMMAND msr_command;      // command for read or write register
32 |     unsigned int register_number;  // register number
33 |     union {
34 |         long long value;            // 64 bit value to read or write
35 |         unsigned int val[2];        // lower and upper 32 bits
36 |     };
37 | };
38 | 


--------------------------------------------------------------------------------
/agner/testp/DriverSrcLinux/MSRDriver.h:
--------------------------------------------------------------------------------
 1 | //                       msrdriver.h                     2012-03-02 Agner Fog
 2 | 
 3 | // Device driver for access to Model-specific registers and control registers
 4 | // in Windows 2000 and later and Linux (32 and 64 bit x86 platform) 
 5 | 
 6 | // (c) Copyright 2005-2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
 7 | 
 8 | #pragma once
 9 | 
10 | // list of input/output data structures for MSR driver
11 | #define MAX_QUE_ENTRIES 32                  // maximum number of entries in queue
12 | 
13 | // Command codes for the MSR driver. Shared with the application program; every value is spelled out explicitly.
14 | enum EMSR_COMMAND {
15 |     MSR_IGNORE = 0,                // do nothing
16 |     MSR_STOP   = 1,                // skip rest of list
17 |     MSR_READ   = 2,                // read model specific register
18 |     MSR_WRITE  = 3,                // write model specific register
19 |     CR_READ    = 4,                // read control register
20 |     CR_WRITE   = 5,                // write control register
21 |     PMC_ENABLE = 6,                // Enable RDPMC and RDTSC instructions
22 |     PMC_DISABLE= 7,                // Disable RDPMC instruction (RDTSC remains enabled)
23 |     PROC_GET   = 8,                // Get processor number (In multiprocessor systems. 0-based)
24 |     PROC_SET   = 9,                // Set processor number (In multiprocessor systems. 0-based)
25 |     UNUSED1    = 0x7fffffff        // make sure this enum takes 32 bits
26 | };
27 | 
28 | 
29 | // Input/output record for the MSR driver: a command, a register number and a 64-bit value
30 | struct SMSRInOut {
31 |     enum EMSR_COMMAND msr_command;      // command for read or write register
32 |     unsigned int register_number;  // register number
33 |     union {                         // anonymous union: both members overlay the same storage
34 |         long long value;            // 64 bit value to read or write
35 |         unsigned int val[2];        // lower and upper 32 bits
36 |     };
37 | };
38 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/MSRDriver.h:
--------------------------------------------------------------------------------
 1 | //                       msrdriver.h                     2012-03-02 Agner Fog
 2 | 
 3 | // Device driver for access to Model-specific registers and control registers
 4 | // in Windows 2000 and later and Linux (32 and 64 bit x86 platform) 
 5 | 
 6 | // (c) Copyright 2005-2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
 7 | 
 8 | #pragma once
 9 | 
10 | // list of input/output data structures for MSR driver
11 | #define MAX_QUE_ENTRIES 32                  // maximum number of entries in queue
12 | 
13 | // Command codes for the MSR driver. Shared with the application program; every value is spelled out explicitly.
14 | enum EMSR_COMMAND {
15 |     MSR_IGNORE = 0,                // do nothing
16 |     MSR_STOP   = 1,                // skip rest of list
17 |     MSR_READ   = 2,                // read model specific register
18 |     MSR_WRITE  = 3,                // write model specific register
19 |     CR_READ    = 4,                // read control register
20 |     CR_WRITE   = 5,                // write control register
21 |     PMC_ENABLE = 6,                // Enable RDPMC and RDTSC instructions
22 |     PMC_DISABLE= 7,                // Disable RDPMC instruction (RDTSC remains enabled)
23 |     PROC_GET   = 8,                // Get processor number (In multiprocessor systems. 0-based)
24 |     PROC_SET   = 9,                // Set processor number (In multiprocessor systems. 0-based)
25 |     UNUSED1    = 0x7fffffff        // make sure this enum takes 32 bits
26 | };
27 | 
28 | 
29 | // Input/output record for the MSR driver: a command, a register number and a 64-bit value
30 | struct SMSRInOut {
31 |     enum EMSR_COMMAND msr_command;      // command for read or write register
32 |     unsigned int register_number;  // register number
33 |     union {                         // anonymous union: both members overlay the same storage
34 |         long long value;            // 64 bit value to read or write
35 |         unsigned int val[2];        // lower and upper 32 bits
36 |     };
37 | };
38 | 


--------------------------------------------------------------------------------
/results/rob_size/README.md:
--------------------------------------------------------------------------------
 1 | # Measured ROB Size
 2 | 
 3 | - AMD Zen 1: 192
 4 | - AMD Zen 2: 224
 5 | - Intel Ivy Bridge EP: 168
 6 | - Intel Broadwell: 192
 7 | - Intel Cascade Lake: 224
 8 | - ARM Cortex A72: 40
 9 | - ARM Neoverse N1: 128
10 | - IBM Power 8: ?
11 | 
12 | References:
13 | 
14 | https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(client)
15 | - Intel Broadwell: 192 
16 | - Intel Skylake(client): 224
17 | 
18 | https://en.wikichip.org/wiki/intel/microarchitectures/cascade_lake
19 | - Intel Cascade Lake: 224
20 | 
21 | https://www.anandtech.com/show/14514/examining-intels-ice-lake-microarchitecture-and-sunny-cove/3
22 | - Intel Haswell: 182
23 | - Intel Skylake: 224
24 | - Intel Sunny Cove: 352
25 | 
26 | https://en.wikichip.org/wiki/intel/microarchitectures/broadwell_(client)
27 | - Intel Broadwell: 192
28 | 
29 | https://en.wikichip.org/wiki/intel/microarchitectures/haswell_(client)
30 | - Intel Haswell: 192
31 | 
32 | https://en.wikichip.org/wiki/amd/microarchitectures/zen_2
33 | - AMD Zen 1: 192
34 | - AMD Zen 2: 224
35 | 
36 | https://www.anandtech.com/show/16226/apple-silicon-m1-a14-deep-dive/2
37 | - Apple M1: ~630
38 | 
39 | https://www.anandtech.com/show/10435/assessing-ibms-power8-part-1/3
40 | - IBM Power 8: 224
41 | - Intel Broadwell: 192
42 | - Intel Skylake: 224
43 | 
44 | https://en.wikichip.org/wiki/arm_holdings/microarchitectures/neoverse_n1
45 | - ARM Neoverse N1: 128
46 | 
47 | https://www.tomshardware.com/reviews/arm-cortex-a72-architecture,4424.html
48 | - ARM Cortex A72: 128
49 | 
50 | https://travisdowns.github.io/blog/2019/06/11/speed-limits.html#ooo-table
51 | - Intel Sandy Bridge: 168
52 | - Intel Ivy Bridge: 168
53 | - Intel Haswell: 192
54 | - Intel Broadwell: 192
55 | - Intel Skylake: 224
56 | - Intel Sunny Cove: 352
57 | - AMD Zen: 192
58 | - AMD Zen 2: 224
59 | - AMD Zen 3: 256
60 | - Apple M1 Firestorm: 636
61 | - Apple M1 Icestorm: 111
62 | - Amazon Graviton 2: ~124


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_branch.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from matplotlib import pyplot as plt
 3 | import numpy as np
 4 | 
 5 | # Reproduce Figure 3(a) of Half&Half
 6 | x_data = range(1, 20)
 7 | y_data = []
 8 | for branch_align in x_data:
 9 |     output = subprocess.check_output(
10 |         ["./my_branch.sh2", str(branch_align), "6"], encoding="utf-8"
11 |     )
12 |     heading = False
13 |     data = []
14 |     for line in output.splitlines():
15 |         parts = list(filter(lambda s: len(s) > 0, line.strip().split(" ")))
16 |         if len(parts) > 0:
17 |             if not heading:
18 |                 assert parts[5] == "BrMisCond"
19 |                 heading = True
20 |             else:
21 |                 data.append(int(parts[4]))
22 |     avg = np.average(np.array(data)) / 2000  # 2 branches, 1000 loops
23 |     print(branch_align, 6, f"{avg:.2f}")
24 |     y_data.append(avg)
25 | 
26 | plt.plot(x_data, y_data)
27 | plt.xlabel("Branch alignment bits")
28 | plt.ylabel("Miss Rate")
29 | plt.yticks([0.25, 0.50])
30 | plt.savefig("my_branch_1.png")
31 | plt.cla()
32 | 
33 | # Reproduce Figure 3(b) of Half&Half
34 | x_data = range(1, 19)
35 | y_data = []
36 | for target_align in x_data:
37 |     output = subprocess.check_output(
38 |         ["./my_branch.sh2", "16", str(target_align)], encoding="utf-8"
39 |     )
40 |     heading = False
41 |     data = []
42 |     for line in output.splitlines():
43 |         parts = list(filter(lambda s: len(s) > 0, line.strip().split(" ")))
44 |         if len(parts) > 0:
45 |             if not heading:
46 |                 assert parts[5] == "BrMisCond"
47 |                 heading = True
48 |             else:
49 |                 data.append(int(parts[4]))
50 |     avg = np.average(np.array(data)) / 2000  # 2 branches, 1000 loops
51 |     print(15, target_align, f"{avg:.2f}")
52 |     y_data.append(avg)
53 | 
54 | plt.plot(x_data, y_data)
55 | plt.xlabel("Target alignment bits")
56 | plt.ylabel("Miss Rate")
57 | plt.yticks([0.25, 0.50])
58 | plt.savefig("my_branch_2.png")
59 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/jmp.sh2:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                                                       2013-07-19 Agner Fog
 3 | 
 4 | #Test jump instructions performance
 5 | 
 6 | # (c) Copyright 2013 by Agner Fog. GNU General Public License www.gnu.org/licenses
 7 | 
 8 | # Detect CPU specific variables
 9 | . vars.sh
10 | 
11 | # Performance counters
12 | if  [ "$CPUbrand" = "Intel" -a $imodel -ne 28 ] ; then
13 | # This one is for Intel processors with uop cache:
14 | cts="1,9,100,25,26,207"
15 | else
16 | cts=$BranchPMCs
17 | fi
18 | # On an assembler or linker failure the script stops with status 1 (a bare "exit" would return the 0 of the [ ] test)
19 | # Start a fresh results file; everything below appends to it
20 | echo -e "Test jump instructions"  > results2/jmp.txt
21 | # Unconditional jmp at several densities per 16 bytes
22 | for jmp_per_16b in 1 2 3 4 5 6 8
23 | do 
24 | echo -e "\n\njmp ($jmp_per_16b per 16 bytes), 64 bit"  >> results2/jmp.txt
25 | $ass -f elf64 -o b64.o -Dinstruct=jmp -Drepeat1=1000 -Dregsize=64 -Djmp_per_16b=$jmp_per_16b -Dcounters=$cts -Pmisc_int.inc TemplateB64.nasm
26 | if [ $? -ne 0 ] ; then exit 1 ; fi
27 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
28 | if [ $? -ne 0 ] ; then exit 1 ; fi
29 | ./x  >> results2/jmp.txt
30 | done
31 | # Conditional jump with jmptaken=yes, then jmptaken=no, 2 per 16 bytes
32 | for jmptaken in yes no
33 | do
34 | echo -e "\n\nconditional jump, taken=$jmptaken, 2 per 16 bytes, 64 bit"  >> results2/jmp.txt
35 | $ass -f elf64 -o b64.o -Dinstruct=conditional_jmp -Drepeat1=1000 -Dregsize=64 -Djmp_per_16b=2 -Djmptaken=$jmptaken -Dcounters=$cts -Pmisc_int.inc TemplateB64.nasm
36 | if [ $? -ne 0 ] ; then exit 1 ; fi
37 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
38 | if [ $? -ne 0 ] ; then exit 1 ; fi
39 | ./x  >> results2/jmp.txt
40 | done
41 | # Conditional jump with jmptaken=alternate at several densities per 16 bytes
42 | for jmp_per_16b in 1 2 3 4 5 6 8
43 | do
44 | echo -e "\n\nconditional jump, taken=alternate, $jmp_per_16b per 16 bytes, 64 bit"  >> results2/jmp.txt
45 | $ass -f elf64 -o b64.o -Dinstruct=conditional_jmp -Drepeat1=1000 -Dregsize=64 -Djmp_per_16b=$jmp_per_16b -Djmptaken=alternate -Drepeat0=10 -Dcounters=$cts  -Pmisc_int.inc TemplateB64.nasm
46 | if [ $? -ne 0 ] ; then exit 1 ; fi
47 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
48 | if [ $? -ne 0 ] ; then exit 1 ; fi
49 | ./x  >> results2/jmp.txt
50 | done
51 | 
52 | echo -e "\n"  >> results2/jmp.txt
53 | 
54 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_branch2.inc:
--------------------------------------------------------------------------------
 1 | ; reproduce Figure 4 of Half&Half
 2 | 
 3 | ; alignment bits of branch instruction address
 4 | %ifndef branchalign
 5 |     %define branchalign 18
 6 | %endif
 7 | 
 8 | ; alignment bits of branch target address
 9 | %ifndef targetalign
10 |     %define targetalign 5
11 | %endif
12 | 
13 | ; toggle bit of branch address
14 | %ifndef branchtoggle
15 |     %define branchtoggle 0
16 | %endif
17 | 
18 | ; toggle bit of target address
19 | %ifndef targettoggle
20 |     %define targettoggle 0
21 | %endif
22 | ; testinit3: 1000 rounds; each clears the PHR with a dummy loop, then counts PMC events over a random branch and a second jnz on the same flags
23 | %macro testinit3 0
24 |     mov rdi, 1000
25 | 
26 |     ; loop 300 times to clear phr
27 |     ; since we only consider branch misprediction of the last two branches
28 |     ; we do not have to be accurate here e.g. 93/194
29 | loop_begin:
30 |     mov eax, 300
31 |     align 64
32 |     jmp dummy_target
33 |     ; pad so that dummy_target lands 1<<8 bytes before a 1<<19 boundary
34 |     align 1<<19
35 |     %rep (1<<19)-(1<<8)
36 |         nop
37 |     %endrep
38 | 
39 |     ; dummy_target aligned to 1<<8
40 | dummy_target:
41 |     %rep (1<<8)-7
42 |         nop
43 |     %endrep
44 |     dec eax ; 2 bytes
45 |     ; the last byte of jnz aligned to 1<<19
46 |     ; jnz dummy_target
47 |     db 0x0f ; 0F 85 rel32: jnz with a 32-bit displacement (6 bytes)
48 |     db 0x85
49 |     dd dummy_target - $ - 4
50 | 
51 |     READ_PMC_START
52 |     rdrand eax
53 |     and eax, 1
54 | 
55 |     ; READ_PMC_START: 166 bytes (size assumed by the padding below)
56 |     ; rdrand eax: 3 bytes
57 |     ; and eax, 1: 3 bytes
58 |     ; jnz first_target: 6 bytes
59 | 
60 |     %rep (1<<branchalign)-166-6-6
61 |         nop
62 |     %endrep
63 | 
64 |     %rep (1<<branchtoggle)
65 |         nop
66 |     %endrep
67 | 
68 |     ; the last byte of jnz minus 1<<branchtoggle aligned to 1<<branchalign
69 |     ; jnz first_target
70 |     db 0x0f ; 0F 85 rel32: jnz with a 32-bit displacement (6 bytes)
71 |     db 0x85
72 |     dd first_target - $ - 4
73 | 
74 |     %rep (1<<targetalign)-1-(1<<branchtoggle)
75 |         nop
76 |     %endrep
77 |     %rep (1<<targettoggle)
78 |         nop
79 |     %endrep
80 |     ; target minus 1<<targettoggle aligned to 1<<targetalign
81 | first_target:
82 |     ; second branch: tests the same flags as the first jnz (only nops and the jnz lie in between)
83 |     align 64
84 |     jnz second_target
85 | second_target:
86 | 
87 |     READ_PMC_END
88 | 
89 |     align 64
90 |     dec rdi
91 |     jnz loop_begin
92 | %endmacro


--------------------------------------------------------------------------------
/src/ghr_size_lib.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | #include <assert.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <vector>
 6 | 
 7 | // generated in ghr_size_gen.cpp
 8 | // args: loop count, buffer
 9 | typedef void (*gadget)(size_t, uint32_t *);
10 | extern "C" {
11 | extern gadget ghr_size_gadgets[];
12 | }
13 | 
14 | // Run every gadget in ghr_size_gadgets (one per size in [min_size, max_size])
15 | // and write a CSV table "size,min,avg,max" to fp.
16 | //
17 | // Each gadget is executed 100 times on a freshly randomised 0/1 buffer; the
18 | // first 10 runs are discarded as warmup. Each reported sample is the branch
19 | // miss delta (elapsed time when built with IOS) divided by loop_count and
20 | // repeat. fp must be an open, writable stream; it is flushed after every row
21 | // and is not closed here.
22 | void ghr_size(FILE *fp) {
23 |   int loop_count = 1000;
24 |   // match gen_ghr_test
25 |   int repeat = 2;
26 |   int min_size = 1;
27 |   int max_size = 1024;
28 | 
29 |   bind_to_core();
30 | #ifdef IOS
31 |   // no pmu
32 | #else
33 |   setup_perf_branch_misses();
34 | #endif
35 |   assert(fp);
36 | 
37 |   uint32_t *buffer = new uint32_t[loop_count + 1];
38 | 
39 |   fprintf(fp, "size,min,avg,max\n");
40 |   int gadget_index = 0;
41 |   for (int size = min_size; size <= max_size; size++) {
42 |     std::vector<double> history;
43 |     int iterations = 100;
44 |     history.reserve(iterations);
45 | 
46 |     double sum = 0;
47 |     // run several times
48 |     for (int i = 0; i < iterations; i++) {
49 |       // refill the buffer with random 0/1 values before every run;
50 |       // uses its own index so the outer run counter is not shadowed
51 |       for (int j = 0; j <= loop_count; j++) {
52 |         buffer[j] = rand() % 2;
53 |       }
54 | #ifdef IOS
55 |       // fallback
56 |       uint64_t begin = get_time();
57 | #else
58 |       uint64_t begin = perf_read_branch_misses();
59 | #endif
60 | 
61 |       ghr_size_gadgets[gadget_index](loop_count, buffer);
62 | 
63 | #ifdef IOS
64 |       // fallback
65 |       uint64_t elapsed = get_time() - begin;
66 | #else
67 |       uint64_t elapsed = perf_read_branch_misses() - begin;
68 | #endif
69 | 
70 |       // skip warmup
71 |       if (i >= 10) {
72 |         double time = (double)elapsed / loop_count / repeat;
73 |         history.push_back(time);
74 |         sum += time;
75 |       }
76 |     }
77 |     // gadgets are stored in the same order as the sizes are visited
78 |     gadget_index++;
79 | 
80 |     double min = history[0];
81 |     double max = history[0];
82 |     for (size_t i = 0; i < history.size(); i++) {
83 |       if (min > history[i]) {
84 |         min = history[i];
85 |       }
86 |       if (max < history[i]) {
87 |         max = history[i];
88 |       }
89 |     }
90 |     fprintf(fp, "%d,%.2lf,%.2lf,%.2lf\n", size, min, sum / history.size(), max);
91 |     fflush(fp);
92 |   }
93 |   delete[] buffer;
94 | }
95 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_branch4.inc:
--------------------------------------------------------------------------------
 1 | ; reproduce Figure 4 of Half&Half
 2 | ; additionally shift PHR by 16 bits to avoid tag collision
 3 | 
 4 | ; alignment bits of branch instruction address
 5 | %ifndef branchalign
 6 |     %define branchalign 18
 7 | %endif
 8 | 
 9 | ; alignment bits of branch target address
10 | %ifndef targetalign
11 |     %define targetalign 5
12 | %endif
13 | 
14 | ; toggle bit of branch address
15 | %ifndef branchtoggle
16 |     %define branchtoggle 0
17 | %endif
18 | 
19 | ; toggle bit of target address
20 | %ifndef targettoggle
21 |     %define targettoggle 0
22 | %endif
23 | ; SHIFT_PHR n: dec/jnz loop starting from n+1, so the jnz is taken n times; overwrites eax and leaves ZF set on exit
24 | %macro SHIFT_PHR 1
25 |     mov eax, %1+1
26 | 
27 |     align 1<<16
28 |     %rep (1<<16)-(1<<6)
29 |         nop
30 |     %endrep
31 | 
32 |     ; dummy_target aligned to 1<<6
33 | %%shift_phr_dummy_target:
34 |     %rep (1<<6)-7
35 |         nop
36 |     %endrep
37 |     dec eax ; 2 bytes
38 |     ; the last byte of jnz aligned to 1<<16
39 |     ; jnz shift_phr_dummy_target
40 |     db 0x0f ; 0F 85 rel32: jnz with a 32-bit displacement (6 bytes)
41 |     db 0x85
42 |     dd %%shift_phr_dummy_target - $ - 4
43 | %endmacro
44 | ; testinit3: 1000 rounds; each clears the PHR, then counts PMC events over a random branch, an 8-iteration SHIFT_PHR and a second jnz
45 | %macro testinit3 0
46 |     mov rdi, 1000
47 | 
48 | loop_begin:
49 |     ; loop 300 times to clear phr
50 |     SHIFT_PHR 300
51 | 
52 |     READ_PMC_START
53 |     rdrand eax
54 |     and eax, 1
55 | 
56 |     ; READ_PMC_START: 166 bytes (size assumed by the padding below)
57 |     ; rdrand eax: 3 bytes
58 |     ; and eax, 1: 3 bytes
59 |     ; jnz first_target: 6 bytes
60 | 
61 |     %rep (1<<branchalign)-166-6-6
62 |         nop
63 |     %endrep
64 | 
65 |     %rep (1<<branchtoggle)
66 |         nop
67 |     %endrep
68 | 
69 |     ; the last byte of jnz minus 1<<branchtoggle aligned to 1<<branchalign
70 |     ; jnz first_target
71 |     db 0x0f ; 0F 85 rel32: jnz with a 32-bit displacement (6 bytes)
72 |     db 0x85
73 |     dd first_target - $ - 4
74 | 
75 |     %rep (1<<targetalign)-1-(1<<branchtoggle)
76 |         nop
77 |     %endrep
78 |     %rep (1<<targettoggle)
79 |         nop
80 |     %endrep
81 |     ; target minus 1<<targettoggle aligned to 1<<targetalign
82 | first_target:
83 | 
84 |     ; loop to shift phr by 16 bits in 8 loops
85 |     SHIFT_PHR 8
86 |     ; NOTE(review): SHIFT_PHR overwrites eax and exits with ZF set, so the jnz below no longer tests the rdrand bit - confirm this is intended
87 |     align 64
88 |     jnz second_target
89 | second_target:
90 | 
91 |     READ_PMC_END
92 | 
93 |     align 64
94 |     dec rdi
95 |     jnz loop_begin
96 | %endmacro


--------------------------------------------------------------------------------
/src/rob_size.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | #include <assert.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <vector>
 6 | 
 7 | // generated in rob_size_gen.cpp
 8 | // args: buffer1, buffer2, loop count
 9 | typedef void (*gadget)(char ***, char ***, size_t);
10 | extern "C" {
11 | extern gadget rob_size_gadgets[];
12 | }
13 | 
14 | // Time every gadget in rob_size_gadgets (one per size in [min_size, max_size])
15 | // and write a "size,min,avg,max" table to rob_size.csv.
16 | //
17 | // Each gadget runs 100 times on two random pointer-chasing buffers; the first
18 | // 10 runs are discarded as warmup and each remaining measurement is divided
19 | // by loop_count and repeat. Returns 0; aborts via assert if the CSV cannot be
20 | // opened.
21 | int main(int argc, char *argv[]) {
22 | #ifdef GEM5
23 |   int loop_count = 10;
24 | #else
25 |   int loop_count = 1000;
26 | #endif
27 |   // match gen_rob_test
28 |   int repeat = 20;
29 |   int min_size = 1;
30 |   int max_size = 1024;
31 | 
32 |   bind_to_core();
33 |   setup_time_or_cycles();
34 |   FILE *fp = fopen("rob_size.csv", "w");
35 |   assert(fp);
36 | 
37 | #ifdef GEM5
38 |   size_t buffer_size = 1024 * 1024 * 16; // 16 MB
39 | #else
40 |   size_t buffer_size = 1024 * 1024 * 256; // 256 MB
41 | #endif
42 |   // p1/p2 are the chase cursors; the gadgets receive their addresses
43 |   char **buffer1 = generate_random_pointer_chasing(buffer_size);
44 |   char **p1 = buffer1;
45 |   char **buffer2 = generate_random_pointer_chasing(buffer_size);
46 |   char **p2 = buffer2;
47 |   fprintf(fp, "size,min,avg,max\n");
48 |   for (int size = min_size; size <= max_size; size++) {
49 |     std::vector<double> history;
50 |     int iterations = 100;
51 |     history.reserve(iterations);
52 | 
53 |     double sum = 0;
54 |     // run several times
55 |     for (int i = 0; i < iterations; i++) {
56 |       uint64_t begin = get_time_or_cycles();
57 |       rob_size_gadgets[size - min_size](&p1, &p2, loop_count);
58 |       uint64_t elapsed = get_time_or_cycles() - begin;
59 | 
60 |       // skip warmup
61 |       if (i >= 10) {
62 |         double time = (double)elapsed / loop_count / repeat;
63 |         history.push_back(time);
64 |         sum += time;
65 |       }
66 |     }
67 | 
68 |     double min = history[0];
69 |     double max = history[0];
70 |     for (size_t i = 0; i < history.size(); i++) {
71 |       if (min > history[i]) {
72 |         min = history[i];
73 |       }
74 |       if (max < history[i]) {
75 |         max = history[i];
76 |       }
77 |     }
78 |     fprintf(fp, "%d,%.2lf,%.2lf,%.2lf\n", size, min, sum / history.size(), max);
79 |     fflush(fp);
80 |   }
81 | 
82 |   // close the CSV so the stream is released before reporting success
83 |   fclose(fp);
84 | 
85 |   printf("Results are written to rob_size.csv\n");
86 |   // NOTE(review): assumes generate_random_pointer_chasing allocates with
87 |   // new[] - confirm against its definition
88 |   delete[] buffer1;
89 |   delete[] buffer2;
90 |   return 0;
91 | }
92 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_branch5.inc:
--------------------------------------------------------------------------------
  1 | ; reproduce Figure 5 of Half&Half
  2 | 
  3 | ; alignment bits of branch instruction address
  4 | %ifndef branchalign
  5 |     %define branchalign 18
  6 | %endif
  7 | ; SHIFT_PHR n: dec/jnz loop starting from n+1, so the jnz is taken n times; overwrites eax and leaves ZF set on exit
  8 | %macro SHIFT_PHR 1
  9 |     mov eax, %1+1
 10 | 
 11 |     align 1<<16
 12 |     %rep (1<<16)-(1<<6)
 13 |         nop
 14 |     %endrep
 15 | 
 16 |     ; dummy_target aligned to 1<<6
 17 | %%shift_phr_dummy_target:
 18 |     %rep (1<<6)-7
 19 |         nop
 20 |     %endrep
 21 |     dec eax ; 2 bytes
 22 |     ; the last byte of jnz aligned to 1<<16
 23 |     ; jnz shift_phr_dummy_target
 24 |     db 0x0f ; 0F 85 rel32: jnz with a 32-bit displacement (6 bytes)
 25 |     db 0x85
 26 |     dd %%shift_phr_dummy_target - $ - 4
 27 | %endmacro
 28 | ; testinit3: 10000 rounds; a random bit k in ebx drives two test branches, only the second is inside READ_PMC_START/READ_PMC_END
 29 | %macro testinit3 0
 30 |     mov rdi, 10000
 31 | 
 32 | loop_begin:
 33 |     ; k = rand() % 2
 34 |     rdrand ebx
 35 |     and ebx, 1
 36 | 
 37 |     ; set phr to k0...0
 38 |     ; set lower two bits to 0bk0
 39 |     align 1<<5
 40 |     %rep (1<<4)-1
 41 |         nop
 42 |     %endrep
 43 |     ; B3=0, B4=1
 44 |     jnz zero_target
 45 | 
 46 |     align 1<<5
 47 |     %rep (1<<5)-1
 48 |         nop
 49 |     %endrep
 50 |     ; B3=0, B4=0
 51 |     jz zero_target
 52 | 
 53 |     ; T0=T1=0
 54 |     align 1<<6
 55 | zero_target:
 56 | 
 57 |     ; run the SHIFT_PHR loop 193 times
 58 |     SHIFT_PHR 193
 59 |     
 60 |     ; first test branch (flags are recomputed from ebx because SHIFT_PHR changes them)
 61 |     test ebx, ebx
 62 |     align 1<<(branchalign+1)
 63 |     %rep (1<<branchalign)-1
 64 |         nop
 65 |     %endrep
 66 |     jnz first_target
 67 | first_target:
 68 | 
 69 |     ; set phr to k0...0 again
 70 |     test ebx, ebx
 71 |     
 72 |     ; set lower two bits to 0bk0
 73 |     align 1<<5
 74 |     %rep (1<<4)-1
 75 |         nop
 76 |     %endrep
 77 |     ; B3=0, B4=1
 78 |     jnz zero_target_2
 79 |     
 80 |     align 1<<5
 81 |     %rep (1<<5)-1
 82 |         nop
 83 |     %endrep
 84 |     ; B3=0, B4=0
 85 |     jz zero_target_2
 86 | 
 87 |     ; T0=T1=0
 88 |     align 1<<6
 89 | zero_target_2:
 90 | 
 91 |     ; run the SHIFT_PHR loop 193 times
 92 |     SHIFT_PHR 193
 93 | 
 94 |     READ_PMC_START
 95 |     ; second test branch
 96 |     ; in the opposite direction
 97 |     test ebx, ebx ; 2 bytes
 98 |     align 1<<(branchalign+1)
 99 |     %rep (1<<branchalign)-1
100 |         nop
101 |     %endrep
102 |     jz second_target
103 | second_target:
104 | 
105 |     READ_PMC_END
106 | 
107 |     align 64
108 |     dec rdi
109 |     jnz loop_begin
110 |     
111 | %endmacro


--------------------------------------------------------------------------------
/agner/testp/TestScripts/ucache_misprediction.sh2:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                                                                    2013-07-19 Agner Fog
 3 | # Compile and run PMCTest for testing branch prediction penalty with and without
 4 | # microop cache
 5 | 
 6 | # (c) Copyright 2013 by Agner Fog. GNU General Public License www.gnu.org/licenses
 7 | 
 8 | # Detect CPU specific variables
 9 | . vars.sh
10 | 
11 | # Performance counters
12 | if  [ "$CPUbrand" = "Intel" -a $imodel -ne 28 ] ; then
13 | # This one is for Intel processors with uop cache:
14 | cts="1,9,100,25,26,207"
15 | else
16 | cts=$BranchPMCs
17 | fi
18 | # On an assembler or linker failure the script stops with status 1 (a bare "exit" would return the 0 of the [ ] test)
19 | # Start a fresh results file; everything below appends to it
20 | echo -e "Branch prediction"  > results2/ucache_misprediction.txt
21 | 
22 | repeat0=20
23 | 
24 | echo -e "\n\nCase 1: Tiny loop. Expect loop counter to be used"  >> results2/ucache_misprediction.txt
25 | tcase=1
26 | 
27 | for count1 in  10 100 1000
28 | do
29 | echo -e "\n\nLoop count $count1"  >> results2/ucache_misprediction.txt
30 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dcount1=$count1 -Dcounters=$cts -Drepeat0=$repeat0 -Pucache_misprediction.inc TemplateB64.nasm
31 | if [ $? -ne 0 ] ; then exit 1 ; fi
32 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
33 | if [ $? -ne 0 ] ; then exit 1 ; fi
34 | ./x >> results2/ucache_misprediction.txt
35 | done
36 | 
37 | echo -e "\n\nCase 2: Normal loop. Expect uop cache to be used"  >> results2/ucache_misprediction.txt
38 | tcase=2
39 | 
40 | for count1 in  4 10 100
41 | do
42 | echo -e "\n\nNumber of branches $count1"  >> results2/ucache_misprediction.txt
43 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dcount1=$count1 -Dcounters=$cts -Drepeat0=$repeat0 -Pucache_misprediction.inc TemplateB64.nasm
44 | if [ $? -ne 0 ] ; then exit 1 ; fi
45 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
46 | if [ $? -ne 0 ] ; then exit 1 ; fi
47 | ./x >> results2/ucache_misprediction.txt
48 | done
49 | 
50 | echo -e "\n\nCase 3: Extremely big loop. Expect only fetch and decode to be used"  >> results2/ucache_misprediction.txt
51 | tcase=3
52 | 
53 | for count1 in  4 10 100
54 | do
55 | echo -e "\n\nNumber of branches $count1"  >> results2/ucache_misprediction.txt
56 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dcount1=$count1 -Dcounters=$cts -Drepeat0=$repeat0 -Pucache_misprediction.inc TemplateB64.nasm
57 | if [ $? -ne 0 ] ; then exit 1 ; fi
58 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
59 | if [ $? -ne 0 ] ; then exit 1 ; fi
60 | ./x >> results2/ucache_misprediction.txt
61 | done
62 | 
63 | 
64 | 
65 | echo -e "\n"  >> results2/ucache_misprediction.txt
66 | 


--------------------------------------------------------------------------------
/src/pht_associativity.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | #include <assert.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <vector>
 6 | 
 7 | // defined in gen_pht_associativity_test()
 8 | // args: loop count, buffer
 9 | typedef void (*gadget)(size_t, uint32_t *);
10 | extern "C" {
11 | extern gadget pht_associativity_gadgets[];
12 | }
13 | 
14 | // Count conditional-branch misses for every gadget in
15 | // pht_associativity_gadgets and write a "branches,align,min,avg,max" table to
16 | // pht_associativity.csv.
17 | //
18 | // Gadgets are visited in (branches, branch_align) order, branch_align
19 | // varying fastest. Each gadget runs 100 times on one shared random 0/1
20 | // buffer; the first 10 runs are discarded as warmup and each remaining
21 | // measurement is divided by loop_count. Returns 0; aborts via assert if the
22 | // CSV cannot be opened.
23 | int main(int argc, char *argv[]) {
24 |   int loop_count = 1000;
25 |   // match gen_pht_associativity_test
26 |   int min_branches = 1;
27 |   int max_branches = 32;
28 |   int min_branch_align = 3;
29 | #ifdef __APPLE__
30 |   // alignment cannot surpass page size
31 |   int max_branch_align = 8;
32 | #else
33 |   int max_branch_align = 19;
34 | #endif
35 | 
36 |   bind_to_core();
37 |   setup_perf_cond_branch_misses();
38 |   FILE *fp = fopen("pht_associativity.csv", "w");
39 |   assert(fp);
40 | 
41 |   // filled once; every gadget run sees the same random pattern
42 |   uint32_t *buffer = new uint32_t[loop_count + 1];
43 |   for (int i = 0; i <= loop_count; i++) {
44 |     buffer[i] = rand() % 2;
45 |   }
46 | 
47 |   fprintf(fp, "branches,align,min,avg,max\n");
48 |   int gadget_index = 0;
49 |   for (int branches = min_branches; branches <= max_branches; branches++) {
50 |     for (int branch_align = min_branch_align; branch_align <= max_branch_align;
51 |          branch_align++) {
52 |       std::vector<double> history;
53 |       int iterations = 100;
54 |       history.reserve(iterations);
55 | 
56 |       double sum = 0;
57 |       // run several times
58 |       for (int i = 0; i < iterations; i++) {
59 |         uint64_t begin = perf_read_cond_branch_misses();
60 |         pht_associativity_gadgets[gadget_index](loop_count, buffer);
61 |         uint64_t elapsed = perf_read_cond_branch_misses() - begin;
62 | 
63 |         // skip warmup
64 |         if (i >= 10) {
65 |           double time = (double)elapsed / loop_count;
66 |           history.push_back(time);
67 |           sum += time;
68 |         }
69 |       }
70 |       gadget_index++;
71 | 
72 |       double min = history[0];
73 |       double max = history[0];
74 |       for (size_t i = 0; i < history.size(); i++) {
75 |         if (min > history[i]) {
76 |           min = history[i];
77 |         }
78 |         if (max < history[i]) {
79 |           max = history[i];
80 |         }
81 |       }
82 |       fprintf(fp, "%d,%d,%.2lf,%.2lf,%.2lf\n", branches, branch_align, min,
83 |               sum / history.size(), max);
84 |       fflush(fp);
85 |     }
86 |   }
87 | 
88 |   // close the CSV so the stream is released before reporting success
89 |   fclose(fp);
90 | 
91 |   printf("Results are written to pht_associativity.csv\n");
92 |   delete[] buffer;
93 |   return 0;
94 | }
95 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/out_of_order.inc:
--------------------------------------------------------------------------------
  1 | ;----------------------------------------------------------------------------
  2 | ;                out_of_order.inc                          2014-08-03 Agner Fog
  3 | ;
  4 | ; PMC Test program for measuring out-of-order capacity
  5 | ;
  6 | ; (c) 2014 GNU General Public License www.gnu.org/licenses
  7 | ;
  8 | ; Test cases:
  9 | ; 1: integer add
 10 | ; 2: integer mul
 11 | ; 3: floating point add
 12 | ; 4: floating point mul
 13 | ;-----------------------------------------------------------------------------
 14 | ; Define any undefined macros
 15 | 
 16 | ; total number of instructions (default used when the caller does not define totalrep)
 17 | %ifndef totalrep
 18 |    %define totalrep 1000000
 19 | %endif
 20 | 
 21 | %if chainlength < totalrep
 22 |    %define repeat1 (totalrep / chainlength / 2) ; testcode emits two chains in this case, hence the /2
 23 |    %define clength chainlength
 24 | %else
 25 |    %define repeat1 (totalrep / 100)              ; single chain of fixed length 100
 26 |    %define clength 100
 27 | %endif
 28 | 
 29 | 
 30 | ; Define test cases: chain1 and chain2 each repeat one instruction clength times on their own register
 31 | 
 32 | %if tcase == 1   ; integer add
 33 | 
 34 |    %macro chain1 0
 35 |       %rep clength
 36 |          add eax,eax
 37 |       %endrep
 38 |    %endmacro
 39 | 
 40 |    %macro chain2 0
 41 |       %rep clength
 42 |          add ebx,ebx
 43 |       %endrep
 44 |    %endmacro
 45 | 
 46 | %elif tcase == 2   ; integer mul
 47 | 
 48 |    %macro chain1 0
 49 |       %rep clength
 50 |          imul eax,eax
 51 |       %endrep
 52 |    %endmacro
 53 | 
 54 |    %macro chain2 0
 55 |       %rep clength
 56 |          imul ebx,ebx
 57 |       %endrep
 58 |    %endmacro
 59 | 
 60 | %elif tcase == 3   ; floating point add
 61 | 
 62 |    %macro chain1 0
 63 |       %rep clength
 64 |          addps xmm1,xmm1
 65 |       %endrep
 66 |    %endmacro
 67 | 
 68 |    %macro chain2 0
 69 |       %rep clength
 70 |          addps xmm2,xmm2
 71 |       %endrep
 72 |    %endmacro
 73 | 
 74 | %elif tcase == 4   ; floating point mul
 75 | 
 76 |    %macro chain1 0
 77 |       %rep clength
 78 |          mulps xmm1,xmm1
 79 |       %endrep
 80 |    %endmacro
 81 | 
 82 |    %macro chain2 0
 83 |       %rep clength
 84 |          mulps xmm2,xmm2
 85 |       %endrep
 86 |    %endmacro
 87 | 
 88 | %else
 89 |    %error unknown test case tcase
 90 | %endif
 91 | 
 92 | ; testcode: chain1 followed by chain2 when chainlength < totalrep, otherwise chain1 alone
 93 | %macro testcode 0
 94 |    %if chainlength < totalrep
 95 |       chain1
 96 |       chain2
 97 |    %else
 98 |       chain1
 99 |    %endif
100 | %endmacro
101 | 
102 | 
103 | ; disable default test loops
104 | %define repeat2 1
105 | 
106 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/read_write_bandwidth.sh2:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                                        2021-03-23 Agner Fog
 3 | 
 4 | # Test the maximum number of memory reads and writes per clock cycle
 5 | 
 6 | # (c) 2013-2021 by Agner Fog. GNU General Public License www.gnu.org/licenses
 7 | 
 8 | . vars.sh
 9 | 
10 | pmclist1="$PMClist"
11 | # On an assembler or linker failure the script stops with status 1 (a bare "exit" would return the 0 of the [ ] test)
12 | echo -e "\nTest the maximum number of memory reads and writes per clock cycle"  > results2/read_write_bandwidth.txt
13 | echo -e "Test modes:"  >> results2/read_write_bandwidth.txt
14 | echo -e "R:    read only"  >> results2/read_write_bandwidth.txt
15 | echo -e "W:    write only"  >> results2/read_write_bandwidth.txt
16 | echo -e "RW:   one read and one write"  >> results2/read_write_bandwidth.txt
17 | echo -e "RRW:  two reads and one write"  >> results2/read_write_bandwidth.txt
18 | echo -e "RRRW: three reads and one write"  >> results2/read_write_bandwidth.txt
19 | echo -e "RWW:  one read and two writes"  >> results2/read_write_bandwidth.txt
20 | echo -e "RWW2: one read and two writes to different cache lines"  >> results2/read_write_bandwidth.txt
21 | 
22 | for tmode in R W RW RRW RRRW RWW RWW2
23 | do
24 | 
25 | # The modes are:
26 | # R:    read only
27 | # W:    write only
28 | # RW:   one read and one write
29 | # RRW:  two reads and one write
30 | # RRRW: three reads and one write
31 | # RWW:  one read and two writes
32 | # RWW2: one read and two writes to different cache lines
33 | echo -e "\n\n===========================================\n"  >> results2/read_write_bandwidth.txt
34 | echo -e "test mode = $tmode\n"  >> results2/read_write_bandwidth.txt
35 | echo -e "===========================================\n\n"  >> results2/read_write_bandwidth.txt
36 | 
37 | # Check if AVX supported
38 | if  [ `grep -c -i "avx"  cpuinfo.txt ` -ne 0 ] ; then
39 | reg256=256
40 | else
41 | reg256=
42 | fi
43 | 
44 | # Check if AVX512 supported
45 | if  [ `grep -c -i "avx512"  cpuinfo.txt ` -ne 0 ] ; then
46 | reg512=512
47 | else
48 | reg512=
49 | fi
50 | 
51 | # Unsupported sizes expand to nothing and are skipped by the loop below
52 | for regsize in 8 16 32 64 128 $reg256 $reg512
53 | do
54 | 
55 | echo -e "\n\nRegister size = $regsize bits"  >> results2/read_write_bandwidth.txt
56 | # Split the counter list on spaces: one assemble/link/run per entry
57 | IFS=" "
58 | for pmc in $pmclist1 ; do
59 | 
60 | $ass -f elf64 -o b64.o -Dtmode=$tmode -Dregsize=$regsize -Dcounters=$pmc -Pread_write_bandwidth.inc TemplateB64.nasm
61 | if [ $? -ne 0 ] ; then exit 1 ; fi
62 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
63 | if [ $? -ne 0 ] ; then exit 1 ; fi
64 | ./x >> results2/read_write_bandwidth.txt
65 | 
66 | done
67 | IFS=","    # NOTE(review): sets IFS to a comma instead of restoring the previous value - confirm intended
68 | 
69 | done
70 | done
71 | 
72 | echo -e "\n"  >> results2/read_write_bandwidth.txt
73 | 


--------------------------------------------------------------------------------
/src/ras_size_gen.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | // Emits ras_size_N for N = 1..128: a counted loop whose body makes N nested calls.
 3 | // https://github.com/ChipsandCheese/Microbenchmarks/blob/master/AsmGen/tests/ReturnStackTest.cs
 4 | int main(int argc, char *argv[]) {
 5 |   // argv[1] is the output path; if it is missing fp stays NULL and the assert fires
 6 |   FILE *fp = argc > 1 ? fopen(argv[1], "w") : NULL;
 7 |   assert(fp);
 8 |   int min_size = 1;
 9 |   int max_size = 128;
10 |   // args: loop count
11 |   fprintf(fp, ".text\n");
12 |   for (int size = min_size; size <= max_size; size++) {
13 |     // entry
14 |     fprintf(fp, ".global ras_size_%d\n", size);
15 |     fprintf(fp, ".balign 64\n");
16 |     fprintf(fp, "ras_size_%d:\n", size);
17 | #ifdef HOST_AARCH64
18 |     // save lr
19 |     fprintf(fp, "\tsub sp, sp, #0x20\n");
20 |     fprintf(fp, "\tstp x29, x30, [sp, #0x10]\n");
21 | 
22 |     fprintf(fp, "\t1:\n");
23 |     // call function
24 |     fprintf(fp, "\tbl ras_func_%d\n", size - 1);
25 |     fprintf(fp, "\tsubs x0, x0, #1\n");
26 |     fprintf(fp, "\tbne 1b\n");
27 | 
28 |     // restore lr
29 |     fprintf(fp, "\tldp x29, x30, [sp, #0x10]\n");
30 |     fprintf(fp, "\tadd sp, sp, #0x20\n");
31 |     fprintf(fp, "\tret\n");
32 | #elif defined(HOST_AMD64)
33 |     fprintf(fp, "\t1:\n");
34 |     // call function
35 |     fprintf(fp, "\tcall ras_func_%d\n", size - 1);
36 |     fprintf(fp, "\tdec %%rdi\n");
37 |     fprintf(fp, "\tjne 1b\n");
38 |     fprintf(fp, "\tret\n");
39 | #endif
40 | 
41 |     // inner function
42 |     fprintf(fp, ".global ras_func_%d\n", size);
43 |     fprintf(fp, ".balign 64\n");
44 |     fprintf(fp, "ras_func_%d:\n", size);
45 | 
46 |     // TODO: if we don't want BTB to predict target address for ret
47 |     // we can use two bl, and alternate between the two using x0
48 | 
49 | #ifdef HOST_AARCH64
50 |     // save lr
51 |     fprintf(fp, "\tsub sp, sp, #0x20\n");
52 |     fprintf(fp, "\tstp x29, x30, [sp, #0x10]\n");
53 | 
54 |     // call lower function
55 |     fprintf(fp, "\tbl ras_func_%d\n", size - 1);
56 | 
57 |     // restore lr
58 |     fprintf(fp, "\tldp x29, x30, [sp, #0x10]\n");
59 |     fprintf(fp, "\tadd sp, sp, #0x20\n");
60 |     fprintf(fp, "\tret\n");
61 | #elif defined(HOST_AMD64)
62 |     fprintf(fp, "\tcall ras_func_%d\n", size - 1);
63 |     fprintf(fp, "\tret\n");
64 | #endif
65 |   }
66 | 
67 |   // recursion base
68 |   fprintf(fp, ".global ras_func_%d\n", 0);
69 |   fprintf(fp, ".balign 32\n");
70 |   fprintf(fp, "ras_func_%d:\n", 0);
71 |   fprintf(fp, "\tret\n");
72 |   // gadget table via the utils.h helpers (presumably an array of the entry points above)
73 |   define_gadgets_array(fp, "ras_size_gadgets");
74 |   for (int size = min_size; size <= max_size; size++) {
75 |     add_gadget(fp, "ras_size_%d", size);
76 |   }
77 |   return fclose(fp) == 0 ? 0 : 1; // a failed flush/close becomes a non-zero exit
78 | }
79 | 


--------------------------------------------------------------------------------
/src/rob_size_gen.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | // Emits rob_size_N for N = 1..1024: unrolled pointer-chasing loads padded with N-1 nops.
 3 | int main(int argc, char *argv[]) {
 4 |   FILE *fp = argc > 1 ? fopen(argv[1], "w") : NULL; // argv[1]: output path
 5 |   assert(fp);
 6 |   int repeat = 20;
 7 |   int min_size = 1;
 8 |   int max_size = 1024;
 9 |   // args: loop count, buffer
10 |   fprintf(fp, ".text\n");
11 |   for (int size = min_size; size <= max_size; size++) {
12 |     fprintf(fp, ".global rob_size_%d\n", size);
13 |     fprintf(fp, ".align 4\n");
14 |     fprintf(fp, "rob_size_%d:\n", size);
15 | #ifdef HOST_AARCH64
16 |     // int sqrt_count = 8;
17 |     fprintf(fp, "\tldr x3, [x0]\n");
18 |     fprintf(fp, "\t1:\n");
19 |     for (int i = 0; i < repeat; i++) {
20 |       fprintf(fp, "\tldr x3, [x3]\n");
21 |       // use sqrt if necessary
22 |       // for (int j = 0; j < sqrt_count; j++) {
23 |       //   fprintf(fp, "\tfsqrt d0, d0\n");
24 |       // }
25 |       for (int j = 0; j < size - 1; j++) {
26 |         fprintf(fp, "\tnop\n");
27 |       }
28 |     }
29 |     fprintf(fp, "\tsubs x2, x2, #1\n");
30 |     fprintf(fp, "\tbne 1b\n");
31 |     fprintf(fp, "\tstr x3, [x0]\n");
32 |     fprintf(fp, "\tret\n");
33 | #elif defined(HOST_AMD64)
34 |     fprintf(fp, "\tmovq 0(%%rdi), %%r8\n");
35 |     fprintf(fp, "\tmovq 0(%%rsi), %%r9\n");
36 |     fprintf(fp, "\tmovq %%rdx, %%rax\n");
37 |     fprintf(fp, "\t1:\n");
38 |     for (int i = 0; i < repeat; i++) {
39 |       fprintf(fp, "\tmovq (%%r8), %%r8\n");
40 |       for (int j = 0; j < size - 1; j++) {
41 |         fprintf(fp, "\tnop\n");
42 |       }
43 |       fprintf(fp, "\tmovq (%%r9), %%r9\n");
44 |       // forbid further speculation
45 |       fprintf(fp, "\tlfence\n");
46 |       fprintf(fp, "\tmfence\n");
47 |     }
48 |     fprintf(fp, "\tsubl $1, %%eax\n");
49 |     fprintf(fp, "\tjne 1b\n");
50 |     fprintf(fp, "\tmovq %%r8, 0(%%rdi)\n");
51 |     fprintf(fp, "\tmovq %%r9, 0(%%rsi)\n");
52 |     fprintf(fp, "\tret\n");
53 | #elif defined(__loongarch__)
54 |     fprintf(fp, "\tld.d $a3, $a0, 0\n");
55 |     fprintf(fp, "\t1:\n");
56 |     for (int i = 0; i < repeat; i++) {
57 |       fprintf(fp, "\tld.d $a3, $a3, 0\n");
58 |       for (int j = 0; j < size - 1; j++) {
59 |         fprintf(fp, "\tnop\n");
60 |       }
61 |     }
62 |     fprintf(fp, "\taddi.d $a2, $a2, -1\n");
63 |     fprintf(fp, "\tbne $a2, $zero, 1b\n");
64 |     fprintf(fp, "\tst.d $a3, $a0, 0\n");
65 |     fprintf(fp, "\tret\n");
66 | #endif
67 |   }
68 |   // gadget table via the utils.h helpers (presumably an array of the entry points above)
69 |   define_gadgets_array(fp, "rob_size_gadgets");
70 |   for (int size = min_size; size <= max_size; size++) {
71 |     add_gadget(fp, "rob_size_%d", size);
72 |   }
73 |   return fclose(fp) == 0 ? 0 : 1; // a failed flush/close becomes a non-zero exit
74 | }
75 | 


--------------------------------------------------------------------------------
/src/pht_index_tag_bits.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | #include <assert.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <vector>
 6 | 
 7 | // defined in gen_pht_index_tag_bits_test()
 8 | // args: loop count, buffer
 9 | typedef void (*gadget)(size_t, uint32_t *);
10 | extern "C" {
11 | extern gadget pht_index_tag_bits_gadgets[];
12 | }
13 | // Records branch misses per loop iteration for each branch alignment into pht_index_tag_bits.csv.
14 | int main(int argc, char *argv[]) {
15 |   int loop_count = 1000;
16 |   // match gen_pht_index_tag_bits_test
17 |   int min_branch_align = 2;
18 | #ifdef __APPLE__
19 |   // cannot surpass page size
20 |   int max_branch_align = 13;
21 | #else
22 |   int max_branch_align = 19;
23 | #endif
24 | 
25 |   bind_to_core();
26 | #ifdef NO_COND_BRANCH_MISSES
27 |   setup_perf_branch_misses();
28 | #else
29 |   setup_perf_cond_branch_misses();
30 | #endif
31 |   FILE *fp = fopen("pht_index_tag_bits.csv", "w");
32 |   assert(fp);
33 | 
34 |   uint32_t *buffer = new uint32_t[loop_count + 1];
35 | 
36 |   fprintf(fp, "align,min,avg,max\n");
37 |   int gadget_index = 0;
38 |   for (int branch_align = min_branch_align; branch_align <= max_branch_align;
39 |        branch_align++) {
40 |     std::vector<double> history;
41 |     int iterations = 100;
42 |     history.reserve(iterations);
43 | 
44 |     double sum = 0;
45 |     // run several times, with a fresh random 0/1 pattern for every run
46 |     for (int i = 0; i < iterations; i++) {
47 |       for (int j = 0; j <= loop_count; j++) {
48 |         buffer[j] = rand() % 2;
49 |       }
50 | 
51 | #ifdef NO_COND_BRANCH_MISSES
52 |       uint64_t begin = perf_read_branch_misses();
53 | #else
54 |       uint64_t begin = perf_read_cond_branch_misses();
55 | #endif
56 |       pht_index_tag_bits_gadgets[gadget_index](loop_count, buffer);
57 | #ifdef NO_COND_BRANCH_MISSES
58 |       uint64_t elapsed = perf_read_branch_misses() - begin;
59 | #else
60 |       uint64_t elapsed = perf_read_cond_branch_misses() - begin;
61 | #endif
62 | 
63 |       // skip warmup
64 |       if (i >= 10) {
65 |         double time = (double)elapsed / loop_count;
66 |         history.push_back(time);
67 |         sum += time;
68 |       }
69 |     }
70 |     gadget_index++;
71 | 
72 |     double min = history[0];
73 |     double max = history[0];
74 |     for (size_t i = 0; i < history.size(); i++) {
75 |       if (min > history[i]) {
76 |         min = history[i];
77 |       }
78 |       if (max < history[i]) {
79 |         max = history[i];
80 |       }
81 |     }
82 |     fprintf(fp, "%d,%.2lf,%.2lf,%.2lf\n", branch_align, min,
83 |             sum / history.size(), max);
84 |     fflush(fp);
85 |   }
86 |   fclose(fp); // release the CSV before announcing it
87 |   printf("Results are written to pht_index_tag_bits.csv\n");
88 |   delete[] buffer;
89 |   return 0;
90 | }
91 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_branch3.inc:
--------------------------------------------------------------------------------
  1 | ; reproduce Table 2 of Half&Half
  2 | 
  3 | ; alignment bits of branch instruction address
  4 | %ifndef branchalign
  5 |     %define branchalign 18
  6 | %endif
  7 | 
  8 | ; alignment bits of branch target address
  9 | %ifndef targetalign
 10 |     %define targetalign 5
 11 | %endif
 12 | 
 13 | ; toggle bit of branch address (-1 means do not toggle)
 14 | %ifndef branchtoggle
 15 |     %define branchtoggle 0
 16 | %endif
 17 | 
 18 | ; toggle bit of target address (-1 means do not toggle)
 19 | %ifndef targettoggle
 20 |     %define targettoggle 0
 21 | %endif
 22 | 
 23 | ; number of dummy branches
 24 | %ifndef dummybranches
 25 |     %define dummybranches 5
 26 | %endif
 27 | ; SHIFT_PHR n: run a dec/jnz loop n+1 times (the jnz is taken n times).
 28 | %macro SHIFT_PHR 1
 29 |     mov eax, %1+1
 30 |     ; eax counts the remaining loop passes
 31 |     align 1<<16
 32 |     %rep (1<<16)-(1<<6)
 33 |         nop
 34 |     %endrep
 35 |     ; the nops above put the label 64 bytes before the next 1<<16 boundary
 36 |     ; dummy_target aligned to 1<<6
 37 | %%shift_phr_dummy_target:
 38 |     %rep (1<<6)-7
 39 |         nop
 40 |     %endrep
 41 |     dec eax ; 2 bytes
 42 |     ; the last byte of jnz aligned to 1<<16
 43 |     ; jnz shift_phr_dummy_target
 44 |     db 0x0f
 45 |     db 0x85
 46 |     dd %%shift_phr_dummy_target - $ - 4
 47 | %endmacro
 48 | ; testinit3: 1000 rounds (rdi) of clear PHR, train branch, shift PHR, test branch.
 49 | %macro testinit3 0
 50 |     mov rdi, 1000
 51 | 
 52 | loop_begin:
 53 | 
 54 |     ; loop to clear phr
 55 |     SHIFT_PHR 200
 56 | 
 57 |     ; train branch
 58 |     READ_PMC_START
 59 |     rdrand ebx
 60 |     and ebx, 1
 61 |     ; READ_PMC_START: 166 bytes
 62 |     ; rdrand ebx: 3 bytes
 63 |     ; and ebx, 1: 3 bytes
 64 |     ; jnz first_target: 6 bytes
 65 |     %rep (1<<branchalign)-166-6-6
 66 |         nop
 67 |     %endrep
 68 | 
 69 |     %if branchtoggle != -1
 70 |     %rep (1<<branchtoggle)
 71 |         nop
 72 |     %endrep
 73 |     %endif
 74 |     ; the last byte of jnz - 1<<branchtoggle aligned to 1<<branchalign
 75 |     ; jnz first_target
 76 |     db 0x0f
 77 |     db 0x85
 78 |     dd first_target - $ - 4
 79 | 
 80 |     ; target aligned to 1<<targetalign
 81 |     %if branchtoggle != -1
 82 |     %rep (1<<targetalign)-1-(1<<branchtoggle)
 83 |         nop
 84 |     %endrep
 85 |     %else
 86 |     %rep (1<<targetalign)-1
 87 |         nop
 88 |     %endrep
 89 |     %endif
 90 | 
 91 |     %if targettoggle != -1
 92 |     %rep (1<<targettoggle)
 93 |         nop
 94 |     %endrep
 95 |     %endif
 96 | first_target:
 97 | 
 98 |     ; loop to shift phr
 99 |     SHIFT_PHR dummybranches
100 |     
101 |     ; test branch
102 |     align 64
103 |     and ebx, 1
104 |     jnz second_target
105 | second_target:
106 |     READ_PMC_END
107 |     ; next round until rdi reaches zero
108 |     align 64
109 |     dec rdi
110 |     jnz loop_begin
111 | 
112 | %endmacro


--------------------------------------------------------------------------------
/src/ghr_size_gen.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | // Emits ghr_size_N for N = 1..1024: size-2 always-taken branches, then one branch on the loop counter's low bit.
 3 | // ref:
 4 | // https://cseweb.ucsd.edu/~dstefan/pubs/yavarzadeh:2023:half.pdf
 5 | int main(int argc, char *argv[]) {
 6 |   FILE *fp = argc > 1 ? fopen(argv[1], "w") : NULL; // argv[1]: output path
 7 |   assert(fp);
 8 |   int min_size = 1;
 9 |   int max_size = 1024;
10 | 
11 |   // args: loop count, random array
12 |   fprintf(fp, ".text\n");
13 |   for (int size = min_size; size <= max_size; size++) {
14 |     // always taken or not taken for dummy branches
15 |     fprintf(fp, ".global ghr_size_%d\n", size);
16 |     fprintf(fp, ".balign 32\n");
17 |     fprintf(fp, "ghr_size_%d:\n", size);
18 | #ifdef HOST_AARCH64
19 |     fprintf(fp, "\t1:\n");
20 | 
21 |     // always taken branches ahead
22 |     for (int i = 0; i < size - 2; i++) {
23 |       fprintf(fp, "\tcbnz x1, 2f\n");
24 |       // alignment is required, otherwise too many branches in a cache line
25 |       fprintf(fp, "\t.balign 32\n");
26 |       fprintf(fp, "\t2:\n");
27 |     }
28 | 
29 |     // taken/not taken based on x0 & 1
30 |     fprintf(fp, "\tand x2, x0, #1\n");
31 |     fprintf(fp, "\tcbnz x2, 2f\n");
32 |     fprintf(fp, "\t2:\n");
33 | 
34 |     fprintf(fp, "\tsubs x0, x0, #1\n");
35 |     fprintf(fp, "\tbne 1b\n");
36 | 
37 |     // nothing was saved on this path, so return directly
38 |     fprintf(fp, "\tret\n");
39 | #elif defined(HOST_AMD64)
40 |     // save registers
41 |     fprintf(fp, "\tpush %%rbx\n");
42 |     fprintf(fp, "\tpush %%rcx\n");
43 | 
44 |     fprintf(fp, "\t1:\n");
45 | 
46 |     // always taken branches ahead
47 |     fprintf(fp, "\tmov $1, %%rcx\n");
48 |     fprintf(fp, "\ttest %%rcx, %%rcx\n");
49 |     for (int i = 0; i < size - 2; i++) {
50 |       fprintf(fp, "\tjnz 2f\n");
51 |       // alignment is required, otherwise too many branches in a cache line
52 |       fprintf(fp, "\t.balign 64\n");
53 |       fprintf(fp, "\t2:\n");
54 |       fprintf(fp, "\tnop\n");
55 |       fprintf(fp, "\t.balign 64\n");
56 |     }
57 | 
58 |     // taken/not taken based on rdi & 1
59 |     fprintf(fp, "\tmov %%rdi, %%rcx\n");
60 |     fprintf(fp, "\tand $1, %%rcx\n");
61 |     fprintf(fp, "\ttest %%rcx, %%rcx\n");
62 |     fprintf(fp, "\tjnz 2f\n");
63 |     fprintf(fp, "\t.balign 64\n");
64 |     fprintf(fp, "\t2:\n");
65 |     fprintf(fp, "\tnop\n");
66 |     fprintf(fp, "\t.balign 64\n");
67 | 
68 |     fprintf(fp, "\tdec %%rdi\n");
69 |     fprintf(fp, "\tjnz 1b\n");
70 | 
71 |     // restore regs
72 |     fprintf(fp, "\tpop %%rcx\n");
73 |     fprintf(fp, "\tpop %%rbx\n");
74 |     fprintf(fp, "\tret\n");
75 | #endif
76 |   }
77 |   // gadget table via the utils.h helpers (presumably an array of the entry points above)
78 |   define_gadgets_array(fp, "ghr_size_gadgets");
79 |   for (int size = min_size; size <= max_size; size++) {
80 |     add_gadget(fp, "ghr_size_%d", size);
81 |   }
82 |   return fclose(fp) == 0 ? 0 : 1; // a failed flush/close becomes a non-zero exit
83 | }
84 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/fused_branch.sh2:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                                                                        2021-03-15 AgF
 3 | 
 4 | # Compile and run PMCTest for various combinations of arithmetic instructions and branch instructions
 5 | # to test for instruction fusion
 6 | 
 7 | # (c) Copyright 2013 - 2021 by Agner Fog. GNU General Public License www.gnu.org/licenses
 8 | 
 9 | # Detect CPU specific variables
10 | . vars.sh
11 | 
12 | 
13 | echo -e "Test instruction fusion\n"  > results2/fused_branch.txt
14 | # Sweep 1 (case=0): every instr1/instr2 pair; optype is 1 for reg,reg and 2 for reg,imm
15 | let case=0
16 | 
17 | for instr1 in cmp test add sub and or xor
18 | do
19 | for instr2 in jz ja jb jg jl js jo jp
20 | do
21 | 
22 | let optype=0
23 | for xoptype in  reg,reg  reg,imm
24 | do
25 | let optype+=1
26 | 
27 | echo -e "\n\n$instr1 $xoptype / $instr2 \n"  >> results2/fused_branch.txt
28 | $ass -f elf64 -o b64.o -Dcase=$case -Doptype=$optype -Dtaken=0 -Dinstr1=$instr1 -Dinstr2=$instr2 -Dcounters=$BranchPMCs -Pfused_branch.inc TemplateB64.nasm
29 | if [ $? -ne 0 ] ; then exit ; fi
30 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
31 | if [ $? -ne 0 ] ; then exit ; fi
32 | ./x >> results2/fused_branch.txt
33 | 
34 | done
35 | done
36 | done
37 | # Sweep 2 (case=1): the same procedure with inc/dec/neg/not as instr1
38 | let case=1
39 | 
40 | for instr1 in inc dec neg not
41 | do
42 | for instr2 in jz ja jb jg jl js jo
43 | do
44 | 
45 | let optype=0
46 | for xoptype in  reg,reg  reg,imm
47 | do
48 | let optype+=1
49 | 
50 | echo -e "\n\n$instr1 $xoptype / $instr2 \n"  >> results2/fused_branch.txt
51 | $ass -f elf64 -o b64.o -Dcase=$case -Doptype=$optype -Dtaken=0 -Dinstr1=$instr1 -Dinstr2=$instr2 -Dcounters=$BranchPMCs -Pfused_branch.inc TemplateB64.nasm
52 | if [ $? -ne 0 ] ; then exit ; fi
53 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
54 | if [ $? -ne 0 ] ; then exit ; fi
55 | ./x >> results2/fused_branch.txt
56 | 
57 | done
58 | done
59 | done
60 | 
61 | # Sweep 3: case counts 13..19, one per name in the list below; taken is 0 for "no" and 1 for "yes"
62 | let optype=1
63 | xoptype="reg,reg"
64 | let case=12
65 | 
66 | for xcase in  ADC+JC/JNC  OR+JZ/JNZ  NOT+JZ/JNZ  SHR+JC/JNC  JECXZ  Boundary_before_jz  Boundary_in_jz
67 | do
68 | let case+=1
69 | let taken=-1
70 | for xtaken in no yes
71 | do
72 | let taken+=1
73 | 
74 | echo -e "\n\n$xcase $xoptype, taken: $xtaken\n"  >> results2/fused_branch.txt
75 | # $ass -f elf64 -o b64.o -l fusedb$case.lst -Dcase=$case -Doptype=$optype -Dtaken=$taken -Dcounters=$BranchPMCs -Pfused_branch.inc
76 | $ass -f elf64 -o b64.o -Dcase=$case -Doptype=$optype -Dtaken=$taken -Dcounters=$BranchPMCs -Pfused_branch.inc TemplateB64.nasm
77 | if [ $? -ne 0 ] ; then exit ; fi
78 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
79 | if [ $? -ne 0 ] ; then exit ; fi
80 | ./x >> results2/fused_branch.txt
81 | done
82 | done
83 | 
84 | echo -e "\n"  >> results2/fused_branch.txt
85 | 


--------------------------------------------------------------------------------
/src/elimination_lib.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | #include <assert.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <vector>
 6 | 
 7 | // generated in elimination_gen.cpp
 8 | // args: loop count
 9 | typedef char **(*gadget)(size_t);
10 | extern "C" {
11 | extern gadget elimination_gadgets[];
12 | }
13 | // Times each elimination gadget and writes one "pattern,min,avg,max" CSV row per pattern to fp.
14 | void elimination(FILE *fp) {
15 |   // labels[k] describes elimination_gadgets[k]
16 |   const char *labels[] = {
17 |       "int dependent add",
18 |       "int independent add",
19 |       "int dependent mov",
20 |       "int independent mov",
21 |       "int dependent zero via xor",
22 |       "int dependent zero via sub",
23 |       "int independent zero via mov",
24 |       "int independent one via mov",
25 |       "int independent two via mov",
26 |       "int independent 1024 via mov",
27 |       "vec dependent mov",
28 |       "vec independent mov",
29 |       "vec dependent zero via xor",
30 |       "vec dependent zero via sub",
31 |       "vec independent zero via mov",
32 |       "nop",
33 |   };
34 |   const int label_count = 16;
35 |   const int loop_count = 10000;
36 |   const int runs = 100;
37 |   const int warmup_runs = 10;
38 | 
39 |   bind_to_core();
40 |   setup_perf_instructions_per_cycle();
41 | 
42 |   fprintf(fp, "pattern,min,avg,max\n");
43 |   for (int k = 0; k < label_count; ++k) {
44 |     std::vector<double> samples;
45 |     samples.reserve(runs);
46 |     double total = 0;
47 | 
48 |     for (int run = 0; run < runs; ++run) {
49 |       perf_begin_instructions_per_cycle();
50 |       elimination_gadgets[k](loop_count);
51 |       counter_per_cycle measured = perf_end_instructions_per_cycle();
52 | 
53 |       // the first warmup_runs measurements are discarded
54 |       if (run < warmup_runs) {
55 |         continue;
56 |       }
57 |       double ratio = (double)measured.counter / measured.cycles;
58 |       samples.push_back(ratio);
59 |       total += ratio;
60 |     }
61 | 
62 |     // scan for the extremes; samples[0] seeds both
63 |     double lowest = samples[0];
64 |     double highest = samples[0];
65 |     for (size_t s = 1; s < samples.size(); ++s) {
66 |       double v = samples[s];
67 |       lowest = v < lowest ? v : lowest;
68 |       highest = v > highest ? v : highest;
69 |     }
70 | 
71 |     fprintf(fp, "%s,%.2lf,%.2lf,%.2lf\n", labels[k], lowest,
72 |             total / samples.size(), highest);
73 |     fflush(fp);
74 |   }
75 | }
76 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/warmup_fp.inc:
--------------------------------------------------------------------------------
 1 | ;----------------------------------------------------------------------------
 2 | ;                warmup_fp.inc                2015-12-21 Agner Fog
 3 | ;
 4 | ; PMC Test program for testing warm up effect of floating point unit
 5 | ;
 6 | ; Constants to be defined:
 7 | ; 
 8 | ; tcase:   1: integer multiplication
 9 | ;          2: x87 floating point multiplication
10 | ;          3: xmm floating point scalar multiplication
11 | ;          4: xmm 128-bit floating point vector multiplication
12 | ;          5: ymm 256-bit floating point vector multiplication latency
13 | ;          6: ymm 256-bit floating point vector multiplication throughput
14 | ; (c) Copyright 2013 - 2015 by Agner Fog. GNU General Public License www.gnu.org/licenses
15 | ;-----------------------------------------------------------------------------
16 | ; Define any undefined macros
17 | 
18 | %ifndef tcase
19 |    %define tcase 1
20 | %endif
21 | 
22 | %define WARMUPCOUNT 0    ; don't use warm up in TemplateB64.nasm
23 | 
24 | ; Let f.p. unit cool down by using integer unit
25 | ; (This is actually the same as in TemplateB64.nasm, but that may change:)
26 | %macro testinit1 0
27 | %if tcase == 2   ; use x87
28 | 		fld1
29 | 		fld1
30 | %endif
31 | ; with primingdelay defined: primingdelay/20 passes of 20 dependent integer adds
32 | %ifdef primingdelay
33 |         vxorps ymm0,ymm0,ymm0
34 |         mov ecx, primingdelay/20
35 |         mov eax, 1
36 |         align 16
37 | Wuloop1:
38 |         %rep 20
39 |         add eax,eax
40 |         %endrep
41 |         dec ecx
42 |         jnz Wuloop1
43 | %endif
44 | %endmacro
45 | 
46 | ; define counts in warmup_fp.sh2
47 | ; %define repeat0 20
48 | ; %define repeat1 10
49 | ; %define repeat2 10
50 | 
51 | 
52 | ; Define test cases
53 | 
54 | %if tcase == 1   ; integer multiplication
55 | 
56 |    %macro testcode 0
57 |       imul rax, rbx
58 |    %endmacro
59 | 
60 | %elif tcase == 2   ; x87 floating point multiplication
61 | 
62 |    %macro testcode 0
63 |       ;fmul st(1),st(0)
64 | 	  fmul st1,st0
65 |    %endmacro
66 | 
67 | %elif tcase == 3   ; xmm floating point scalar multiplication
68 | 
69 |    %macro testcode 0
70 |       mulsd xmm1,xmm2
71 |    %endmacro
72 | 
73 | %elif tcase == 4   ; xmm 128-bit floating point vector multiplication
74 | 
75 |    %macro testcode 0
76 |       mulpd xmm1,xmm2
77 |    %endmacro
78 | 
79 | %elif tcase == 5   ; ymm 256-bit floating point vector multiplication latency
80 |    ; ymm1 is both destination and source, so repeated copies depend on each other
81 |    %macro testcode 0
82 |       vmulpd ymm1,ymm1,ymm2
83 |    %endmacro
84 | 
85 | %elif tcase == 6   ; ymm 256-bit floating point vector multiplication throughput
86 |    ; neither destination is also a source, so repeated copies are independent
87 |    %macro testcode 0
88 |       vmulpd ymm1,ymm2,ymm2
89 |       vmulpd ymm3,ymm4,ymm4
90 |    %endmacro
91 | 
92 | %else
93 |    %error unknown test case tcase
94 | %endif
95 | 


--------------------------------------------------------------------------------
/src/pht_tag_bits_xor_phr.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | #include <assert.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <vector>
 6 | 
 7 | // generated in pht_tag_bits_xor_phr_gen.cpp
 8 | // args: loop count, buffer
 9 | typedef void (*gadget)(size_t, uint32_t *);
10 | extern "C" {
11 | extern gadget pht_tag_bits_xor_phr_gadgets[];
12 | }
13 | // Records scaled conditional-branch miss rates per (target, first_phr_bit, dummy_branches) into a CSV.
14 | int main(int argc, char *argv[]) {
15 |   int loop_count = 5000;
16 |   // match gen_pht_tag_bits_xor_phr_test
17 |   int min_first_phr_bit = 0;
18 |   int max_first_phr_bit = 35;
19 |   int min_dummy_branches = 0;
20 |   int max_dummy_branches = PHR_BRANCHES;
21 | 
22 |   bind_to_core();
23 |   setup_perf_cond_branch_misses();
24 |   FILE *fp = fopen("pht_tag_bits_xor_phr.csv", "w");
25 |   assert(fp);
26 | 
27 |   uint32_t *buffer = new uint32_t[loop_count + 1];
28 |   for (int i = 0; i <= loop_count; i++) {
29 |     buffer[i] = rand() % 4;
30 |   }
31 | 
32 |   fprintf(fp, "target,first_phr_bit,dummy_branches,min,avg,max\n");
33 |   int gadget_index = 0;
34 |   for (int inject_target = 0; inject_target <= 1; inject_target++) {
35 |     for (int first_phr_bit = min_first_phr_bit;
36 |          first_phr_bit <= max_first_phr_bit; first_phr_bit++) {
37 |       for (int dummy_branches = min_dummy_branches;
38 |            dummy_branches <= max_dummy_branches; dummy_branches++) {
39 |         std::vector<double> history;
40 |         int iterations = 100;
41 |         history.reserve(iterations);
42 | 
43 |         double sum = 0;
44 |         // run several times
45 |         for (int i = 0; i < iterations; i++) {
46 |           uint64_t begin = perf_read_cond_branch_misses();
47 |           pht_tag_bits_xor_phr_gadgets[gadget_index](loop_count, buffer);
48 |           uint64_t elapsed = perf_read_cond_branch_misses() - begin;
49 | 
50 |           // skip warmup
51 |           if (i >= 10) {
52 |             // misses per iteration scaled by 4; NOTE(review): this said "1/8 branches" - confirm factor
53 |             double time = (double)elapsed / loop_count * 4;
54 |             history.push_back(time);
55 |             sum += time;
56 |           }
57 |         }
58 |         gadget_index++;
59 | 
60 |         double min = history[0];
61 |         double max = history[0];
62 |         for (size_t i = 0; i < history.size(); i++) {
63 |           if (min > history[i]) {
64 |             min = history[i];
65 |           }
66 |           if (max < history[i]) {
67 |             max = history[i];
68 |           }
69 |         }
70 |         fprintf(fp, "%d,%d,%d,%.2lf,%.2lf,%.2lf\n", inject_target,
71 |                 first_phr_bit, dummy_branches, min, sum / history.size(), max);
72 |         fflush(fp);
73 |       }
74 |     }
75 |   }
76 |   fclose(fp); // release the CSV before announcing it
77 |   printf("Results are written to pht_tag_bits_xor_phr.csv\n");
78 |   delete[] buffer;
79 |   return 0;
80 | }
81 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/ucache_misprediction.inc:
--------------------------------------------------------------------------------
  1 | ;----------------------------------------------------------------------------
  2 | ;                       ucache_misprediction.inc        2013-07-21 Agner Fog
  3 | ;
  4 | ;            PMC Test program for testing branch prediction
  5 | ;                           NASM syntax
  6 | ;
  7 | ; The following macros can be defined on the command line or in include files:
  8 | ; 
  9 | ; tcase:    Test case number. See below for each case
 10 | ;           1. Tiny loop. Expect loop counter to be used
 11 | ;           2. Normal loop. Expect uop cache to be used
 12 | ;           3. Extremely big loop. Expect only fetch and decode to be used
 13 | ; 
 14 | ; count1:   Loop count for outer loop
 15 | ; 
 16 | ; count2:   Loop count for inner loop
 17 | ;
 18 | ;
 19 | ; (c) Copyright 2013 by Agner Fog. GNU General Public License www.gnu.org/licenses
 20 | ;-----------------------------------------------------------------------------
 21 | ; tcase falls back to 1 when the including script does not define it
 22 | %ifndef tcase
 23 |    %define tcase 1          ; default case 1
 24 | %endif
 25 | 
 26 | %ifndef count1
 27 |    %define count1   10      ; default count1
 28 | %endif
 29 | 
 30 | %ifndef count2
 31 |    %define count2   16      ; default count2
 32 | %endif
 33 | 
 34 | 
 35 | ;##############################################################################
 36 | ;#
 37 | ;#                 Test code macros
 38 | ;#
 39 | ;##############################################################################
 40 | 
 41 | ; define long nops
 42 | %ifndef noptype
 43 |    %define noptype 2
 44 | %endif
 45 | 
 46 | %include "nops.inc"
 47 | 
 48 | %if tcase == 1   ; Tiny loop. Expect loop counter to be used
 49 | ; count1 passes of a loop whose jz skips one nop8 when bit 2 of r14b is clear
 50 | %macro testcode 0
 51 | nop
 52 | nop
 53 | mov ebp, count1
 54 | align 16
 55 | LL:
 56 |     test r14b,4
 57 |     jz L2
 58 |     nop8
 59 |     L2:
 60 |     nop8
 61 | dec ebp
 62 | jnz LL
 63 | %endmacro
 64 | 
 65 | %elif tcase == 2 ; Normal loop. Expect uop cache to be used
 66 | ; a test/jnz/nop8/nop8 group repeated count1 times (unrolled, no loop)
 67 | %macro testcode 0
 68 | %rep count1
 69 |     test r14b,4
 70 |     jnz $+10
 71 |     nop8
 72 |     nop8
 73 | %endrep
 74 | %endmacro
 75 | 
 76 | %elif tcase == 3 ; Extremely big loop. Expect only fetch and decode to be used
 77 | ; testinitc emits 100000 nops; testcode is the same unrolled group as in case 2
 78 | %macro testinitc 0
 79 | %rep 100000    ; lots of code before counters are read, to prevent uop caching
 80 | nop
 81 | %endrep
 82 | %endmacro
 83 | 
 84 | %macro testcode 0
 85 | %rep count1
 86 |     test r14b,4
 87 |     jnz $+10
 88 |     nop8
 89 |     nop8
 90 | %endrep
 91 | %endmacro
 92 | 
 93 | %else
 94 | %error unknown test case tcase
 95 | %endif
 96 | 
 97 | ; disable default test loops
 98 | %define repeat1 1
 99 | %define repeat2 1
100 | 
101 | 


--------------------------------------------------------------------------------
/src/pht_tag_bits_xor.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | #include <assert.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <vector>
 6 | 
 7 | // generated in pht_tag_bits_xor_gen.cpp
 8 | // args: loop count, buffer
 9 | typedef void (*gadget)(size_t, uint32_t *);
10 | extern "C" {
11 | extern gadget pht_tag_bits_xor_gadgets[];
12 | }
13 | 
14 | // Sweeps every (inject_target, branch_align, dummy_branches) combination, runs
15 | // the matching generated gadget 100 times and writes the min/avg/max scaled
16 | // conditional-branch miss count of the post-warmup runs to
17 | // pht_tag_bits_xor.csv. Returns 0 on success, 1 if the file cannot be
18 | // opened or closed.
19 | int main(int argc, char *argv[]) {
20 |   const int loop_count = 5000;
21 |   const int iterations = 100; // runs per gadget
22 |   const int warmup = 10;      // leading runs excluded from the statistics
23 |   // match gen_pht_tag_bits_xor_test
24 |   const int min_branch_align = 3;
25 | #ifdef __APPLE__
26 |   const int max_branch_align = 13;
27 | #else
28 |   const int max_branch_align = 18;
29 | #endif
30 |   const int min_dummy_branches = 0;
31 |   const int max_dummy_branches = PHR_BRANCHES + 5;
32 | 
33 |   bind_to_core();
34 |   setup_perf_cond_branch_misses();
35 |   FILE *fp = fopen("pht_tag_bits_xor.csv", "w");
36 |   if (!fp) {
37 |     // explicit check: assert() is compiled out under NDEBUG
38 |     perror("pht_tag_bits_xor.csv");
39 |     return 1;
40 |   }
41 | 
42 |   // random 0/1 values handed to every gadget
43 |   uint32_t *buffer = new uint32_t[loop_count + 1];
44 |   for (int i = 0; i <= loop_count; i++) {
45 |     buffer[i] = rand() % 2;
46 |   }
47 | 
48 |   fprintf(fp, "target,align,dummy_branches,min,avg,max\n");
49 |   // gadgets are consumed in the same order as the three nested loops
50 |   int gadget_index = 0;
51 |   for (int inject_target = 0; inject_target <= 1; inject_target++) {
52 |     for (int branch_align = min_branch_align; branch_align <= max_branch_align;
53 |          branch_align++) {
54 |       for (int dummy_branches = min_dummy_branches;
55 |            dummy_branches <= max_dummy_branches; dummy_branches++) {
56 |         double min = 0, max = 0, sum = 0;
57 |         int samples = 0;
58 |         for (int i = 0; i < iterations; i++) {
59 |           uint64_t begin = perf_read_cond_branch_misses();
60 |           pht_tag_bits_xor_gadgets[gadget_index](loop_count, buffer);
61 |           uint64_t elapsed = perf_read_cond_branch_misses() - begin;
62 | 
63 |           // skip warmup
64 |           if (i < warmup) {
65 |             continue;
66 |           }
67 |           // Scale factor kept from the original code, which was annotated
68 |           // "1/8 branches". NOTE(review): that remark and the factor 4 do not
69 |           // obviously agree - confirm against the gadget generator.
70 |           double misses = (double)elapsed / loop_count * 4;
71 |           if (samples == 0 || misses < min) {
72 |             min = misses;
73 |           }
74 |           if (samples == 0 || misses > max) {
75 |             max = misses;
76 |           }
77 |           sum += misses;
78 |           samples++;
79 |         }
80 |         gadget_index++;
81 | 
82 |         fprintf(fp, "%d,%d,%d,%.2lf,%.2lf,%.2lf\n", inject_target, branch_align,
83 |                 dummy_branches, min, sum / samples, max);
84 |         fflush(fp);
85 |       }
86 |     }
87 |   }
88 | 
89 |   delete[] buffer;
90 |   // close explicitly so a failed final write is reported instead of ignored
91 |   if (fclose(fp) != 0) {
92 |     perror("pht_tag_bits_xor.csv");
93 |     return 1;
94 |   }
95 |   printf("Results are written to pht_tag_bits_xor.csv\n");
96 |   return 0;
97 | }
98 | 


--------------------------------------------------------------------------------
/include/counters_mapping.h:
--------------------------------------------------------------------------------
 1 | // No-op fallbacks for the DEFINE_* macros when the includer has not defined them (makes clangd happy)
 2 | #ifndef DEFINE_COUNTER
 3 | #define DEFINE_COUNTER(...)
 4 | #endif
 5 | 
 6 | #ifndef DEFINE_COUNTER_RANGE
 7 | #define DEFINE_COUNTER_RANGE(...)
 8 | #endif
 9 | 
10 | #ifndef DEFINE_COUNTER_SUBTRACT
11 | #define DEFINE_COUNTER_SUBTRACT(...)
12 | #endif
13 | 
14 | #ifndef DEFINE_COMPUTED_COUNTER_RANGE
15 | #define DEFINE_COMPUTED_COUNTER_RANGE(...)
16 | #endif
17 | 
18 | #ifdef __APPLE__
19 | // macOS/iOS: two-argument entries (counter name, event identifier)
20 | DEFINE_COUNTER(cycles, FIXED_CYCLES)
21 | DEFINE_COUNTER(instructions, FIXED_INSTRUCTIONS)
22 | DEFINE_COUNTER(branch_misses, BRANCH_MISPRED_NONSPEC)
23 | DEFINE_COUNTER(cond_branch_misses, BRANCH_COND_MISPRED_NONSPEC)
24 | 
25 | #else
26 | // Linux: four-argument entries; NOTE(review): apparently (counter name, core tag, perf type, event code) - confirm against the includer
27 | // select pmu based on icestorm/firestorm
28 | // 0xb: firestorm pmu
29 | #define PERF_TYPE_FIRESTORM 0xb
30 | // 0xa: icestorm pmu
31 | #define PERF_TYPE_ICESTORM 0xa
32 | 
33 | // 0xa: gracemont pmu; NOTE(review): same value as PERF_TYPE_ICESTORM and the only one with an L suffix - confirm intended
34 | #define PERF_TYPE_GRACEMONT 0xaL
35 | 
36 | // firestorm/icestorm
37 | // 0x02: CORE_ACTIVE_CYCLE from
38 | // https://github.com/jiegec/apple-pmu/blob/master/a14.md
39 | DEFINE_COUNTER(cycles, firestorm, PERF_TYPE_FIRESTORM, 0x02)
40 | // 0x8c: INST_ALL from
41 | // https://github.com/jiegec/apple-pmu/blob/master/a14.md
42 | DEFINE_COUNTER(instructions, firestorm, PERF_TYPE_FIRESTORM, 0x8c)
43 | // 0xcb: BRANCH_MISPRED_NONSPEC from
44 | // https://github.com/jiegec/apple-pmu/blob/master/a14.md
45 | DEFINE_COUNTER(branch_misses, firestorm, PERF_TYPE_FIRESTORM, 0xcb)
46 | // 0xc5: BRANCH_COND_MISPRED_NONSPEC from
47 | // https://github.com/jiegec/apple-pmu/blob/master/a14.md
48 | DEFINE_COUNTER(cond_branch_misses, firestorm, PERF_TYPE_FIRESTORM, 0xc5)
49 | 
50 | // arm64 general
51 | // ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED in
52 | // linux/include/linux/perf/arm_pmuv3.h PERF_COUNT_HW_BRANCH_MISSES was mapped
53 | // to ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, which counts speculative mis-predictions,
54 | // we want retired mis-predictions
55 | DEFINE_COUNTER_RANGE(branch_misses, arm64, PERF_TYPE_RAW, 0x22)
56 | 
57 | // qualcomm oryon
58 | // discovered via find_branch_misses_pmu tool
59 | DEFINE_COUNTER(cond_branch_misses, oryon, PERF_TYPE_RAW, 0x400)
60 | 
61 | // fallback counters
62 | 
63 | // cycles
64 | DEFINE_COUNTER_RANGE(cycles, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES)
65 | 
66 | // instructions retired
67 | DEFINE_COUNTER_RANGE(instructions, all, PERF_TYPE_HARDWARE,
68 |                      PERF_COUNT_HW_INSTRUCTIONS)
69 | 
70 | // branch mispredictions
71 | DEFINE_COUNTER_RANGE(branch_misses, all, PERF_TYPE_HARDWARE,
72 |                      PERF_COUNT_HW_BRANCH_MISSES)
73 | 
74 | // derived counter: instructions per cycle, listed with the instructions and cycles counters
75 | DEFINE_COMPUTED_COUNTER_RANGE(instructions_per_cycle, counter_per_cycle, all,
76 |                               compute_counter_per_cycle, instructions, cycles)
77 | 
78 | #endif
79 | 


--------------------------------------------------------------------------------
/src/phr_size_lib.cpp:
--------------------------------------------------------------------------------
  1 | #include "include/utils.h"
  2 | #include <assert.h>
  3 | #include <stdio.h>
  4 | #include <stdlib.h>
  5 | #include <vector>
  6 | 
  7 | // use with generate_gadget tool
  8 | 
  9 | // defined in gen_phr_test()
 10 | // args: loop count, buffer
 11 | typedef void (*gadget)(size_t, uint32_t *);
 12 | extern "C" {
 13 | extern gadget phr_size_gadgets[];
 14 | }
 15 | 
 16 | // Runs each PHR-size gadget (sizes 1..256) 100 times on a freshly randomized
 17 | // buffer and writes one "size,min,avg,max" CSV row per size to fp, computed
 18 | // from the runs that follow the 10 warmup runs.
 19 | void phr_size(FILE *fp) {
 20 |   const int loop_count = 1000;
 21 |   const int runs = 100;
 22 |   // match gen_phr_test
 23 |   const int min_size = 1;
 24 |   const int max_size = 256;
 25 | 
 26 |   bind_to_core();
 27 | #ifdef IOS
 28 |   // no pmu
 29 | #elif defined(NO_COND_BRANCH_MISSES)
 30 |   // fallback
 31 |   setup_perf_branch_misses();
 32 | #else
 33 |   setup_perf_cond_branch_misses();
 34 | #endif
 35 |   assert(fp);
 36 | 
 37 |   uint32_t *buffer = new uint32_t[loop_count + 1];
 38 | 
 39 |   fprintf(fp, "size,min,avg,max\n");
 40 |   // one gadget per size, in increasing size order
 41 |   for (int size = min_size, gadget_index = 0; size <= max_size;
 42 |        size++, gadget_index++) {
 43 |     std::vector<double> samples;
 44 |     samples.reserve(runs);
 45 |     double total = 0;
 46 | 
 47 |     for (int run = 0; run < runs; run++) {
 48 |       // refill the buffer with random 0/1 values before every run
 49 |       for (int k = 0; k <= loop_count; k++) {
 50 |         buffer[k] = rand() % 2;
 51 |       }
 52 |       // ensures that ldr w11, [x0, w11, uxtw #2] does not change w11
 53 |       buffer[0] = 0;
 54 |       buffer[1] = 1;
 55 | 
 56 |       // the measured quantity is chosen at compile time
 57 | #ifdef IOS
 58 |       uint64_t begin = get_time();
 59 | #elif defined(NO_COND_BRANCH_MISSES)
 60 |       uint64_t begin = perf_read_branch_misses();
 61 | #else
 62 |       uint64_t begin = perf_read_cond_branch_misses();
 63 | #endif
 64 | 
 65 |       phr_size_gadgets[gadget_index](loop_count, buffer);
 66 | 
 67 | #ifdef IOS
 68 |       uint64_t elapsed = get_time() - begin;
 69 | #elif defined(NO_COND_BRANCH_MISSES)
 70 |       uint64_t elapsed = perf_read_branch_misses() - begin;
 71 | #else
 72 |       uint64_t elapsed = perf_read_cond_branch_misses() - begin;
 73 | #endif
 74 | 
 75 |       // warmup runs are executed but not recorded
 76 |       if (run < 10) {
 77 |         continue;
 78 |       }
 79 |       double per_iteration = (double)elapsed / loop_count;
 80 |       samples.push_back(per_iteration);
 81 |       total += per_iteration;
 82 |     }
 83 | 
 84 |     double min = samples[0];
 85 |     double max = samples[0];
 86 |     for (double value : samples) {
 87 |       if (value < min) {
 88 |         min = value;
 89 |       }
 90 |       if (value > max) {
 91 |         max = value;
 92 |       }
 93 |     }
 94 |     fprintf(fp, "%d,%.2lf,%.2lf,%.2lf\n", size, min, total / samples.size(),
 95 |             max);
 96 |     fflush(fp);
 97 |   }
 98 |   delete[] buffer;
 99 | }
100 | 


--------------------------------------------------------------------------------
/src/btb_size_basic_lib.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/utils.h"
 2 | #include <assert.h>
 3 | #include <cstdint>
 4 | #include <set>
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <vector>
 8 | 
 9 | // generated in btb_size_basic_gen.cpp
10 | // args: loop count
11 | typedef void (*gadget)(size_t);
12 | extern "C" {
13 | extern gadget btb_size_basic_gadgets[];
14 | }
15 | 
16 | // For every pattern, stride and branch count used by the generator, runs the
17 | // matching gadget 30 times and writes one "pattern,size,stride,min,avg,max"
18 | // CSV row to fp with the cycles per branch of the post-warmup runs.
19 | // fp must be an open, writable stream.
20 | void btb_size_basic(FILE *fp) {
21 |   assert(fp);
22 |   const int loop_count = 100;
23 |   const int iterations = 30; // runs per gadget
24 |   const int warmup = 10;     // leading runs excluded from the statistics
25 |   // match gen_btb_test
26 |   const int num_patterns = 3;
27 |   const uint64_t min_size = 2;
28 |   const uint64_t max_size = 65536;
29 |   const uint64_t max_product = 32768; // upper bound for size * stride
30 |   const uint64_t min_stride = 4;
31 |   const uint64_t max_stride = 8192;
32 |   const std::vector<uint64_t> mults = {1, 3, 5, 7};
33 | 
34 |   bind_to_core();
35 |   setup_perf_cycles();
36 |   fprintf(fp, "pattern,size,stride,min,avg,max\n");
37 |   // gadgets are consumed in the same order as the loops below
38 |   int gadget_index = 0;
39 |   for (int pattern = 0; pattern < num_patterns; pattern++) {
40 |     for (uint64_t stride = min_stride; stride <= max_stride; stride *= 2) {
41 |       // sizes are power-of-two bases times each multiplier, plus/minus one;
42 |       // the set removes duplicates and yields them in increasing order
43 |       std::set<uint64_t> sizes;
44 |       for (uint64_t size_base = min_size; size_base <= max_product / stride;
45 |            size_base *= 2) {
46 |         for (uint64_t mult : mults) {
47 |           for (uint64_t size = size_base * mult - 1;
48 |                size <= size_base * mult + 1 && size * stride <= max_product &&
49 |                size <= max_size;
50 |                size++) {
51 |             sizes.insert(size);
52 |           }
53 |         }
54 |       }
55 | 
56 |       for (uint64_t size : sizes) {
57 |         gadget entry = btb_size_basic_gadgets[gadget_index++];
58 | 
59 |         std::vector<double> history;
60 |         history.reserve(iterations);
61 |         double sum = 0;
62 |         for (int i = 0; i < iterations; i++) {
63 |           uint64_t begin = perf_read_cycles();
64 |           entry(loop_count);
65 |           uint64_t elapsed = perf_read_cycles() - begin;
66 | 
67 |           // skip warmup
68 |           if (i >= warmup) {
69 |             double time = (double)elapsed / loop_count / size;
70 |             history.push_back(time);
71 |             sum += time;
72 |           }
73 |         }
74 | 
75 |         double min = history[0];
76 |         double max = history[0];
77 |         for (double value : history) {
78 |           if (value < min) {
79 |             min = value;
80 |           }
81 |           if (value > max) {
82 |             max = value;
83 |           }
84 |         }
85 |         // uint64_t is not "long" on every platform: cast to a type with a
86 |         // portable printf conversion instead of passing it to %ld
87 |         fprintf(fp, "%d,%llu,%llu,%.2lf,%.2lf,%.2lf\n", pattern,
88 |                 (unsigned long long)size, (unsigned long long)stride, min,
89 |                 sum / history.size(), max);
90 |         fflush(fp);
91 |       }
92 |     }
93 |   }
94 | }
95 | 

--------------------------------------------------------------------------------
/agner/testp/TestScripts/length_chg_prefix.sh2:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                length_chg_prefix.sh2             2021-01-21 Agner Fog
 3 | #
 4 | # PMC Test program for testing instructions with length-changing prefixes
 5 | #
 6 | # (c) 2013-2021 GNU General Public License www.gnu.org/licenses
 7 | #
 8 | # Parameters:
 9 | #
10 | # tcase:   1: mov register,constant
11 | #          2: add register,constant
12 | #          3: test register,constant
13 | #          4: neg or not register (bogus length-changing prefix)
14 | #          5: lea with address size prefix (must run in 32-bit mode)
15 | #
16 | # tmode:   1: instructions with length-changing prefix, aligned by 16
17 | #          2: instructions with length-changing prefix, crossing 16-bytes boundary
18 | #          3: similar instructions with non-length-changing prefix, aligned by 16
19 | 
20 | . vars.sh
21 | 
22 | nthreads=1
23 | resultfile=results2/length_chg_prefix.txt
24 | 
25 | # Build 32-bit binaries when supported (tcase 5 must run in 32-bit mode), otherwise 64-bit
26 | if [[ $support32bit == 1 ]] ; then bits=32 ; else bits=64 ; fi
27 | 
28 | echo -e "Test length-changing prefixes"  > $resultfile
29 | 
30 | for tcase in {1..5}
31 | do
32 | 
33 | case $tcase in
34 |   1) title="mov register,constant" ;;
35 |   2) title="add register,constant" ;;
36 |   3) title="test register,constant" ;;
37 |   4) title="neg or not register (bogus length-changing prefix)" ;;
38 |   5) title="lea with address size prefix" ;;
39 | esac
40 | echo -e "\n\nCase $tcase: $title\n" >> $resultfile
41 | 
42 | for tmode in {1..3}
43 | do
44 | case $tmode in
45 |   1) heading="A. Instructions with length-changing prefix, aligned by 16" ;;
46 |   2) heading="B. Instructions with length-changing prefix, crossing 16-bytes boundary" ;;
47 |   3) heading="C. Similar instructions with non-length-changing prefix" ;;
48 | esac
49 | echo -e "\n$heading" >> $resultfile
50 | 
51 | # Stop with a non-zero status when assembling or linking fails
52 | # (a bare "exit" placed after a "[ ... ]" test would report status 0).
53 | $ass -f elf${bits} -o b${bits}.o -l b${bits}.lst -Dnthreads=$nthreads -Dtcase=$tcase -Dtmode=$tmode -Plength_chg_prefix.inc TemplateB${bits}.nasm || exit 1
54 | g++ -fno-pie -no-pie -m${bits} a${bits}.o b${bits}.o -ox -lpthread || exit 1
55 | ./x >> $resultfile
56 | 
57 | done
58 | done
59 | 
60 | echo -e "\n"  >> $resultfile
61 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/32bitinstr.inc:
--------------------------------------------------------------------------------
  1 | ; 32bitinstr.inc
  2 | ; Define test code for instructions in 32-bit mode
  3 | ; (c) 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
  4 | 
  5 | ; instruction-specific test codes, selected by a case-insensitive match on 'instruct'
  6 | 
  7 | %ifidni instruct, aam_latency
  8 |    %macro testcode 0       ; aam alone, without the dependency break used in the throughput variant
  9 |       aam
 10 |    %endmacro
 11 | 
 12 | %elifidni instruct, aam_throughput
 13 |    %macro testcode 0
 14 |       xor eax,eax   ; break dependency
 15 |       aam
 16 |    %endmacro
 17 | 
 18 | %elifidni instruct, bound
 19 |    %macro testinit2 0
 20 |       mov [esi],esi        ; setup bounds to avoid interrupt
 21 |       lea eax, [esi+100]   ; upper bound = esi+100
 22 |       mov [esi+4],eax      ; stored in the second dword of the bounds pair
 23 |    %endmacro
 24 |    %macro testcode 0
 25 |       bound esi, [esi]
 26 |    %endmacro
 27 | 
 28 | %elifidni instruct, into   ; only testinit2 is defined here; NOTE(review): testcode presumably comes from the template - confirm
 29 |    %macro testinit2 0
 30 |       xor eax,eax   ; clear overflow flag
 31 |    %endmacro
 32 | 
 33 | %elifidni instruct, lahf_sahf
 34 |    %macro testcode 0
 35 |       lahf            ; test combined latency
 36 |       sahf
 37 |    %endmacro
 38 | 
 39 | %elifidni instruct, leave
 40 |    %macro testcode 0
 41 |       mov esi, 100 ; can't use ebp here
 42 |       align 16
 43 |       repeat11loop:
 44 |       mov edi,esp          ; prepare stack frame
 45 |       push 0
 46 |       mov ebp,esp
 47 |       mov [ebp],ebp        ; the slot at [ebp] holds ebp itself, so each leave reloads the same ebp
 48 |       %rep 100             ; 100 leave instructions per loop iteration
 49 |          leave
 50 |       %endrep
 51 |       mov esp,edi          ; restore stack
 52 |       dec esi
 53 |       jnz repeat11loop     ; loop
 54 |    %endmacro
 55 |    %define repeat1 0       ; disable default loops
 56 |    %define repeat2 1
 57 | 
 58 | %elifidni instruct, pushad
 59 |    %macro testcode 0
 60 |       mov edi,esp          ; remember stack pointer
 61 |       %rep 100             ; 100 pushad instructions in a row
 62 |          pushad
 63 |       %endrep
 64 |       mov esp,edi          ; restore stack
 65 |    %endmacro
 66 |    %define repeat2 1
 67 | 
 68 | %elifidni instruct, popad
 69 |    %macro testcode 0
 70 |       movd xmm0, esp       ; save esp in xmm0
 71 |       movd xmm1, ebp       ; save ebp in xmm1 (popad overwrites it)
 72 |       sub esp, 3200        ; prepare stack (100 popad x 32 bytes)
 73 |       %rep 100
 74 |          popad
 75 |       %endrep
 76 |       movd esp, xmm0       ; restore stack
 77 |       movd ebp, xmm1       ; restore loop pointer
 78 |    %endmacro
 79 |    %define repeat2 1
 80 | 
 81 | %elifidni instruct, salc_inc_al
 82 |    %macro testcode 0
 83 |       salc                 ; combined latency
 84 |       inc al
 85 |    %endmacro
 86 | 
 87 | %elifidni instruct, enter
 88 |    %macro testcode 0
 89 |       mov esi, 100          ; can't use ebp here
 90 |       align 16
 91 |       repeat11loop:
 92 |       mov edi, esp          ; remember stack pointer
 93 |       lea ebp, [UserData+1000h]   ; dummy frame
 94 |       %REP 100
 95 |       enter 4, immvalue     ; immvalue (second operand of enter) is defined outside this file
 96 |       %ENDREP
 97 |       mov esp,edi                 ; restore stack pointer
 98 |       dec esi
 99 |       jnz repeat11loop            ; loop
100 |    %endmacro
101 |    %define repeat1 0       ; disable default loops
102 |    %define repeat2 1
103 | 
104 | 
105 | %else                      ; no instruction-specific code for any other value of 'instruct'
106 | 
107 | 
108 | %endif
109 | 
110 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/my_branch3.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from matplotlib import pyplot as plt
 3 | import numpy as np
 4 | 
 5 | # Reproduce Table 2 of Half&Half
 6 | # Test branch toggles
 7 | x_data = range(0, 16)
 8 | y_data = range(185, 195)
 9 | z_data = []
10 | for branch_toggle in x_data:
11 |     temp = []
12 |     for dummy_branches in y_data:
13 |         output = subprocess.check_output(
14 |             [
15 |                 "./my_branch3.sh2",
16 |                 "16",
17 |                 "16",
18 |                 str(branch_toggle),
19 |                 "-1",
20 |                 str(dummy_branches),
21 |             ],
22 |             encoding="utf-8",
23 |         )
24 |         heading = False
25 |         data = []
26 |         for line in output.splitlines():
27 |             parts = list(filter(lambda s: len(s) > 0, line.strip().split(" ")))
28 |             if len(parts) > 0:
29 |                 if not heading:
30 |                     assert parts[5] == "BrMisCond"
31 |                     heading = True
32 |                 else:
33 |                     data.append(int(parts[4]))
34 |         # skip misprediction from dummy branches
35 |         avg = (np.average(np.array(data)) - 1000) / 2000  # 2 branches, 1000 loops
36 |         print(branch_toggle, dummy_branches, f"{avg:.2f}")
37 |         temp.append(avg)
38 |     z_data.append(temp)
39 | 
40 | plt.imshow(z_data)
41 | plt.xlabel("Dummy branches")
42 | plt.xticks(range(len(y_data)), y_data, rotation=90)
43 | plt.ylabel("Branch toggle bit")
44 | plt.yticks(x_data)
45 | plt.savefig("my_branch3_1.png")
46 | plt.cla()
47 | 
48 | # Test target toggles
49 | x_data = range(0, 6)
50 | y_data = range(185, 195)
51 | z_data = []
52 | for target_toggle in x_data:
53 |     temp = []
54 |     for dummy_branches in y_data:
55 |         output = subprocess.check_output(
56 |             [
57 |                 "./my_branch3.sh2",
58 |                 "16",
59 |                 "16",
60 |                 "-1",
61 |                 str(target_toggle),
62 |                 str(dummy_branches),
63 |             ],
64 |             encoding="utf-8",
65 |         )
66 |         heading = False
67 |         data = []
68 |         for line in output.splitlines():
69 |             parts = list(filter(lambda s: len(s) > 0, line.strip().split(" ")))
70 |             if len(parts) > 0:
71 |                 if not heading:
72 |                     assert parts[5] == "BrMisCond"
73 |                     heading = True
74 |                 else:
75 |                     data.append(int(parts[4]))
76 |         # skip misprediction from dummy branches
77 |         avg = (np.average(np.array(data)) - 1000) / 2000  # 2 branches, 1000 loops
78 |         print(target_toggle, dummy_branches, f"{avg:.2f}")
79 |         temp.append(avg)
80 |     z_data.append(temp)
81 | 
82 | plt.imshow(z_data)
83 | plt.xlabel("Dummy branches")
84 | plt.xticks(range(len(y_data)), y_data, rotation=90)
85 | plt.ylabel("Target toggle bit")
86 | plt.yticks(x_data)
87 | plt.savefig("my_branch3_2.png")
88 | plt.cla()
89 | 


--------------------------------------------------------------------------------
/src/find_branch_misses_pmu.cpp:
--------------------------------------------------------------------------------
  1 | #ifdef __linux__
  2 | #include "include/utils.h"
  3 | #include <assert.h>
  4 | #include <linux/perf_event.h>
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #include <sys/mman.h>
  8 | #include <unistd.h>
  9 | #include <vector>
 10 | 
 11 | // generated in find_branch_misses_pmu_gen.cpp
 12 | // args: loop count, buffer
 13 | typedef void (*gadget)(size_t, uint32_t *);
 14 | extern "C" {
 15 | extern gadget find_branch_misses_pmu_gadgets[];
 16 | }
 17 | 
 18 | // Scans raw PMU event codes 0x0..0x1000 with one gadget per pattern and logs
 19 | // every event that counted at least once to find_branch_misses_pmu.csv.
 20 | int main(int argc, char *argv[]) {
 21 |   const int num_patterns = 3;
 22 |   const int loop_count = 1000;
 23 |   const int iterations = 100, warmup = 10; // first `warmup` runs are dropped
 24 |   const int min_counter = 0x0;
 25 |   const int max_counter = 0x1000;
 26 |   const char *pattern_names[] = {
 27 |       "50% cond branch miss",
 28 |       "50% indirect branch miss",
 29 |       "50% cond + indirect branch miss",
 30 |   };
 31 | 
 32 |   bind_to_core();
 33 |   FILE *fp = fopen("find_branch_misses_pmu.csv", "w");
 34 |   if (!fp) { // explicit check: assert() is compiled out under NDEBUG
 35 |     perror("find_branch_misses_pmu.csv");
 36 |     return 1;
 37 |   }
 38 | 
 39 |   uint32_t *buffer = new uint32_t[loop_count + 1];
 40 |   for (int i = 0; i <= loop_count; i++) {
 41 |     buffer[i] = rand() % 2; // random 0/1 values handed to the gadget
 42 |   }
 43 | 
 44 |   fprintf(fp, "pattern,counter,min,avg,max\n");
 45 |   for (int pattern = 0; pattern < num_patterns; pattern++) {
 46 |     for (int counter = min_counter; counter <= max_counter; counter++) {
 47 |       raw_perf_counter perf =
 48 |           setup_perf_common_failable(PERF_TYPE_RAW, counter);
 49 |       if (perf.fd < 0) {
 50 |         continue; // this event code could not be set up; try the next one
 51 |       }
 52 | 
 53 |       double min = 0, max = 0, sum = 0;
 54 |       int samples = 0;
 55 |       for (int i = 0; i < iterations; i++) {
 56 |         uint64_t begin = perf.read();
 57 |         find_branch_misses_pmu_gadgets[pattern](loop_count, buffer);
 58 |         uint64_t elapsed = perf.read() - begin;
 59 |         if (i < warmup) {
 60 |           continue;
 61 |         }
 62 |         double rate = (double)elapsed / loop_count; // normalized by loop_count
 63 |         if (samples == 0 || rate < min) {
 64 |           min = rate;
 65 |         }
 66 |         if (samples == 0 || rate > max) {
 67 |           max = rate;
 68 |         }
 69 |         sum += rate;
 70 |         samples++;
 71 |       }
 72 |       close(perf.fd);
 73 |       if (perf.page) {
 74 |         munmap(perf.page, getpagesize());
 75 |       }
 76 | 
 77 |       if (max > 0.0) { // events that never counted are not reported
 78 |         fprintf(fp, "%s,0x%x,%.2lf,%.2lf,%.2lf\n", pattern_names[pattern],
 79 |                 counter, min, sum / samples, max);
 80 |       }
 81 |       fflush(fp);
 82 |     }
 83 |   }
 84 | 
 85 |   delete[] buffer;
 86 |   if (fclose(fp) != 0) {
 87 |     perror("find_branch_misses_pmu.csv");
 88 |     return 1;
 89 |   }
 90 |   printf("Results are written to find_branch_misses_pmu.csv\n");
 91 |   return 0;
 92 | }
 93 | #else
 94 | #include <stdio.h>
 95 | int main(int argc, char *argv[]) {
 96 |   printf("Not supported\n");
 97 |   return 0;
 98 | }
 99 | #endif
100 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/loop_buffer.sh2:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                loop_buffer.sh2             2020-08-26 Agner Fog
 3 | #
 4 | # PMC Test program for testing any loop buffer or microop cache
 5 | #
 6 | # (c) 2013-2020 GNU General Public License www.gnu.org/licenses
 7 | #
 8 | # Parameters:
 9 | #
10 | # nopsize:   Size of NOP instructions (1 - 15)
11 | # 
12 | # noptype:   2: long NOPs (0F 1F ...)
13 | #            3: 66 NOPs (simple NOP with up to 14 operand size prefixes)
14 | #            4: long NOPs up to 11, then other instructions with max 3 prefixes up to 14 (for processors that have penalties for > 3 prefixes)
15 | # 
16 | # repeat1:   Number of loop repetitions
17 | #
18 | # repeat2:   Number of NOPs in loop
19 | #
20 | # nthreads:  Number of simultaneous threads
21 | 
22 | # (You may change the parameters to focus near the limit of the buffer size)
23 | 
24 | . vars.sh
25 | 
26 | nthreads=1
27 | 
28 | repeat1=1000
29 | 
30 | nopsize=12
31 | 
32 | echo -e "Test loop buffer size"  > results2/loop_buffer.txt
33 | 
34 | for noptype in 2 3
35 | do
36 | for repeat2 in 2 10 20 30 32 40 50 100 1000 2000 2200 2500 2800 3000 4000 10000 20000
37 | do
38 | 
39 | totalsize=$(expr 5 + $repeat2 \* $nopsize )
40 | 
41 | echo -e "\n\nNumber of NOPs = $repeat2, noptype = $noptype, nopsize = $nopsize, total size = $totalsize"  >> results2/loop_buffer.txt
42 | 
43 | $ass -f elf64 -o b64.o -Dnthreads=$nthreads -Drepeat1=$repeat1 -Drepeat2=$repeat2 -Dnoptype=$noptype -Dnopsize=$nopsize -Ploop_buffer.inc TemplateB64.nasm
44 | if [ $? -ne 0 ] ; then exit ; fi
45 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
46 | if [ $? -ne 0 ] ; then exit ; fi
47 | ./x >> results2/loop_buffer.txt
48 | 
49 | done
50 | done
51 | 
52 | nopsize=14
53 | noptype=2
54 | 
55 | for repeat2 in 10 30 100 1000 2000 2200 2500 2800 3000 4000 10000 20000
56 | do
57 | 
58 | totalsize=$(expr 5 + $repeat2 \* $nopsize )
59 | 
60 | echo -e "\n\nNumber of NOPs = $repeat2, noptype = $noptype, nopsize = $nopsize, total size = $totalsize"  >> results2/loop_buffer.txt
61 | 
62 | $ass -f elf64 -o b64.o -Dnthreads=$nthreads -Drepeat1=$repeat1 -Drepeat2=$repeat2 -Dnoptype=$noptype -Dnopsize=$nopsize -Ploop_buffer.inc TemplateB64.nasm
63 | if [ $? -ne 0 ] ; then exit ; fi
64 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
65 | if [ $? -ne 0 ] ; then exit ; fi
66 | ./x >> results2/loop_buffer.txt
67 | 
68 | done
69 | 
70 | 
71 | 
72 | 
73 | 
74 | echo -e "\n\n\nTest with multiple threads"  >> results2/loop_buffer.txt
75 | nthreads=3
76 | 
77 | for repeat2 in 100 1000 2000 2500 3000 10000 100000
78 | do
79 | for noptype in 2
80 | do
81 | 
82 | totalsize=$(expr 5 + $repeat2 \* $nopsize )
83 | 
84 | echo -e "\n\nNumber of NOPs = $repeat2, noptype = $noptype, nopsize = $nopsize, total size = $totalsize"  >> results2/loop_buffer.txt
85 | 
86 | $ass -f elf64 -o b64.o -Dnthreads=$nthreads -Drepeat1=$repeat1 -Drepeat2=$repeat2 -Dnoptype=$noptype -Dnopsize=$nopsize -Ploop_buffer.inc TemplateB64.nasm
87 | if [ $? -ne 0 ] ; then exit ; fi
88 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
89 | if [ $? -ne 0 ] ; then exit ; fi
90 | ./x >> results2/loop_buffer.txt
91 | 
92 | done
93 | done
94 | 
95 | echo -e "\n"  >> results2/loop_buffer.txt
96 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/mul.sh1:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                                                                        2012-01-26 AgF
 3 | # Compile and run PMCTest for integer multiplication instructions
 4 | # looping through list of instructions
 5 | # (c) Copyright 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
 6 | 
 7 | # Detect CPU specific variables
 8 | . vars.sh
 9 | 
10 | echo -e "Multiplication instructions latency and throughput\n"  > results1/mul.txt
11 | 
12 | # single operand:
13 | 
14 | for i in  mul imul
15 | do
16 | 
17 | for r in 8 16 32 64
18 | do
19 | 
20 | echo -e "\n\nLatency: $i , registersize $r "  >> results1/mul.txt
21 | 
22 | $ass -f elf64 -o b64.o -Dinstruct=$i -Dnumop=1 -Dregsize=$r -Dtmode=L -Pmul.inc TemplateB64.nasm
23 | if [ $? -ne 0 ] ; then exit ; fi
24 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
25 | if [ $? -ne 0 ] ; then exit ; fi
26 | ./x >> results1/mul.txt
27 | 
28 | echo -e "\n\nThroughput: $i , registersize $r (subtract 1 uop)"  >> results1/mul.txt
29 | for cts in $PMClist
30 | do
31 | $ass -f elf64 -o b64.o -Dinstruct=$i -Dnumop=1 -Dregsize=$r -Dtmode=T -Dcounters=$cts -Pmul.inc TemplateB64.nasm
32 | if [ $? -ne 0 ] ; then exit ; fi
33 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
34 | if [ $? -ne 0 ] ; then exit ; fi
35 | ./x >> results1/mul.txt
36 | done
37 | 
38 | echo -e "\n\nThroughput with memory operand: $i , registersize $r (subtract 1 uop)" >> results1/mul.txt
39 | for cts in $PMClist
40 | do
41 | $ass -f elf64 -o b64.o -Dinstruct=$i -Dnumop=1 -Dregsize=$r -Dtmode=M -Dcounters=$cts -Pmul.inc TemplateB64.nasm
42 | if [ $? -ne 0 ] ; then exit ; fi
43 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
44 | if [ $? -ne 0 ] ; then exit ; fi
45 | ./x >> results1/mul.txt
46 | done
47 | done
48 | done
49 | 
50 | echo -e "\n"  >> results1/mul.txt
51 | 
52 | # 2 - 3 operands:
53 | 
54 | for i in  imul
55 | do
56 | 
57 | for n in  2 3
58 | do
59 | 
60 | for r in 16 32 64
61 | do
62 | 
63 | echo -e "\n\nLatency: $i , regsize $r, numop $n"  >> results1/mul.txt
64 | 
65 | $ass -f elf64 -o b64.o -Dinstruct=$i -Dnumop=$n -Dtmode=L -Dregsize=$r -Pmul.inc TemplateB64.nasm
66 | if [ $? -ne 0 ] ; then exit ; fi
67 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
68 | if [ $? -ne 0 ] ; then exit ; fi
69 | ./x >> results1/mul.txt
70 | 
71 | echo -e "\n\nThroughput: $i , regsize $r, numop $n"  >> results1/mul.txt
72 | for cts in $PMClist
73 | do
74 | $ass -f elf64 -o b64.o -Dinstruct=$i -Dnumop=$n -Dtmode=T -Dregsize=$r -Dcounters=$cts -Pmul.inc TemplateB64.nasm
75 | if [ $? -ne 0 ] ; then exit ; fi
76 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
77 | if [ $? -ne 0 ] ; then exit ; fi
78 | ./x >> results1/mul.txt
79 | done
80 | 
81 | echo -e "\n\nThroughput with memory operand: $i , regsize $r, numop $n"  >> results1/mul.txt
82 | for cts in $PMClist
83 | do
84 | $ass -f elf64 -o b64.o -Dinstruct=$i -Dnumop=$n -Dtmode=M -Dregsize=$r -Dcounters=$cts -Pmul.inc TemplateB64.nasm
85 | if [ $? -ne 0 ] ; then exit ; fi
86 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
87 | if [ $? -ne 0 ] ; then exit ; fi
88 | ./x >> results1/mul.txt
89 | done
90 | done
91 | done
92 | done
93 | 
94 | echo -e "\n"  >> results1/mul.txt
95 | 
96 | 


--------------------------------------------------------------------------------
/src/phr_branch_target_xor.cpp:
--------------------------------------------------------------------------------
  1 | #include "include/utils.h"
  2 | #include <assert.h>
  3 | #include <stdio.h>
  4 | #include <stdlib.h>
  5 | #include <vector>
  6 | 
  7 | // use with generate_gadget tool
  8 | 
  9 | // defined in gen_phr_branch_target_xor_test()
 10 | // args: loop count, buffer
 11 | typedef void (*gadget)(size_t, uint32_t *);
 12 | extern "C" {
 13 | extern gadget phr_branch_target_xor_gadgets[];
 14 | }
 15 | 
 16 | // Runs every gadget in phr_branch_target_xor_gadgets (one per
 17 | // (branch_toggle, target_toggle) pair, in the loop order below) on random
 18 | // data and writes the min/avg/max branch misses per branch of the measured
 19 | // runs to phr_branch_target_xor.csv.
 20 | int main(int argc, char *argv[]) {
 21 |   int loop_count = 1000;
 22 |   // match gen_phr_branch_target_xor_test
 23 | #if defined(HOST_AMD64)
 24 |   int min_branch_toggle = 1;
 25 |   int max_branch_toggle = 12;
 26 |   int min_target_toggle = 0;
 27 |   int max_target_toggle = 12;
 28 | #else
 29 |   int min_branch_toggle = 2;
 30 |   int max_branch_toggle = 18;
 31 |   int min_target_toggle = 2;
 32 |   int max_target_toggle = 18;
 33 | #endif
 34 | 
 35 |   bind_to_core();
 36 | #ifdef NO_COND_BRANCH_MISSES
 37 |   setup_perf_branch_misses();
 38 | #else
 39 |   setup_perf_cond_branch_misses();
 40 | #endif
 41 |   FILE *fp = fopen("phr_branch_target_xor.csv", "w");
 42 |   assert(fp);
 43 | 
 44 |   // loop_count + 1 entries: the fill loop below writes indices 0..loop_count
 45 |   uint32_t *buffer = new uint32_t[loop_count + 1];
 46 | 
 47 |   fprintf(fp, "branch,target,min,avg,max\n");
 48 |   int gadget_index = 0;
 49 |   int repeat = 2; // two branches
 50 |   for (int branch_toggle = min_branch_toggle;
 51 |        branch_toggle <= max_branch_toggle; branch_toggle++) {
 52 |     for (int target_toggle = min_target_toggle;
 53 |          target_toggle <= max_target_toggle; target_toggle++) {
 54 |       std::vector<double> history;
 55 |       int iterations = 100;
 56 |       history.reserve(iterations);
 57 | 
 58 |       double sum = 0;
 59 |       // run several times
 60 |       for (int i = 0; i < iterations; i++) {
 61 |         // new random 0/1 pattern for every run; the index is named j so that
 62 |         // it does not shadow the run counter i read by the warmup check below
 63 |         for (int j = 0; j <= loop_count; j++) {
 64 |           buffer[j] = rand() % 2;
 65 |         }
 66 | #ifdef NO_COND_BRANCH_MISSES
 67 |         uint64_t begin = perf_read_branch_misses();
 68 | #else
 69 |         uint64_t begin = perf_read_cond_branch_misses();
 70 | #endif
 71 |         phr_branch_target_xor_gadgets[gadget_index](loop_count, buffer);
 72 | #ifdef NO_COND_BRANCH_MISSES
 73 |         uint64_t elapsed = perf_read_branch_misses() - begin;
 74 | #else
 75 |         uint64_t elapsed = perf_read_cond_branch_misses() - begin;
 76 | #endif
 77 | 
 78 |         // skip warmup
 79 |         if (i >= 10) {
 80 |           // counter delta per loop iteration and per branch
 81 |           double time = (double)elapsed / loop_count / repeat;
 82 |           history.push_back(time);
 83 |           sum += time;
 84 |         }
 85 |       }
 86 |       gadget_index++;
 87 | 
 88 |       double min = history[0];
 89 |       double max = history[0];
 90 |       for (size_t k = 0; k < history.size(); k++) {
 91 |         if (min > history[k]) {
 92 |           min = history[k];
 93 |         }
 94 |         if (max < history[k]) {
 95 |           max = history[k];
 96 |         }
 97 |       }
 98 |       fprintf(fp, "%d,%d,%.2lf,%.2lf,%.2lf\n", branch_toggle, target_toggle,
 99 |               min, sum / history.size(), max);
100 |       fflush(fp);
101 |     }
102 |   }
103 | 
104 |   // release the CSV stream (every row was already flushed when written)
105 |   fclose(fp);
106 |   printf("Results are written to phr_branch_target_xor.csv\n");
107 |   delete[] buffer;
108 |   return 0;
109 | }
110 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/pushpop.sh1:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                                                                        2021-01-25 Agner Fog
 3 | # Compile and run PMCTest for push and pop instructions
 4 | # looping through list of instructions
 5 | # (c) Copyright 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
 6 | 
 7 | . vars.sh
 8 | 
 9 | echo -e "push and pop instructions latency and throughput\n64 bit mode"  > results1/pushpop.txt
10 | echo -e "Operands:"               >> results1/pushpop.txt
11 | echo -e "r  = register"            >> results1/pushpop.txt
12 | echo -e "I  = immediate constant"  >> results1/pushpop.txt
13 | echo -e "M  = memory"              >> results1/pushpop.txt
14 | echo -e "F  = flags"               >> results1/pushpop.txt
15 | echo -e "SP = stack pointer\n\n"   >> results1/pushpop.txt
16 | 
17 | 
18 | # warmup
19 | $ass -f elf64 -o b64.o -Dinstruct=push -Dregsize=64 -Doper=R -DWARMUPCOUNT=10000000 -Ppushpop.inc TemplateB64.nasm
20 | if [ $? -ne 0 ] ; then exit ; fi
21 | g++ -fno-pie -no-pie -m64 -fno-pie -no-pie a64.o b64.o -ox -lpthread
22 | if [ $? -ne 0 ] ; then exit ; fi
23 | ./x >> /dev/null
24 | 
25 | 
26 | for i in  push pop
27 | do
28 | for o in R SP M F
29 | do
30 | 
31 | echo -e "\n\nThroughput: $i , operand: $o "  >> results1/pushpop.txt
32 | for cts in $PMClist
33 | do
34 | $ass -f elf64 -o b64.o -Dinstruct=$i -Dregsize=64 -Doper=$o -Dcounters=$cts -Ppushpop.inc TemplateB64.nasm
35 | if [ $? -ne 0 ] ; then exit ; fi
36 | g++ -fno-pie -no-pie -m64 -fno-pie -no-pie a64.o b64.o -ox -lpthread
37 | if [ $? -ne 0 ] ; then exit ; fi
38 | ./x >> results1/pushpop.txt
39 | done
40 | done
41 | done
42 | 
43 | for i in  push
44 | do
45 | for o in I
46 | do 
47 | 
48 | echo -e "\n\nThroughput: $i , operand: $o "  >> results1/pushpop.txt
49 | for cts in $PMClist
50 | do
51 | $ass -f elf64 -o b64.o -Dinstruct=$i -Dregsize=64 -Doper=$o -Dcounters=$cts -Ppushpop.inc TemplateB64.nasm
52 | if [ $? -ne 0 ] ; then exit ; fi
53 | g++ -fno-pie -no-pie -m64 -fno-pie -no-pie a64.o b64.o -ox -lpthread
54 | if [ $? -ne 0 ] ; then exit ; fi
55 | ./x >> results1/pushpop.txt
56 | done
57 | done
58 | done
59 | 
60 | if [ $support32bit -ne 0 ] ; then
61 | 
62 | echo -e "\n\n\npush and pop instructions latency and throughput\n32 bit mode"  >> results1/pushpop.txt
63 | 
64 | for i in  push pop
65 | do
66 | for o in R SP M F
67 | do
68 | 
69 | echo -e "\n\nThroughput: $i , operand: $o "  >> results1/pushpop.txt
70 | 
71 | $ass -f elf32 -o b32.o -Dinstruct=$i -Dregsize=32 -Doper=$o -Ppushpop.inc TemplateB32.nasm
72 | if [ $? -ne 0 ] ; then exit ; fi
73 | g++ -fno-pie -no-pie -m32 -fno-pie -no-pie a32.o b32.o -ox -lpthread
74 | if [ $? -ne 0 ] ; then exit ; fi
75 | ./x >> results1/pushpop.txt
76 | 
77 | done
78 | done
79 | 
80 | for i in  push
81 | do
82 | for o in I
83 | do 
84 | 
85 | echo -e "\n\nThroughput: $i , operand: $o "  >> results1/pushpop.txt
86 | 
87 | $ass -f elf32 -o b32.o -Dinstruct=$i -Dregsize=32 -Doper=$o -Ppushpop.inc TemplateB32.nasm
88 | if [ $? -ne 0 ] ; then exit ; fi
89 | g++ -fno-pie -no-pie -m32 -fno-pie -no-pie a32.o b32.o -ox -lpthread
90 | if [ $? -ne 0 ] ; then exit ; fi
91 | ./x >> results1/pushpop.txt
92 | 
93 | done
94 | done
95 | fi
96 | 
97 | echo -e "\n"  >> results1/pushpop.txt
98 | 
99 | 


--------------------------------------------------------------------------------
/src/detect_uarch.cpp:
--------------------------------------------------------------------------------
  1 | #include "include/uarch.h"
  2 | #include <cassert>
  3 | #include <stdio.h>
  4 | 
  5 | // Prints, one per line on stdout, the -D... definitions that belong to the
  6 | // micro-architecture returned by get_uarch(). The unknown_* values print
  7 | // nothing.
  8 | int main() {
  9 |   enum uarch uarch = get_uarch();
 10 |   switch (uarch) {
 11 |   case firestorm:
 12 |     printf("-DAPPLE_SILICON\n");
 13 |     printf("-DAPPLE_PCORE\n");
 14 |     printf("-DAPPLE_M1\n");
 15 |     printf("-DAPPLE_M1_FIRESTORM\n");
 16 |     break;
 17 |   case icestorm:
 18 |     printf("-DAPPLE_SILICON\n");
 19 |     printf("-DAPPLE_M1\n");
 20 |     printf("-DAPPLE_M1_ICESTORM\n");
 21 |     break;
 22 |   case avalanche:
 23 |     printf("-DAPPLE_SILICON\n");
 24 |     printf("-DAPPLE_PCORE\n");
 25 |     printf("-DAPPLE_M2\n");
 26 |     printf("-DAPPLE_M2_AVALANCHE\n");
 27 |     break;
 28 |   case blizzard:
 29 |     printf("-DAPPLE_SILICON\n");
 30 |     printf("-DAPPLE_M2\n");
 31 |     printf("-DAPPLE_M2_BLIZZARD\n");
 32 |     break;
 33 |   case m4_pcore:
 34 |     printf("-DAPPLE_SILICON\n");
 35 |     printf("-DAPPLE_PCORE\n");
 36 |     printf("-DAPPLE_M4\n");
 37 |     printf("-DAPPLE_M4_PCORE\n");
 38 |     break;
 39 |   case m4_ecore:
 40 |     printf("-DAPPLE_SILICON\n");
 41 |     printf("-DAPPLE_M4\n");
 42 |     printf("-DAPPLE_M4_ECORE\n");
 43 |     break;
 44 |   case oryon:
 45 |     printf("-DQUALCOMM_ORYON\n");
 46 |     break;
 47 |   case cortex_a78:
 48 |     printf("-DARM_CORTEX_A78\n");
 49 |     break;
 50 |   case cortex_a77:
 51 |     printf("-DARM_CORTEX_A77\n");
 52 |     break;
 53 |   case cortex_x1:
 54 |     printf("-DARM_CORTEX_X1\n");
 55 |     break;
 56 |   case neoverse_n1:
 57 |     printf("-DNO_FJCVTZS\n");
 58 |     printf("-DARM_NEOVERSE_N1\n");
 59 |     break;
 60 |   case neoverse_v1:
 61 |     printf("-DARM_NEOVERSE_V1\n");
 62 |     break;
 63 |   case neoverse_n2:
 64 |     printf("-DARM_NEOVERSE_N2\n");
 65 |     break;
 66 |   case neoverse_v2:
 67 |     printf("-DARM_NEOVERSE_V2\n");
 68 |     break;
 69 |   case tsv110:
 70 |     printf("-DHISILICON_TSV110\n");
 71 |     break;
 72 |   case unknown_arm64:
 73 |     break;
 74 |   case golden_cove:
 75 |     printf("-DINTEL\n");
 76 |     printf("-DINTEL_AHYBRID\n");
 77 |     break;
 78 |   case gracemont:
 79 |     printf("-DINTEL\n");
 80 |     printf("-DINTEL_AHYBRID\n");
 81 |     break;
 82 |   case sunny_cove:
 83 |     printf("-DINTEL\n");
 84 |     printf("-DINTEL_ICELAKE_SERVER\n");
 85 |     break;
 86 |   case skylake:
 87 |     printf("-DINTEL\n");
 88 |     printf("-DINTEL_SKYLAKE_SERVER\n");
 89 |     break;
 90 |   case broadwell:
 91 |     printf("-DINTEL\n");
 92 |     printf("-DINTEL_BROADWELL\n");
 93 |     break;
 94 |   case zen1:
 95 |     printf("-DAMD\n");
 96 |     printf("-DAMD_ZEN1\n");
 97 |     break;
 98 |   case zen2:
 99 |     printf("-DAMD\n");
100 |     printf("-DAMD_ZEN2\n");
101 |     break;
102 |   case zen3:
103 |     printf("-DAMD\n");
104 |     printf("-DAMD_ZEN3\n");
105 |     break;
106 |   case zen4:
107 |     printf("-DAMD\n");
108 |     printf("-DAMD_ZEN4\n");
109 |     break;
110 |   case zen5:
111 |     printf("-DAMD\n");
112 |     printf("-DAMD_ZEN5\n");
113 |     break;
114 |   case unknown_amd64:
115 |     break;
116 |   case la464:
117 |     printf("-DLA464\n");
118 |     break; // explicit: do not fall through into the next label
119 |   case unknown_loongarch64:
120 |     break;
121 |   default:
122 |     // a uarch value without a case above; this check is compiled out under NDEBUG
123 |     assert(false);
124 |   }
125 |   return 0;
126 | }
127 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/shift.inc:
--------------------------------------------------------------------------------
  1 | ;----------------------------------------------------------------------------
  2 | ;                       shift.inc                           2012-01-26 Agner Fog
  3 | ;
  4 | ;            PMC Test program for shift and rotate instructions
  5 | ;                           YASM syntax
  6 | ;
  7 | ; The following macros can be defined on the command line or in include files:
  8 | ; 
  9 | ; instruct:     The name of a single instruction to test
 10 | ; 
 11 | ; regsize:      Register size: 8, 16, 32, 64
 12 | ; 
 13 | ; cntop:        Count operand: must be integer constant or cl
 14 | ;
 15 | ; tmode:        L:  Latency
 16 | ;               T:  Throughput
 17 | ;               M:  Throughput with memory operand
 18 | ;
 19 | ; (c) Copyright 2012 by Agner Fog. GNU General Public License www.gnu.org/licenses
 20 | ;-----------------------------------------------------------------------------
 21 | %ifndef tmode
 22 |    %define tmode  L          ; default: measure latency
 23 | %endif
 24 | 
 25 | %ifndef cntop
 26 |    %define cntop   1         ; default count operand = 1
 27 | %endif
 28 | 
 29 | 
 30 | ;##############################################################################
 31 | ;#
 32 | ;#                 Test code macro
 33 | ;#
 34 | ;##############################################################################
 35 | 
 36 | 
 37 | %macro blockports 0    ; currently unused
 38 | %endmacro  
 39 | 
 40 | 
 41 | ; main testcode macro
 42 | %macro testcode 0
 43 | ; Expands to a loop of 100 iterations (ebp); each iteration holds 100 copies of instruct.
 44 | %ifidni instruct, shld    ; define second operand for shld and shrd only
 45 |    %if modesize == 64
 46 |       %define op2 reg7,
 47 |    %else
 48 |       %define op2 reg6,
 49 |    %endif
 50 | %elifidni instruct, shrd
 51 |    %if modesize == 64
 52 |       %define op2 reg7,
 53 |    %else
 54 |       %define op2 reg6,
 55 |    %endif
 56 | %else
 57 |    %define op2 
 58 | %endif
 59 | ; op2 is now either a register followed by a comma (shld/shrd) or empty
 60 | ; start loop
 61 |         mov ecx, 5                    ; cl = 5: the shift count when cntop is cl
 62 |         mov ebp,100
 63 |         align 32
 64 | Testloop1:        
 65 | 
 66 | %IFIDNI tmode, L         ; measure latency
 67 | ; all 100 copies use reg0 as destination
 68 |    %rep 100 
 69 |         instruct reg0, op2 cntop
 70 |    %endrep
 71 | 
 72 | %ELIFIDNI tmode, T         ; measure throughput with register operands
 73 | ; 25 x 4 copies spread over four different destination registers
 74 |    %rep 25
 75 |         instruct reg0, op2 cntop
 76 |         blockports
 77 |         instruct reg1, op2 cntop
 78 |         blockports
 79 |         instruct reg3, op2 cntop      ; avoid ecx
 80 |         blockports
 81 |         instruct reg4, op2 cntop
 82 |         blockports
 83 |    %endrep
 84 | 
 85 | %ELIFIDNI tmode, M         ; measure throughput with memory operand
 86 | ; 25 x 4 copies on four memory operands at rsi + 0..3 * regsize
 87 |    %rep 25
 88 |         instruct sizeptr [rsi], op2 cntop
 89 |         blockports
 90 |         instruct sizeptr [rsi+regsize], op2 cntop
 91 |         blockports
 92 |         instruct sizeptr [rsi+regsize*2], op2 cntop
 93 |         blockports
 94 |         instruct sizeptr [rsi+regsize*3], op2 cntop
 95 |         blockports
 96 |    %endrep
 97 | 
 98 | %ELSE
 99 |    %error unknown testmode
100 | %ENDIF
101 | ; ebp counts the 100 passes over the unrolled block above
102 |         dec ebp
103 |         jnz Testloop1
104 | 
105 | %endmacro ; testcode
106 | 
107 | ; disable default test loops
108 | %define repeat1 1
109 | %define repeat2 1
110 | 
111 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/stack_sync_uops.inc:
--------------------------------------------------------------------------------
  1 | ;----------------------------------------------------------------------------
  2 | ;                       stack_sync_uops.inc        2013-07-21 Agner Fog
  3 | ;
  4 | ;            PMC Test program for testing stack synchronization uops
  5 | ;                           NASM syntax
  6 | ;
  7 | ; The following macros can be defined on the command line or in include files:
  8 | ; 
  9 | ; tcase:    Test case number. See below for each case
 10 | ;           1. Push and pop only
 11 | ;           2. added mov r,[rsp]
 12 | ;           3. further added mov r,rsp
 13 | ;           4. call and ret
 14 | ;           5. call and ret imm
 15 | ;           6. call and ret and add rsp,const
 16 | ;
 17 | ;
 18 | ; (c) Copyright 2013 by Agner Fog. GNU General Public License www.gnu.org/licenses
 19 | ;-----------------------------------------------------------------------------
 20 | 
 21 | %ifndef tcase
 22 |    %define tcase  1         ; default case 1 (the guarded name must be the one defined)
 23 | %endif
 24 | 
 25 | %ifndef count1
 26 |    %define count1   10      ; default count1
 27 | %endif
 28 | 
 29 | 
 30 | ;##############################################################################
 31 | ;#
 32 | ;#                 Test code macros
 33 | ;#
 34 | ;##############################################################################
 35 | 
 36 | 
 37 | %if tcase < 4   ; Push and pop only
 38 | ; count1 iterations of 3 pushes + 3 pops; tcase 2 adds mov r8,[rsp], tcase 3 also mov rdi,rsp
 39 | %macro testcode 0
 40 | nop
 41 | nop
 42 | mov ebp, count1
 43 | align 16
 44 | LL:
 45 |      push rax
 46 |      push rbx
 47 |      push rcx
 48 |      %if tcase > 1
 49 |      mov r8,[rsp]
 50 |      %endif
 51 |      pop rdx
 52 |      pop rdx
 53 |      pop rdx
 54 |      %if tcase > 2
 55 |      mov rdi,rsp
 56 |      %endif
 57 | dec ebp
 58 | jnz LL
 59 | %endmacro
 60 | 
 61 | %elif tcase == 4 ; call and return
 62 | ; testinit1 emits TESTFUNC (3 nops + ret) and a jump around it
 63 | %macro testinit1 0
 64 |    jmp AROUND
 65 |    align 16
 66 |    TESTFUNC:
 67 |    nop
 68 |    nop
 69 |    nop
 70 |    ret
 71 |    align 16
 72 |    AROUND:
 73 | %endmacro
 74 | ; count1 iterations of: two pushes, call TESTFUNC, two pops
 75 | %macro testcode 0
 76 | mov ebp, count1
 77 | align 16
 78 | LL:
 79 |      push rax
 80 |      push rbx
 81 |      call TESTFUNC
 82 |      pop rdx
 83 |      pop rdx
 84 | dec ebp
 85 | jnz LL
 86 | %endmacro
 87 | 
 88 | %elif tcase == 5 ; call and ret imm
 89 | ; TESTFUNC returns with ret 16, which also releases the 16 bytes pushed by the caller
 90 | %macro testinit1 0
 91 |    jmp AROUND
 92 |    align 16
 93 |    TESTFUNC:
 94 |    nop
 95 |    nop
 96 |    nop
 97 |    ret 16
 98 |    align 16
 99 |    AROUND:
100 | %endmacro
101 | ; count1 iterations of: two pushes and a call; no pops in the loop
102 | %macro testcode 0
103 | mov ebp, count1
104 | align 16
105 | LL:
106 |      push rax
107 |      push rbx
108 |      call TESTFUNC
109 | dec ebp
110 | jnz LL
111 | %endmacro
112 | 
113 | %elif tcase == 6 ; call and ret and add rsp,const
114 | ; same TESTFUNC body as case 4 (3 nops + plain ret)
115 | %macro testinit1 0
116 |    jmp AROUND
117 |    align 16
118 |    TESTFUNC:
119 |    nop
120 |    nop
121 |    nop
122 |    ret
123 |    align 16
124 |    AROUND:
125 | %endmacro
126 | ; count1 iterations of: two pushes, call, then add rsp,8 and one pop to rebalance the stack
127 | %macro testcode 0
128 | mov ebp, count1
129 | align 16
130 | LL:
131 |      push rax
132 |      push rbx
133 |      call TESTFUNC
134 |      add rsp,8
135 |      pop rcx
136 | dec ebp
137 | jnz LL
138 | %endmacro
139 | 
140 | %else
141 | %error unknown test case tcase
142 | %endif
143 | 
144 | ; disable default test loops
145 | %define repeat1 1
146 | %define repeat2 1
147 | 
148 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/daxpy.sh2:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #                                                                   2016-11-02 Agner Fog
 3 | 
 4 | # Compile and run PMCTest for different implementations of DAXPY algorithm
 5 | # (c) 2012-2016 by Agner Fog. GNU General Public License www.gnu.org/licenses
 6 | 
 7 | 
 8 | echo -e "Different implementations of DAXPY\n"  > results2/daxpy.txt
 9 | 
10 | . vars.sh
11 | 
12 | ndat=2000
13 | repeat1=100
14 | tcase=1
15 | nthreads=1
16 | 
17 | echo -e "\n\n$repeat1 * $ndat double precision elements\n"  >> results2/daxpy.txt
18 | 
19 | echo -e "\n\nCase 1: SSE2, 128 bit"  >> results2/daxpy.txt
20 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dndat=$ndat -Drepeat1=$repeat1 -Dnthreads=$nthreads -Pdaxpy.inc TemplateB64.nasm
21 | if [ $? -ne 0 ] ; then exit ; fi
22 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
23 | if [ $? -ne 0 ] ; then exit ; fi
24 | ./x >> results2/daxpy.txt
25 | 
26 | if  [ `grep -c -i "avx"   cpuinfo.txt ` -gt 0 ] ; then
27 | tcase=2
28 | echo -e "\n\nCase 2: AVX, 256 bit"  >> results2/daxpy.txt
29 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dndat=$ndat -Drepeat1=$repeat1 -Dnthreads=$nthreads -Pdaxpy.inc TemplateB64.nasm
30 | if [ $? -ne 0 ] ; then exit ; fi
31 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
32 | if [ $? -ne 0 ] ; then exit ; fi
33 | ./x >> results2/daxpy.txt
34 | fi
35 | 
36 | if  [ `grep -c -i "fma[ 3,\b]"   cpuinfo.txt ` -gt 0 ] ; then  # FMA3
37 | tcase=3
38 | echo -e "\n\nCase 3: FMA3, 128 bit"  >> results2/daxpy.txt
39 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dndat=$ndat -Drepeat1=$repeat1 -Dnthreads=$nthreads -Pdaxpy.inc TemplateB64.nasm
40 | if [ $? -ne 0 ] ; then exit ; fi
41 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
42 | if [ $? -ne 0 ] ; then exit ; fi
43 | ./x >> results2/daxpy.txt
44 | 
45 | tcase=4
46 | echo -e "\n\nCase 4: FMA3, 256 bit"  >> results2/daxpy.txt
47 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dndat=$ndat -Drepeat1=$repeat1 -Dnthreads=$nthreads -Pdaxpy.inc TemplateB64.nasm
48 | if [ $? -ne 0 ] ; then exit ; fi
49 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
50 | if [ $? -ne 0 ] ; then exit ; fi
51 | ./x >> results2/daxpy.txt
52 | fi
53 | 
54 | if  [ `grep -c -i "fma4"   cpuinfo.txt ` -gt 0 ] ; then  # FMA4
55 | tcase=5
56 | echo -e "\n\nCase 5: FMA4, 128 bit"  >> results2/daxpy.txt
57 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dndat=$ndat -Drepeat1=$repeat1 -Dnthreads=$nthreads -Pdaxpy.inc TemplateB64.nasm
58 | if [ $? -ne 0 ] ; then exit ; fi
59 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
60 | if [ $? -ne 0 ] ; then exit ; fi
61 | ./x >> results2/daxpy.txt
62 | 
63 | tcase=6
64 | echo -e "\n\nCase 6: FMA4, 256 bit"  >> results2/daxpy.txt
65 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dndat=$ndat -Drepeat1=$repeat1 -Dnthreads=$nthreads -Pdaxpy.inc TemplateB64.nasm
66 | if [ $? -ne 0 ] ; then exit ; fi
67 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
68 | if [ $? -ne 0 ] ; then exit ; fi
69 | ./x >> results2/daxpy.txt
70 | fi
71 | 
72 | if  [ `grep -c -i "avx512" cpuinfo.txt ` -gt 0 ] ; then  # AVX512
73 | tcase=7
74 | echo -e "\n\nCase 7: AVX512, 512 bit"  >> results2/daxpy.txt
75 | $ass -f elf64 -o b64.o -Dtcase=$tcase -Dndat=$ndat -Drepeat1=$repeat1 -Dnthreads=$nthreads -Pdaxpy.inc TemplateB64.nasm
76 | if [ $? -ne 0 ] ; then exit ; fi
77 | g++ -fno-pie -no-pie -m64 a64.o b64.o -ox -lpthread
78 | if [ $? -ne 0 ] ; then exit ; fi
79 | ./x >> results2/daxpy.txt
80 | fi
81 | 
82 | 
83 | echo -e "\n"  >> results2/daxpy.txt
84 | 
85 | 


--------------------------------------------------------------------------------
/src/fp_peak.cpp:
--------------------------------------------------------------------------------
  1 | #include "include/utils.h"
  2 | #include <assert.h>
  3 | #include <cstddef>
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <unistd.h>
  7 | #include <vector>
  8 | 
  9 | // defined in fp_peak_gen.cpp
 10 | // args: loop count
 11 | typedef void (*gadget)(size_t);
 12 | extern "C" {
 13 | extern gadget fp_peak_gadgets[];
 14 | }
 15 | 
 16 | // Times every gadget in fp_peak_gadgets with the cycle counter and writes the
 17 | // min/avg/max of coef[pattern] per cycle-per-instruction to fp_peak.csv
 18 | // (presumably FLOPs per cycle -- confirm against fp_peak_gen.cpp).
 19 | int main(int argc, char *argv[]) {
 20 |   // match fp_peak_gen.cpp
 21 |   int repeat = 1000;
 22 |   int loop_count = 1000;
 23 | 
 24 | #ifdef HOST_AARCH64
 25 |   int num_patterns = 6;
 26 |   // writable arrays: the two SVE names are filled in at run time below
 27 |   char patterns[][20] = {
 28 |       "32-bit SP FMADD",  "64-bit DP FMADD", "128-bit SP ASIMD",
 29 |       "128-bit DP ASIMD", "xxxx-bit SP SVE", "xxxx-bit DP SVE",
 30 |   };
 31 | 
 32 |   int coef[] = {
 33 |       32 / 32 * 2,  // 32-bit SP
 34 |       64 / 64 * 2,  // 64-bit DP
 35 |       128 / 32 * 2, // 128-bit SP
 36 |       128 / 64 * 2, // 128-bit DP
 37 |       0,            // ?-bit SP
 38 |       0,            // ?-bit DP
 39 |   };
 40 | #else
 41 |   int num_patterns = 4;
 42 |   const char *patterns[] = {
 43 |       "256-bit SP FMA",
 44 |       "256-bit DP FMA",
 45 |       "512-bit SP AVX512F",
 46 |       "512-bit DP AVX512F",
 47 |   };
 48 | 
 49 |   int coef[] = {
 50 |       256 / 32 * 2, // 256-bit SP
 51 |       256 / 64 * 2, // 256-bit DP
 52 |       512 / 32 * 2, // 512-bit SP
 53 |       512 / 64 * 2, // 512-bit DP
 54 |   };
 55 | #endif
 56 | 
 57 |   bind_to_core();
 58 |   setup_perf_cycles();
 59 |   FILE *fp = fopen("fp_peak.csv", "w");
 60 |   assert(fp);
 61 | 
 62 |   int gadget_index = 0;
 63 |   fprintf(fp, "pattern,min,avg,max\n");
 64 |   for (int pattern = 0; pattern < num_patterns; pattern++) {
 65 |     std::vector<double> history;
 66 |     int iterations = 100;
 67 |     history.reserve(iterations);
 68 | 
 69 | #ifdef HOST_AARCH64
 70 |     // read sve length in runtime
 71 |     if (pattern == 4) {
 72 |       uint64_t len = 0;
 73 |       asm __volatile__(".arch armv9-a+sve\ncntw %0" : "=r"(len));
 74 |       // bounded write, and a conversion that matches the 64-bit argument
 75 |       snprintf(patterns[pattern], sizeof(patterns[pattern]),
 76 |                "%llu-bit SP SVE", (unsigned long long)(len * 32));
 77 |       coef[pattern] = (int)(len * 2);
 78 |     } else if (pattern == 5) {
 79 |       uint64_t len = 0;
 80 |       asm __volatile__(".arch armv9-a+sve\ncntd %0" : "=r"(len));
 81 |       snprintf(patterns[pattern], sizeof(patterns[pattern]),
 82 |                "%llu-bit DP SVE", (unsigned long long)(len * 64));
 83 |       coef[pattern] = (int)(len * 2);
 84 |     }
 85 | #endif
 86 | 
 87 |     double sum = 0;
 88 |     // run several times
 89 |     for (int i = 0; i < iterations; i++) {
 90 |       uint64_t begin = perf_read_cycles();
 91 |       fp_peak_gadgets[gadget_index](loop_count);
 92 |       uint64_t elapsed = perf_read_cycles() - begin;
 93 | 
 94 |       // skip warmup
 95 |       if (i >= 10) {
 96 |         // coef divided by the cycles spent per loop iteration and per repeat
 97 |         double time =
 98 |             (double)coef[pattern] / ((double)elapsed / loop_count / repeat);
 99 |         history.push_back(time);
100 |         sum += time;
101 |       }
102 |     }
103 | 
104 |     double min = history[0];
105 |     double max = history[0];
106 |     for (size_t k = 0; k < history.size(); k++) {
107 |       if (min > history[k]) {
108 |         min = history[k];
109 |       }
110 |       if (max < history[k]) {
111 |         max = history[k];
112 |       }
113 |     }
114 | 
115 |     fprintf(fp, "%s,%.2lf,%.2lf,%.2lf\n", patterns[pattern], min,
116 |             sum / history.size(), max);
117 |     fflush(fp);
118 | 
119 |     gadget_index++;
120 |   }
121 | 
122 |   // release the CSV stream (every row was already flushed when written)
123 |   fclose(fp);
124 |   printf("Results are written to fp_peak.csv\n");
125 |   return 0;
126 | }
127 | 


--------------------------------------------------------------------------------
/agner/testp/TestScripts/read_write_bandwidth.inc:
--------------------------------------------------------------------------------
  1 | ; read_write_bandwidth.inc                         2021-03-23 Agner Fog
  2 | 
  3 | ; Test the maximum number of memory reads and writes per clock cycle
  4 | ;
  5 | ; (c) 2013-2021 by Agner Fog. GNU General Public License www.gnu.org/licenses
  6 | ;
  7 | ; Parameters:
  8 | ;
  9 | ; tmode:       Test mode:
 10 | ;              R:    read only
 11 | ;              W:    write only
 12 | ;              RW:   one read and one write
 13 | ;              RRW:  two reads and one write
 14 | ;              RRRW: three reads and one write
 15 | ;              RWW:  one read and two writes
 16 | ;              RWW2: one read and two writes to different cache lines
 17 | ; regsize:     register size, bits
 18 | 
 19 | ; define appropriate move instruction:
 20 | %if regsize <= 64
 21 |    %define mov1 mov
 22 | %elif regsize == 65    ; 64 bit mmx registers
 23 |    %define mov1 movq
 24 | %elif regsize == 128
 25 |    %define mov1 movdqa
 26 | %elif regsize == 256
 27 |    %define mov1 vmovdqa
 28 | %elif regsize == 512
 29 |    %define mov1 vmovdqa64
 30 | %else
 31 |    %error unknown register size regsize
 32 | %endif
 33 | 
 34 | 
 35 | ; main testcode macro
 36 | %macro testcode 0
 37 |    ; one unrolled block of mov1 loads/stores relative to psi, selected by tmode
 38 |    %ifidni tmode, R  ; read only
 39 |       ; 25 x 4 = 100 loads from four consecutive slots of regsize/8 bytes
 40 |       %rep 25
 41 |          mov1 reg0, [psi]
 42 |          mov1 reg1, [psi+regsize/8]
 43 |          mov1 reg2, [psi+2*regsize/8]
 44 |          mov1 reg3, [psi+3*regsize/8]
 45 |       %endrep   
 46 |          
 47 |    %elifidni tmode, W  ; write only
 48 |       ; 25 x 4 = 100 stores to four consecutive slots
 49 |       %rep 25
 50 |          mov1 [psi], reg0
 51 |          mov1 [psi+regsize/8], reg1
 52 |          mov1 [psi+2*regsize/8], reg2
 53 |          mov1 [psi+3*regsize/8], reg3
 54 |       %endrep   
 55 |          
 56 |    %elifidni tmode, RW  ; one read and one write
 57 |       ; 50 x (2 loads + 2 stores), loads and stores alternating
 58 |       %rep 50
 59 |          mov1 reg0, [psi]
 60 |          mov1 [psi+regsize/8], reg1
 61 |          mov1 reg2, [psi+2*regsize/8]
 62 |          mov1 [psi+3*regsize/8], reg3
 63 |       %endrep   
 64 | 
 65 |    %elifidni tmode, RRW  ; two reads and one write
 66 |       ; 50 x (4 loads + 2 stores); loads use slots 0-3, stores use slots 4-5
 67 |       %rep 50
 68 |          mov1 reg0, [psi]
 69 |          mov1 reg1, [psi+regsize/8]
 70 |          mov1 [psi+4*regsize/8], reg2
 71 |          mov1 reg3, [psi+2*regsize/8]
 72 |          mov1 reg4, [psi+3*regsize/8]
 73 |          mov1 [psi+5*regsize/8], reg5
 74 |       %endrep 
 75 | 
 76 |    %elifidni tmode, RRRW  ; three reads and one write
 77 |       ; 50 x (3 loads + 1 store), slots 0, 2, 4 read and slot 6 written
 78 |       %rep 50
 79 |          mov1 reg0, [psi]
 80 |          mov1 reg1, [psi+2*regsize/8]
 81 |          mov1 reg2, [psi+4*regsize/8]
 82 |          mov1 [psi+6*regsize/8], reg2   ; NOTE(review): stores reg2, which the previous line just loaded; every other mode stores a register that is not a load target -- confirm this is intended
 83 |       %endrep     
 84 | 
 85 |    %elifidni tmode, RWW  ; one read and two writes
 86 |       ; 50 x (2 loads + 4 stores)
 87 |       %rep 50
 88 |          mov1 reg0, [psi]
 89 |          mov1 [psi+2*regsize/8], reg1
 90 |          mov1 [psi+3*regsize/8], reg2
 91 |          mov1 reg3, [psi+1*regsize/8]
 92 |          mov1 [psi+4*regsize/8], reg4
 93 |          mov1 [psi+5*regsize/8], reg5
 94 |       %endrep
 95 |       
 96 |    %elifidni tmode, RWW2  ; one read and two writes to different cache lines
 97 |       ; as RWW, but the four stores go to fixed offsets that are 0x40 bytes apart
 98 |       %rep 50
 99 |          mov1 reg0, [psi]
100 |          mov1 [psi+0x40], reg1
101 |          mov1 [psi+0x80], reg2
102 |          mov1 reg3, [psi+1*regsize/8]
103 |          mov1 [psi+0xC0], reg4
104 |          mov1 [psi+0x100], reg5
105 |       %endrep   
106 |     
107 | %else 
108 |   
109 |     %error unknown test mode tmode
110 |     
111 | %endif
112 |     
113 | %endmacro
114 | 
115 | ; test loops
116 | %define repeat1 1000
117 | %define repeat2 1
118 | 


--------------------------------------------------------------------------------
/agner/testp/PMCTest/timingtest.h:
--------------------------------------------------------------------------------
  1 | /***************************  timingtest.h  ****************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2014-04-15
  4 | * Last modified: 2014-04-15
  5 | * Project:       define functions for timing purposes etc.
  6 | * Description:
  7 | *
  8 | ******************************************************************************/
  9 | 
 10 | #pragma once
 11 | #include <stdint.h>
 12 | 
 13 | #if defined(__WINDOWS__) || defined(_WIN32) || defined(_WIN64) 
 14 | // System-specific definitions for Windows
 15 | 
 16 | #if 1    // if intrin.h has __cpuid, __rdtsc and __readpmc
 17 | 
 18 | #include <intrin.h>
 19 | 
 20 | static inline void cpuid_ (int32_t output[4], int32_t functionnumber) {	
 21 |     __cpuid(output, functionnumber);
 22 | }
 23 | 
 24 | // serialize CPU by cpuid function 0
 25 | static inline void serialize () {
 26 |     int dummy[4];
 27 |     cpuid_(dummy, 0);
 28 |     // Prevent the compiler from optimizing away the whole Serialize function:
 29 |     volatile int DontSkip = dummy[0];
 30 | }
 31 | 
 32 | // read time stamp counter
 33 | static inline int64_t readtsc() {
 34 |     return __rdtsc();
 35 | }
 36 | 
 37 | // read performance monitor counter
 38 | static inline int64_t readpmc(int32_t nPerfCtr) {
 39 |     return __readpmc(nPerfCtr);
 40 | }
 41 | 
 42 | 
 43 | #else // intrin.h missing. use inline assembly
 44 | 
 45 | // inline MASM syntax
 46 | 
 47 | static inline void cpuid_ (int32_t output[4], int32_t functionnumber) {	
 48 |     __asm {
 49 |         mov eax, functionnumber;
 50 |         cpuid;
 51 |         mov esi, output;
 52 |         mov [esi],    eax;
 53 |         mov [esi+4],  ebx;
 54 |         mov [esi+8],  ecx;
 55 |         mov [esi+12], edx;
 56 |     }
 57 | }
 58 | 
 59 | static inline void serialize () {
 60 |     __asm {
 61 |         xor eax, eax;
 62 |         cpuid;
 63 |     }
 64 | }
 65 | 
 66 | // get time stamp counter
 67 | #pragma warning(disable:4035)
 68 | static inline uint64_t readtsc() {
 69 |     // read time stamp counter; there is no return statement, hence warning 4035 is disabled around this function
 70 |     __asm {
 71 |         rdtsc
 72 |     }
 73 | }
 74 | 
 75 | static inline uint64_t readpmc(int32_t nPerfCtr) {
 76 |     // read performance monitor counter number nPerfCtr
 77 |     __asm {
 78 |         mov ecx, nPerfCtr
 79 |             rdpmc
 80 |     }
 81 | }
 82 | #pragma warning(default:4035)
 83 | 
 84 | #endif
 85 | 
 86 | 
 87 | #elif defined(__unix__) || defined(__linux__)
 88 | // System-specific definitions for Linux
 89 | 
 90 | #include <cpuid.h>
 91 | 
 92 | // Execute CPUID leaf functionnumber and store eax, ebx, ecx, edx in output[0..3].
 93 | // __get_cpuid() returns 0 without writing its outputs when the leaf is not
 94 | // supported; output is zeroed in that case instead of being left uninitialized.
 95 | static inline void cpuid_ (int32_t output[4], int32_t functionnumber) {
 96 |     if (!__get_cpuid(functionnumber, (uint32_t*)output, (uint32_t*)(output+1), (uint32_t*)(output+2), (uint32_t*)(output+3))) {
 97 |         output[0] = output[1] = output[2] = output[3] = 0;
 98 |     }
 99 | }
100 | 
101 | // serialize CPU by cpuid function 0
102 | static inline void serialize () {
103 |     // cpuid overwrites eax as well as ebx, ecx and edx, so eax has to be an
104 |     // in/out operand ("+a"); with an input-only "a"(0) the compiler may assume
105 |     // that eax still holds 0 after the instruction
106 |     uint32_t eax = 0;
107 |     __asm __volatile__ ("cpuid" : "+a"(eax) : : "ebx", "ecx", "edx" );  // serialize
108 | }
109 | 
110 | // read time stamp counter
111 | static inline uint64_t readtsc() {
112 |     uint32_t lo, hi;
113 |     __asm __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi) : : );
114 |     return lo | (uint64_t)hi << 32;   // rdtsc result: low half in eax, high half in edx
115 | }
116 | 
117 | // read performance monitor counter number n
118 | static inline uint64_t readpmc(int32_t n) {
119 |     uint32_t lo, hi;
120 |     __asm __volatile__ ("rdpmc" : "=a"(lo), "=d"(hi) : "c"(n) : );
121 |     return lo | (uint64_t)hi << 32;   // rdpmc result: low half in eax, high half in edx
122 | }
123 | 
124 | 
125 | #else  // not Windows or Unix
126 | 
127 | #error Unknown platform
128 | 
129 | #endif
130 | 


--------------------------------------------------------------------------------