├── util_scripts ├── flush_page_cache.sh ├── check_cpu_freq.sh ├── lock_cpu_freq.sh ├── unlock_cpu_freq.sh ├── env.sh ├── reconfig_all.sh ├── config_all.sh ├── hyperthread_ctrl.sh └── numa_balance_ctrl.sh ├── caption_ae ├── example_input │ ├── single_mlc.sh │ ├── async_tune.txt │ └── sync_tune.sh ├── config.py ├── action.py ├── metrics │ ├── vmstat_mon.py │ ├── slab_mon.py │ ├── pmu_mon.py │ └── pcm_mon.py ├── algo.py ├── README.md ├── caption_ctrl.py └── caption.py ├── memo_ae ├── app │ └── mlc_linux │ │ └── place_holder.txt ├── src │ ├── test.h │ ├── Makefile │ ├── main.c │ ├── util.h │ ├── util.c │ ├── workload.c │ ├── test.c │ └── workload.h ├── evaluation │ ├── figure_4a.sh │ ├── figure_4b.sh │ └── figure_3.sh ├── generate_random_inst.py ├── test_cxl │ ├── test_single_op_latency.sh │ ├── test_movdir_bw.sh │ ├── test_seq_bw.sh │ ├── test_ptr_chase.sh │ ├── test_block_access_latency.sh │ └── test_rand_bw.sh └── README.md ├── README.md └── LICENSE /util_scripts/flush_page_cache.sh: -------------------------------------------------------------------------------- 1 | sudo sh -c "echo 3 > /proc/sys/vm/drop_caches" 2 | -------------------------------------------------------------------------------- /caption_ae/example_input/single_mlc.sh: -------------------------------------------------------------------------------- 1 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 2 | -------------------------------------------------------------------------------- /util_scripts/check_cpu_freq.sh: -------------------------------------------------------------------------------- 1 | sudo cpupower --cpu all frequency-info | grep "current CPU frequency" 2 | -------------------------------------------------------------------------------- /memo_ae/app/mlc_linux/place_holder.txt: -------------------------------------------------------------------------------- 1 | This is a place holder file. 2 | Please place the Intel MLC binary in the same directory. 
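For example, a typical setup looks like the following (the tarball name and layout here are only an assumption -- they depend on the MLC version you download from Intel's site):
  tar -xf mlc_v3.xx.tgz
  cp Linux/mlc ./mlc
  chmod +x ./mlc
The scripts in memo_ae and caption_ae expect the binary at memo_ae/app/mlc_linux/mlc.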
3 | -------------------------------------------------------------------------------- /util_scripts/lock_cpu_freq.sh: -------------------------------------------------------------------------------- 1 | sudo cpupower --cpu all frequency-set --freq 2100MHz 2 | sudo sh -c 'echo 0 > /sys/devices/system/cpu/cpufreq/boost' 3 | 4 | -------------------------------------------------------------------------------- /caption_ae/example_input/async_tune.txt: -------------------------------------------------------------------------------- 1 | ./single_mlc.sh 2 | ./single_mlc.sh 3 | ./single_mlc.sh 4 | ./single_mlc.sh 5 | ./single_mlc.sh 6 | ./single_mlc.sh 7 | -------------------------------------------------------------------------------- /util_scripts/unlock_cpu_freq.sh: -------------------------------------------------------------------------------- 1 | #sudo cpupower --cpu all frequency-set --governor osndemand 2 | sudo cpupower --cpu all frequency-set --governor ondemand 3 | 4 | -------------------------------------------------------------------------------- /util_scripts/env.sh: -------------------------------------------------------------------------------- 1 | export CLOSEST_CORE=0 2 | export CLOSEST_NODE=0 3 | export NODE_MAX=2 4 | export TSC_FREQ=2100 5 | 6 | echo "closest node to CXL=${CLOSEST_NODE}" 7 | echo "closest core to CXL=${CLOSEST_CORE}" 8 | echo "TSC_FREQ: $TSC_FREQ (Mhz)" 9 | -------------------------------------------------------------------------------- /util_scripts/reconfig_all.sh: -------------------------------------------------------------------------------- 1 | #!bin/bash 2 | 3 | SETUP_SCRIPT_DIR=./ 4 | 5 | # set 6 | bash $SETUP_SCRIPT_DIR/unlock_cpu_freq.sh 7 | bash $SETUP_SCRIPT_DIR/check_cpu_freq.sh 8 | bash $SETUP_SCRIPT_DIR/hyperthread_ctrl.sh 1 9 | bash $SETUP_SCRIPT_DIR/numa_balance_ctrl.sh 1 10 | -------------------------------------------------------------------------------- /util_scripts/config_all.sh: -------------------------------------------------------------------------------- 1 | #!bin/bash 2 | 3 | SETUP_SCRIPT_DIR=$(cd $(dirname $0) && pwd) 4 | 5 | # set 6 | bash $SETUP_SCRIPT_DIR/lock_cpu_freq.sh 7 | bash $SETUP_SCRIPT_DIR/check_cpu_freq.sh 8 | bash $SETUP_SCRIPT_DIR/hyperthread_ctrl.sh 0 9 | bash $SETUP_SCRIPT_DIR/numa_balance_ctrl.sh 0 10 | 11 | sudo systemctl stop numad 12 | -------------------------------------------------------------------------------- /memo_ae/src/test.h: -------------------------------------------------------------------------------- 1 | #ifndef TEST_H 2 | #define TEST_H 3 | #include "util.h" 4 | #include 5 | 6 | // spawn thread 7 | int run_test(test_cfg_t* cfg); 8 | 9 | // dispatch to different workload wrappers 10 | void* thread_wrapper(void* arg); 11 | 12 | void get_bw(test_cfg_t* cfg_arr, int iter, int delay); 13 | 14 | #endif // TEST_H 15 | -------------------------------------------------------------------------------- /util_scripts/hyperthread_ctrl.sh: -------------------------------------------------------------------------------- 1 | #!bin/bash 2 | 3 | if [ $1 -eq 1 ]; 4 | then 5 | echo "enable hyperthreading" 6 | sudo sh -c "echo on > /sys/devices/system/cpu/smt/control" 7 | fi 8 | 9 | if [ $1 -eq 0 ]; 10 | then 11 | echo "disable hyperthreading" 12 | sudo sh -c "echo off > /sys/devices/system/cpu/smt/control" 13 | fi 14 | 15 | -------------------------------------------------------------------------------- /util_scripts/numa_balance_ctrl.sh: -------------------------------------------------------------------------------- 
1 | #!bin/bash 2 | 3 | if [ $1 -eq 2 ]; 4 | then 5 | echo "enable numa balance in tpp mode" 6 | sudo sh -c "echo 2 > /proc/sys/kernel/numa_balancing" 7 | fi 8 | 9 | if [ $1 -eq 1 ]; 10 | then 11 | echo "enable numa balance" 12 | sudo sh -c "echo 1 > /proc/sys/kernel/numa_balancing" 13 | fi 14 | 15 | if [ $1 -eq 0 ]; 16 | then 17 | echo "disable numa balance" 18 | sudo sh -c "echo 0 > /proc/sys/kernel/numa_balancing" 19 | fi 20 | 21 | -------------------------------------------------------------------------------- /memo_ae/src/Makefile: -------------------------------------------------------------------------------- 1 | CC=gcc 2 | CFLAGS=-I. -W -Wall -Wextra -Wuninitialized -Wstrict-aliasing 3 | DEPS=util.h test.h workload.h 4 | OBJ=util.o test.o workload.o main.o 5 | LDLIBS=-lpthread -lnuma -lm 6 | 7 | .PHONY: all 8 | all: cxlMemTest 9 | 10 | %.o: %.c $(DEPS) 11 | $(CC) -c -o $@ $< $(CFLAGS) 12 | 13 | cxlMemTest: $(OBJ) 14 | $(CC) -o $@ $^ $(CFLAGS) $(LDLIBS) 15 | 16 | .PHONY: clean 17 | clean: 18 | $(RM) *~ *.o cxlMemTest 19 | 20 | debug: CFLAGS+=-g 21 | debug: cxlMemTest 22 | 23 | -------------------------------------------------------------------------------- /memo_ae/evaluation/figure_4a.sh: -------------------------------------------------------------------------------- 1 | bash ../../util_scripts/config_all.sh 2 | source ../../util_scripts/env.sh 3 | 4 | # config for SPR1 5 | CXL_NODE="2" 6 | NUMA_REMOTE_NODE="1" 7 | CLOSEST_CORE_S="0-31" 8 | 9 | # arg1: mem node 10 | # arg2: cores 11 | test_mlc_bw() { 12 | echo "running mlc, node: ${1}, cores: ${2} ..." 13 | sudo numactl --membind=$1 ../app/mlc_linux/mlc --peak_injection_bandwidth -k$2 -b104858 > ../results/figure_4a/c${2}_m${1}.txt 14 | 15 | echo "done!" 16 | } 17 | 18 | 19 | mkdir -p ../results/figure_4a 20 | echo "testing with MLC peak injection bw... 
" 21 | 22 | test_mlc_bw $CXL_NODE $CLOSEST_CORE_S 23 | test_mlc_bw $NUMA_REMOTE_NODE $CLOSEST_CORE_S 24 | -------------------------------------------------------------------------------- /caption_ae/example_input/sync_tune.sh: -------------------------------------------------------------------------------- 1 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 2 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 3 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 4 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 5 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 6 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 7 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 8 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 9 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 10 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 11 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 12 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 13 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 14 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 15 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 16 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 17 | -------------------------------------------------------------------------------- /memo_ae/generate_random_inst.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | CNT=16 4 | RANGE_LOW=0 5 | RANGE_HIGH=1024 6 | # NTLD 7 | #inst_base = '"vmovntdqa {0}(%%r11, %%r10), %%zmm{1} \\n" \\' 8 | # NTST 9 | #inst_base = '"vmovntdq %%zmm{1}, {0}(%%r11, %%r10) \\n" \\' 10 | # LD 11 | #inst_base = '"vmovdqa64 {0}(%%r11, %%r10), %%zmm{1} \\n lfence \\n" \\' 12 | #inst_base = '"vmovdqa64 {0}(%%r11, %%r10), %%zmm{1} \\n \\' 13 | # ST + WB 14 | #inst_base = '"vmovdqa64 %%zmm{1}, {0}(%%r11, %%r10) \\n clwb {0}(%%r11, %%r10) \\n" \\' 15 | # ST 16 | inst_base = '"vmovdqa64 %%zmm{1}, {0}(%%r11, %%r10) \\n" \\' 17 | 18 | seen_set = set() 19 | 20 | for i in range(CNT): 21 | curr_offset = random.randrange(RANGE_LOW, RANGE_HIGH) 22 | while curr_offset in seen_set: 23 | curr_offset = random.randrange(RANGE_LOW, RANGE_HIGH) 24 | 25 | seen_set.add(curr_offset) 26 | 27 | print(inst_base.format(hex(curr_offset << 6), i)) 28 | -------------------------------------------------------------------------------- /memo_ae/test_cxl/test_single_op_latency.sh: -------------------------------------------------------------------------------- 1 | ITERATION=10000 2 | 3 | bash ../../util_scripts/config_all.sh 4 | source ../../util_scripts/env.sh 5 | 6 | test_single_op_lats() { 7 | echo "[INFO] Test started" 8 | for ((k=0;k<=$NODE_MAX;k=k+1)); do 9 | for ((j=0;j<4;j=j+1)); do # op 10 | FOLDER_NAME=single_op_n${k} 11 | CURR_RESULT_PATH=../results/$FOLDER_NAME 12 | mkdir -p $CURR_RESULT_PATH 13 | 14 | echo "[TEST] op: $j node: $k......" 
15 | LINE=`sudo ../src/cxlMemTest -p $CLOSEST_CORE -t 1 -S 1 -n $k -T 0 -o $j -i $ITERATION -F $TSC_FREQ | awk '/Median latency/ {print}'` 16 | echo $LINE 17 | 18 | LATS=`echo $LINE | awk '{print $(NF-2)}' | grep -Eo '[+-]?[0-9]+([.][0-9]+)?'` 19 | echo $LATS >> $CURR_RESULT_PATH/single_op_lats_n${k}.txt 20 | echo $LATS 21 | done 22 | done 23 | } 24 | 25 | test_single_op_lats 26 | -------------------------------------------------------------------------------- /memo_ae/test_cxl/test_movdir_bw.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NUM_THREADS=32 3 | STEP=2 4 | ITERATION=3 5 | OP_MAX=3 6 | 7 | bash ../../util_scripts/config_all.sh 8 | source ../../util_scripts/env.sh 9 | 10 | FOLDER_NAME=movdir_bw_test 11 | CURR_RESULT_PATH=../results/$FOLDER_NAME 12 | mkdir -p $CURR_RESULT_PATH 13 | 14 | for ((src=0;src<=$NODE_MAX;src++)); do 15 | for ((dst=0;dst<=$NODE_MAX;dst++)); do 16 | for ((i=0;i<=$NUM_THREADS;i=i+$STEP)); do 17 | if [ $i == 0 ];then 18 | continue 19 | fi 20 | echo "[TEST] src: $src dst: $dst, num_thread: $i......" 21 | THROUGHPUT=`sudo ../src/cxlMemTest -p $CLOSEST_CORE -f -t $i -S 6 -n $src -d $dst -T 1 -o 4 -i $ITERATION | awk '/get_bw/ {print}'` 22 | BW=`echo $THROUGHPUT | awk '{print $(NF-1)}'` 23 | echo $BW >> $CURR_RESULT_PATH/s${src}_d${dst}.txt 24 | echo $THROUGHPUT 25 | done 26 | done 27 | done 28 | -------------------------------------------------------------------------------- /memo_ae/test_cxl/test_seq_bw.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NUM_THREADS=32 3 | STEP=2 4 | ITERATION=3 5 | OP_MAX=3 6 | 7 | bash ../../util_scripts/config_all.sh 8 | source ../../util_scripts/env.sh 9 | 10 | for ((j=0;j<=$NODE_MAX;j++)); do 11 | FOLDER_NAME=seq_bw_${j}_test 12 | CURR_RESULT_PATH=../results/$FOLDER_NAME 13 | mkdir -p $CURR_RESULT_PATH 14 | 15 | for ((k=0;k<=$OP_MAX;k++)); do 16 | 17 | for ((i=0;i<=$NUM_THREADS;i=i+$STEP)); do 18 | if [ $i == 0 ];then 19 | continue 20 | fi 21 | 22 | echo "[TEST] node: $j, op: $k, num_thread: $i......" 23 | THROUGHPUT=`sudo ../src/cxlMemTest -p $CLOSEST_CORE -f -t $i -S 6 -n $j -d $j -T 1 -o $k -i $ITERATION | awk '/get_bw/ {print}'` 24 | BW=`echo $THROUGHPUT | awk '{print $(NF-1)}'` 25 | echo $BW >> $CURR_RESULT_PATH/seq_bw_$k.txt 26 | echo $THROUGHPUT 27 | done 28 | done 29 | done 30 | -------------------------------------------------------------------------------- /memo_ae/test_cxl/test_ptr_chase.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | STEPS=19 3 | 4 | bash ../../util_scripts/config_all.sh 5 | source ../../util_scripts/env.sh 6 | 7 | for ((j=0;j<=$NODE_MAX;j++)); do 8 | FOLDER_NAME=chase_${j}_test 9 | mkdir -p ../results/$FOLDER_NAME 10 | 11 | # Start testing 12 | echo "[INFO] Test started" 13 | size=4096 14 | iter=500 15 | # max = 2^12 * 2^18 = 2^30 ~ 4GB 16 | for ((i=0;i<$STEPS;i++)); do 17 | if [ $((i%3)) -eq 1 ]; 18 | then 19 | iter=$((iter / 2)) 20 | fi 21 | echo -n "[TEST] test $i, iteration $iter......" 
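# The next command invokes memo's pointer-chase mode: -T 2 walks a linked list over a
# $size-byte buffer (-m) on node $j with a single thread (-t 1), pinned by -p to
# $CLOSEST_CORE (which, with no -f given, also leaves the hardware prefetcher disabled),
# while -F supplies the TSC frequency used to convert cycles into time.
# See the Arguments table in ../README.md for the full flag reference.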
22 | LATENCY=`sudo ../src/cxlMemTest -t 1 -m $size -o 0 -T 2 -n $j -i $iter -p $CLOSEST_CORE -F $TSC_FREQ | awk '/chase\/block_lats average/ {print $8}' | tail -n1 | grep -Eo '[+-]?[0-9]+([.][0-9]+)?'` 23 | echo $LATENCY >> ../results/$FOLDER_NAME/ptr_chase_lat_vs_size.txt 24 | echo "$LATENCY" 25 | 26 | size=$((size*2)) 27 | done 28 | done 29 | -------------------------------------------------------------------------------- /memo_ae/test_cxl/test_block_access_latency.sh: -------------------------------------------------------------------------------- 1 | ITERATION=10000 2 | 3 | bash ../../util_scripts/config_all.sh 4 | source ../../util_scripts/env.sh 5 | 6 | test_block_lats() { 7 | echo "[INFO] Test started" 8 | 9 | for ((k=0;k<=$NODE_MAX;k=k+1)); do # node 10 | for ((j=0;j<4;j=j+1)); do # op 11 | FOLDER_NAME=block_lats_n${k} 12 | CURR_RESULT_PATH=../results/$FOLDER_NAME 13 | mkdir -p $CURR_RESULT_PATH 14 | 15 | #echo "[TEST] $i $j $k......" 16 | echo "[TEST] op: $j node: $k......" 17 | LINE=`sudo ../src/cxlMemTest -p $CLOSEST_CORE -t 1 -S 1 -n $k -T 3 -o $j -i $ITERATION -B -F $TSC_FREQ | awk '/Median latency/ {print}'` 18 | echo $LINE 19 | 20 | LATS=`echo $LINE | awk '{print $(NF-2)}' | grep -Eo '[+-]?[0-9]+([.][0-9]+)?'` 21 | echo $LATS >> $CURR_RESULT_PATH/block_lats_n${k}.txt 22 | echo $LATS 23 | done 24 | done 25 | } 26 | 27 | test_block_lats 28 | -------------------------------------------------------------------------------- /memo_ae/test_cxl/test_rand_bw.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NUM_THREADS=32 3 | STEP=2 4 | ITERATION=3 5 | OP_MAX=3 6 | 7 | THREAD_CNT=(1 2 4 6 8) 8 | 9 | bash ../../util_scripts/config_all.sh 10 | source ../../util_scripts/env.sh 11 | 12 | for ((k=0;k<=$NODE_MAX;k++)); do # k node 13 | 14 | for ((op=0;op<=$OP_MAX;op++)); do 15 | 16 | for i in ${THREAD_CNT[@]}; do # i thread 17 | FOLDER_NAME=rand_bw_${k}_test 18 | batch_size=16 # 19 | echo "[INFO] ====> new core: ${i} <==== " 20 | 21 | for ((j=0;j<7;j=j+1)); do # j blocksize 22 | CURR_RESULT_PATH=../results/$FOLDER_NAME/ 23 | mkdir -p $CURR_RESULT_PATH 24 | 25 | echo "[TEST] node: $k, op: $op, thread: ${i}, batch_size: ${batch_size} ......" 
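# The next command runs memo's bandwidth test with randomized block placement: -T 1 selects
# the bandwidth mode, -o $op picks the operation (0=load, 1=nt-load, 2=store, 3=nt-store),
# -g sets the access block size in cachelines (doubled each inner pass), and -r makes
# successive blocks start at pseudo-random offsets while accesses inside a block stay
# sequential; -f keeps prefetching on and -S 6 uses a 6 GiB buffer (see ../README.md).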
26 | THROUGHPUT=`sudo ../src/cxlMemTest -p $CLOSEST_CORE -f -t $i -S 6 -n $k -T 1 -o $op -i $ITERATION -g $batch_size -r | awk '/get_bw/ {print}'` 27 | BW=`echo $THROUGHPUT | awk '{print $(NF-1)}'` 28 | echo $BW >> $CURR_RESULT_PATH/${i}_${op}.txt 29 | echo $THROUGHPUT 30 | batch_size=$((batch_size*2)) 31 | done 32 | done 33 | done 34 | done 35 | -------------------------------------------------------------------------------- /memo_ae/evaluation/figure_4b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NUM_THREADS=32 3 | STEP=2 4 | ITERATION=3 5 | OP_MAX=3 6 | 7 | bash ../../util_scripts/config_all.sh 8 | source ../../util_scripts/env.sh 9 | FOLDER_NAME="figure_4b" 10 | 11 | # ======================================= 12 | # test 2-32 thread BW, stepping = 2 13 | # ======================================= 14 | 15 | for ((j=0;j<=$NODE_MAX;j++)); do 16 | CURR_RESULT_PATH=../results/$FOLDER_NAME 17 | mkdir -p $CURR_RESULT_PATH 18 | 19 | for ((k=0;k<=$OP_MAX;k++)); do 20 | 21 | for ((i=0;i<=$NUM_THREADS;i=i+$STEP)); do 22 | if [ $i == 0 ];then 23 | continue 24 | fi 25 | 26 | # A - B - C 27 | # B is the closest node 28 | # A-B, B-B, B-C are sufficient to show Local-NUMA, Local-Local, Local-CXL 29 | 30 | echo "[TEST] node: $j, op: $k, num_thread: $i......" 31 | THROUGHPUT=`sudo ../src/cxlMemTest -t $i -S 6 -n $j -d $j -T 1 -o $k -i $ITERATION -p $CLOSEST_CORE -f | awk '/get_bw/ {print}'` 32 | BW=`echo $THROUGHPUT | awk '{print $(NF-1)}'` 33 | echo $BW >> $CURR_RESULT_PATH/seq_bw_op${k}_core${CLOSEST_CORE}_mem${j}.txt 34 | echo $THROUGHPUT 35 | done 36 | done 37 | done 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cxl\_type3\_tests 2 | This repository contains two parts: 3 | 1. **memo** -- a versatile benchmark for CXL-related memory behaviors and characterizations. 4 | 2. **caption** -- a performance tuning (based on memory page allocation) tool to maximize the system memory bandwidth utilazation in a CXL-enabled system. 5 | 6 | They are the correpsonding artifacts of the paper `Demystifying CXL Memory with True CXL-Ready Systems and CXL Memory Devices (MICRO'23)`, the first research work of CXL memory characterization based on real CXL hardware devices. 7 | 8 | For the artifact evaluation configurations, please refer to [link to config repo](https://github.com/ece-fast-lab/cxl_type3_tests_ae) 9 | 10 | ### Contact 11 | 12 | For any questions, please :e-mail: . 13 | 14 | Thank you! 
:wink: 15 | 16 | 17 | ## [Related Publication](https://doi.org/10.1145/3613424.3614256) 18 | 19 | 20 | ```bibtex 21 | @inproceedings {sun-memo, 22 | author = {Sun, Yan and Yuan, Yifan and Yu, Zeduo and Kuper, Reese and Song, Chihun and Huang, Jinghan and Ji, Houxiang and Agarwal, Siddharth and Lou, Jiaqi and Jeong, Ipoom and Wang, Ren and Ahn, Jung Ho and Xu, Tianyin and Kim, Nam Sung}, 23 | title = {Demystifying {CXL} memory with genuine {CXL}-ready systems and devices}, 24 | booktitle = {Proceedings of the 48th IEEE/ACM International Symposium on Microarchitecture (MICRO'23)}, 25 | year = {2023}, 26 | } 27 | ``` 28 | -------------------------------------------------------------------------------- /memo_ae/evaluation/figure_3.sh: -------------------------------------------------------------------------------- 1 | ITERATION=10000 2 | 3 | bash ../../util_scripts/config_all.sh 4 | source ../../util_scripts/env.sh 5 | 6 | test_block_lats() { 7 | echo "[INFO] Test started" 8 | echo "CLOSEST_NODE: $CLOSEST_NODE" 9 | echo "CLOSEST_CORE: $CLOSEST_CORE" 10 | echo "TSC_FREQ: $TSC_FREQ" 11 | FOLDER_NAME="figure_3_memo" 12 | 13 | for ((k=0;k<=$NODE_MAX;k=k+1)); do # node 14 | for ((j=0;j<4;j=j+1)); do # op 15 | CURR_RESULT_PATH=../results/$FOLDER_NAME 16 | mkdir -p $CURR_RESULT_PATH 17 | 18 | #echo "[TEST] $i $j $k......" 19 | echo "[TEST] op: $j node: $k, core: $CLOSEST_CORE......" 20 | LINE=`sudo ../src/cxlMemTest -p $CLOSEST_CORE -t 1 -S 1 -n $k -T 3 -o $j -i $ITERATION -B -F $TSC_FREQ | awk '/Median latency/ {print}'` 21 | echo $LINE 22 | 23 | LATS=`echo $LINE | awk '{print $(NF-2)}' | grep -Eo '[+-]?[0-9]+([.][0-9]+)?'` 24 | echo $LATS >> $CURR_RESULT_PATH/block_lats_n${k}.txt 25 | echo $LATS 26 | done 27 | done 28 | } 29 | 30 | test_mlc_lats() { 31 | echo "running mlc ..." 32 | sudo ../app/mlc_linux/mlc --latency_matrix > ../results/figure_3_mlc/mlc.txt 33 | 34 | echo "done!" 35 | } 36 | 37 | mkdir -p ../results/figure_3_mlc 38 | echo "testing with MLC ... " 39 | test_mlc_lats 40 | 41 | echo "testing with MEMO ... 
" 42 | test_block_lats 43 | -------------------------------------------------------------------------------- /caption_ae/config.py: -------------------------------------------------------------------------------- 1 | class bcolors: 2 | HEADER = '\033[95m' 3 | OKBLUE = '\033[94m' 4 | OKCYAN = '\033[96m' 5 | OKGREEN = '\033[92m' 6 | WARNING = '\033[93m' 7 | FAIL = '\033[91m' 8 | ENDC = '\033[0m' 9 | BOLD = '\033[1m' 10 | UNDERLINE = '\033[4m' 11 | 12 | LOG_NONE = 0 13 | LOG_ACTION = LOG_NONE + 1 14 | LOG_DEBUG = LOG_ACTION + 1 15 | LOG_METRIC = LOG_DEBUG + 1 16 | 17 | DO_LOG = LOG_DEBUG 18 | ACTION_ENABLE = True 19 | #ACTION_ENABLE = False 20 | 21 | IL_TOP_RESET = 10 22 | IL_BOT_MAX = IL_TOP_RESET * 2 23 | IL_BOT_RESET = 1 24 | STEP_RESET = 3 25 | MIN_STEP = 1 26 | 27 | 28 | WINDOW_SIZE=5 29 | 30 | # Config #1 31 | RESET_THRESHOLD = 500 32 | TUNE_TRESHOLD = 0.2 33 | IDLE_THRESHOLD = 70000 34 | ALLOC_THRESHOLD = 150000 35 | ALLOC_DROP_THRESHOLD = 50000 36 | 37 | scale_dict = { } 38 | 39 | metric_dict = { 40 | 'norm_ipc': 0, 41 | 'L1.miss.lats': 0, 42 | 'DDR.read.lats': 0, 43 | } 44 | 45 | # Model from R-studio 46 | #norm_ipc 99.55281 11.24329 8.854 2.35e-08 *** 47 | #l1_lat -0.04686 0.01539 -3.045 0.00639 ** 48 | #ddr_lat -0.48751 0.14351 -3.397 0.00286 ** 49 | coeff_dict = { 50 | 'norm_ipc': 99.55281, 51 | 'L1.miss.lats': -0.04686, 52 | 'DDR.read.lats': -0.48751, 53 | } 54 | 55 | pmu_translateion = { } 56 | 57 | pcm_translateion = { 58 | 'norm_ipc': 'pcm_norm_ipc', 59 | 'L1.miss.lats': 'pcm_l1miss', 60 | 'DDR.read.lats': 'pcm_ddrReadLat', 61 | } 62 | -------------------------------------------------------------------------------- /caption_ae/action.py: -------------------------------------------------------------------------------- 1 | from config import * 2 | import subprocess 3 | 4 | def log_action(color, log): 5 | if DO_LOG >= LOG_ACTION: 6 | print("[ACTION] === " + color + log + bcolors.ENDC + " ===") 7 | pass 8 | def log_metric(tag, log): 9 | log = str(log) 10 | log = tag + " " + log 11 | if DO_LOG >= LOG_METRIC: 12 | print("[METRIC] === " + bcolors.UNDERLINE + log + bcolors.ENDC + " ===") 13 | pass 14 | pass 15 | def log_debug(log): 16 | log = str(log) 17 | if DO_LOG >= LOG_DEBUG: 18 | print("[DEBUG] === " + bcolors.BOLD + log + bcolors.ENDC + " ===") 19 | pass 20 | 21 | def update_metric(pcm_dict, pmu_dict): 22 | #if len(pcm_dict) == 0 or len(pmu_dict) == 0: 23 | # return 24 | for i, (v, k) in enumerate(pcm_translateion.items()): 25 | if k not in pcm_dict: 26 | continue 27 | val = pcm_dict[k] 28 | if 'log' in v: 29 | val = math.log10(val) 30 | if "2)2" in v: 31 | val = val * val 32 | metric_dict[v] = val 33 | log_metric(v, val) 34 | 35 | for i, (v, k) in enumerate(pmu_translateion.items()): 36 | if k not in pmu_dict: 37 | continue 38 | val = pmu_dict[k] 39 | if 'log' in v: 40 | val = math.log10(val) 41 | if "2)2" in v: 42 | val = val * val 43 | metric_dict[v] = pmu_dict[k] 44 | log_metric(v, val) 45 | 46 | def set_ratio(top, bot): 47 | if ACTION_ENABLE: 48 | subprocess.run(['sudo','sysctl','-w','vm.numa_tier_interleave_top='+str(top)]) 49 | subprocess.run(['sudo','sysctl','-w','vm.numa_tier_interleave_bot='+str(bot)]) 50 | 51 | def reset_default(): 52 | set_ratio(IL_TOP_RESET, IL_BOT_RESET) 53 | -------------------------------------------------------------------------------- /memo_ae/src/main.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Developed by FAST Lab @ ECE-UIUC -- 2022-2023 3 | */ 4 | #include "util.h" 5 | #include "test.h" 6 
| #include 7 | #include 8 | #include 9 | #include 10 | 11 | int main(int argc, char*argv[]) { 12 | int ret; 13 | test_cfg_t* cfg; 14 | cfg = malloc(sizeof(test_cfg_t)); 15 | 16 | ret = parse_arg(argc, argv, cfg); 17 | if (ret < 0) { 18 | if (ret == -1) { 19 | printf("BAD parse_arg\n"); 20 | } 21 | goto out; 22 | } 23 | 24 | ret = init_buf(cfg->total_buf_size, cfg->buf_a_numa_node, &(cfg->buf_a)); 25 | if (ret < 0) { 26 | if (ret == -1) { 27 | printf("BAD init_buf buf_a, fail to alloc\n"); 28 | goto out; 29 | } else { // already alloc, needs to free 30 | printf("BAD init_buf buf_a, alloc strange\n"); 31 | goto out1; 32 | } 33 | } 34 | 35 | if (cfg->op == MOV) { 36 | ret = init_buf(cfg->total_buf_size, cfg->buf_b_numa_node, &(cfg->buf_b)); 37 | if (ret < 0) { 38 | if (ret == -1) { 39 | printf("BAD init_buf buf_b, fail to alloc\n"); 40 | goto out1; // free buf_a 41 | } else { // already alloc, needs to free 42 | printf("BAD init_buf buf_b, alloc strange\n"); 43 | goto out2; // free buf_b then buf_a 44 | } 45 | } 46 | } 47 | 48 | ret = run_test(cfg); 49 | 50 | ret = get_node(cfg->buf_a, cfg->total_buf_size); 51 | printf("end, buf_a is on node %d\n", ret); 52 | 53 | if (cfg->op == MOV) { 54 | ret = get_node(cfg->buf_b, cfg->total_buf_size); 55 | printf("end, buf_b is on node %d\n", ret); 56 | } 57 | 58 | out2: 59 | numa_free(cfg->buf_b, cfg->total_buf_size); 60 | out1: 61 | numa_free(cfg->buf_a, cfg->total_buf_size); 62 | out: 63 | free(cfg); 64 | return 0; 65 | } 66 | -------------------------------------------------------------------------------- /caption_ae/metrics/vmstat_mon.py: -------------------------------------------------------------------------------- 1 | #!/home/zeduoyu2/anaconda3/bin/python3 2 | 3 | import os 4 | import subprocess 5 | import time 6 | import sys 7 | import signal 8 | from queue import Queue, Empty 9 | from threading import Thread 10 | import re 11 | import time 12 | 13 | class vmstat_metric: 14 | def __init__(self) -> None: 15 | self.cnt = 0 16 | self.val = 0 17 | 18 | def run_realtime(self, interval=1000, print_info=True) -> None: 19 | # define and start the parsing threads 20 | def catch_output(): 21 | prev_val = 0 22 | curr_val = 0 23 | while(True): 24 | output = subprocess.check_output("cat /proc/vmstat", shell=True) 25 | output = output.splitlines() 26 | target_line = output[66] 27 | target_line = target_line.decode("utf-8") 28 | curr_val = int(target_line.split()[-1]) 29 | ''' 30 | for idx, line in enumerate(output): 31 | line = line.decode("utf-8") 32 | if "pgalloc_normal" not in line: 33 | continue 34 | else: 35 | curr_val = int(line.split()[-1]) 36 | print(idx) 37 | break 38 | ''' 39 | 40 | self.val = curr_val - prev_val 41 | prev_val = curr_val 42 | time.sleep(1) 43 | 44 | t = Thread(target=catch_output) 45 | t.daemon = True 46 | t.start() 47 | 48 | if print_info: 49 | while(True): 50 | time.sleep(1) 51 | 52 | 53 | def get_stat(self) -> int: 54 | return self.val 55 | 56 | if __name__ == "__main__": 57 | vmstat = vmstat_metric() 58 | vmstat.run_realtime(print_info=False) 59 | while(True): 60 | print(vmstat.get_stat()) 61 | time.sleep(1) 62 | -------------------------------------------------------------------------------- /memo_ae/src/util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Developed by FAST Lab @ ECE-UIUC -- 2022-2023 3 | */ 4 | #ifndef UTIL_H 5 | #define UTIL_H 6 | 7 | #include 8 | #include 9 | 10 | #define DEBUG 1 11 | #define debug_print(fmt, ...) 
\ 12 | do { if (DEBUG) fprintf(stderr, "%s:%d:%s(): " fmt, __FILE__, \ 13 | __LINE__, __func__, __VA_ARGS__); } while (0) 14 | 15 | 16 | /* text color */ 17 | #define RED "\x1B[31m" 18 | #define GRN "\x1B[32m" 19 | #define YEL "\x1B[33m" 20 | #define BLU "\x1B[34m" 21 | #define MAG "\x1B[35m" 22 | #define CYN "\x1B[36m" 23 | #define WHT "\x1B[37m" 24 | #define RESET "\x1B[0m" 25 | 26 | typedef struct chase_struct chase_t; 27 | 28 | struct chase_struct { 29 | // 64-bit addr, 64 * 64 = 512 bit per cacheline 30 | chase_t* ptr_arr[8]; 31 | }; 32 | 33 | typedef enum test_op { 34 | READ, 35 | READ_NT, 36 | WRITE, 37 | WRITE_NT, 38 | MOV, 39 | MIXED /* mix read and write */ 40 | } test_op_t; 41 | 42 | typedef enum test_type { 43 | LATS_CLFLUSH, 44 | BW, 45 | LATS_CHASE, 46 | BLOCK_LATS 47 | } test_type_t; 48 | 49 | typedef struct test_cfg { 50 | // overall 51 | uint64_t num_thread; 52 | uint64_t total_buf_size; 53 | int buf_a_numa_node; 54 | int buf_b_numa_node; 55 | char* buf_a; 56 | char* buf_b; 57 | bool prefetch_en; 58 | int bw_granu; // number of cache line (n * 64B) 59 | double tsc_freq; // GHz 60 | 61 | // thread 62 | int thread_idx; 63 | int core_a; 64 | int core_b; 65 | char* start_addr_a; 66 | char* start_addr_b; 67 | uint64_t per_thread_size; // num byte per thread 68 | int op_iter; 69 | test_type_t type; 70 | test_op_t op; 71 | int starting_core; 72 | bool random; 73 | int stall_ratio; 74 | int read_ratio; /* computed by (read / write) */ 75 | bool flush_block; 76 | int num_clear_pipe; 77 | 78 | // monitoring 79 | volatile uint64_t curr_op_cnt; 80 | 81 | // thread sync 82 | volatile int halt; 83 | 84 | } test_cfg_t; 85 | 86 | int parse_arg(int argc, char*argv[], test_cfg_t* cfg); 87 | 88 | int get_node(void* p, uint64_t size); 89 | 90 | int init_buf(uint64_t size, int node, char** alloc_ptr); 91 | 92 | uint64_t read_MSR(int cpu); 93 | 94 | void write_MSR(int cpu, uint64_t val); 95 | 96 | void disable_prefetch(int cpu); 97 | 98 | void enable_prefetch(int cpu); 99 | 100 | uint64_t xorshf96(uint64_t* x); 101 | 102 | void flush_all_cache(); 103 | 104 | #endif // UTIL_H 105 | -------------------------------------------------------------------------------- /caption_ae/algo.py: -------------------------------------------------------------------------------- 1 | from action import * 2 | from config import * 3 | 4 | # Input (tracked by the caller) 5 | # curr state 6 | # prev state 7 | # prev stepping 8 | # bot ratio 9 | # Output (returned to the caller) 10 | # curr stepping 11 | # new bot ratio 12 | 13 | # In this implementation, the top is fixed to some ratio 14 | # The bot is tunned in some range of value 15 | 16 | def algo(dynamic_state, static_state, prev_step, bot_ratio): 17 | diff = dynamic_state - static_state 18 | abs_diff = abs(diff) 19 | log_debug("PRE: diff:{0}, prev_step:{1}, bot_ratio:{2}".format(diff, prev_step, bot_ratio)) 20 | log_debug("PRE: dyn:{0}, stc:{1}".format(dynamic_state, static_state)) 21 | 22 | # ================================== pre-condition 23 | if dynamic_state > IDLE_THRESHOLD: 24 | log_action(bcolors.OKGREEN, "Pass -- idle") 25 | return MIN_STEP, bot_ratio 26 | elif abs_diff < TUNE_TRESHOLD: 27 | log_action(bcolors.OKGREEN, "Pass -- stable") 28 | set_ratio(IL_TOP_RESET, bot_ratio) 29 | if prev_step < 0: 30 | return -MIN_STEP , bot_ratio 31 | else: 32 | return MIN_STEP , bot_ratio 33 | elif abs_diff > RESET_THRESHOLD: 34 | log_action(bcolors.WARNING, "Reset") 35 | reset_default() 36 | return STEP_RESET, -1 37 | 38 | # ================================== step 39 
| curr_step = prev_step 40 | log_action(bcolors.OKBLUE, "Tune") 41 | if diff > 0: # gets better 42 | log_action(bcolors.OKGREEN, "better") 43 | curr_step = prev_step 44 | else: # gets worse 45 | curr_step = -prev_step / 2 # apply reversed half step 46 | log_action(bcolors.OKCYAN, "worse, pre-bound step = " + str(curr_step)) 47 | 48 | # ================================== bound step 49 | if curr_step < MIN_STEP and curr_step > -MIN_STEP: 50 | if curr_step < 0: 51 | curr_step = -MIN_STEP 52 | else: 53 | curr_step = MIN_STEP 54 | curr_step = int(curr_step) 55 | log_debug("post-bound step = " + str(curr_step)) 56 | 57 | ## ================================== bound ratio 58 | bot_ratio += curr_step 59 | if bot_ratio <= 1: # cap at ddr:cxl = 10:1 60 | log_action(bcolors.WARNING, "lower bound: bot{0}, step{1}".format(bot_ratio, curr_step)) 61 | bot_ratio = 1 62 | elif bot_ratio >= (IL_BOT_MAX): 63 | log_action(bcolors.WARNING, "upper bound: bot{0}, step{1}".format(bot_ratio, curr_step)) 64 | bot_ratio = IL_BOT_MAX 65 | 66 | # ================================== set 67 | bot_ratio = int(bot_ratio) 68 | set_ratio(IL_TOP_RESET, bot_ratio) 69 | 70 | log_debug("POST: curr_step:{0}, prev_step:{1}, bot_ratio:{2}".format(curr_step, prev_step, bot_ratio)) 71 | log_debug("POST: dynamic_state:{0}, static_state:{1}, bot_ratio:{2}".format(dynamic_state, static_state, bot_ratio)) 72 | return curr_step, bot_ratio 73 | 74 | # Linear function 75 | def calculate_state(): 76 | ret = 0 77 | for i, (k, v) in enumerate(coeff_dict.items()): 78 | # Log/square applied in the translation 79 | mul = v * metric_dict[k] 80 | ret += mul 81 | #print(mul, v, metric_dict[k]) 82 | return ret 83 | -------------------------------------------------------------------------------- /caption_ae/README.md: -------------------------------------------------------------------------------- 1 | # Caption 2 | 3 | ## Setup 4 | ### Prerequisite 5 | - Must 6 | - Python 3 7 | - Linux Kernel with N:M interleaving [patch](https://lore.kernel.org/linux-mm/YqD0%2FtzFwXvJ1gK6@cmpxchg.org/T/) 8 | + The patch added a tunable parameter (numa\_tier\_interleave) in `vm_table` in `kernel/sysctl.c` 9 | + In our case, we use two parameters to control the top and bot ratio independently. 10 | * `numa_tier_interleave_top` for top tier 11 | * `numa_tier_interleave_bot` for bot tier 12 | + The rest of the patch is applied without any modification 13 | - Intel PCM 14 | + Please follow [intel-pcm](https://github.com/intel/pcm) to clone and build PCM. 15 | + Please update the `PCM_PATH` in `metrics/pcm_mon.py` to your pcm binary path. 16 | 17 | ### Clone 18 | ```bash 19 | $ git clone https://github.com/ece-fast-lab/cxl_type3_tests.git 20 | $ cd caption_ae 21 | ``` 22 | 23 | ## Notes 24 | * The interleaving ratio is applied to `libnuma`, `numactl --interleave` calls for memory interleaving **allocations**. Therefore, the ratio is only applied upon new memory allocations. This is orthogonal to works on memory migration. 25 | * Currently, `Caption` assumes major memory allocation happens when the application launches, and thus, the tuning happesns at the end of each iteration of an application and before its next launch. 26 | * Caption is independent of application output and only monitors system performance counters. However, in some cases, it may be desirable to have application output as a feed back on the direction of tunning. We leave the enhancement of the monitoring scheme as a future work. 
27 | * Although `IPC` alone may seem sufficient, `L1 latency` and `DDR latency` are there to assist the model in identifying subtle changes in the application's performance. 28 | 29 | ## Known issues 30 | * If the tuning time interval is too small, `Caption` may not be able to capture enough information about the system state. 31 | * If the tuning stepping is too small, `Caption` may not be able to correctly identify whether the tuning direction is correct -- i.e. the performance difference is too subtle. 32 | 33 | ## Arguments 34 | | Argument | Brief description | Default | Valid inputs | Note | 35 | | -------- | ----------------- | ------- | ------------ | ---- | 36 | | h | Help message generated by `argparse` | - | - | - | 37 | | x | Stepping mode | - | - | Test the Caption model by simply iterating through the interleaving ratio. By default, this will iterate from DDR:CXL = 10:1 and increase the CXL ratio by 2, i.e. 10:3, 10:5 ...| 38 | | n | No tune | - | - | This is used for monitoring the model output at a fixed interleaving ratio. The tuning algorithm is not applied in this case.| 39 | | s | Test synchronous mode, shell script path | - | String, path to a shell script | The script may contain multiple programs, where each program may execute in the background with '&'. In this case, tuning happens when the shell script exits. We provide an example script in the `example_input` folder.| 40 | | t | Test asynchronous mode, txt file path | - | String, path to a txt file | The txt file may contain multiple shell scripts. Each script will be executed by the python program in a separate thread. In this case, tuning happens whenever one of the shell scripts exits. You may change the tuning mask (`tune_mask` in `caption.py`) to enable tuning only when a specific thread (script) ends.
We provide an example txt in the `example_input`.| 41 | 42 | ## Example usage 43 | ### Syncrhnous mode 44 | ``` 45 | $ python3 caption.py -s example_input/sync_tune.sh 46 | ``` 47 | 48 | ### Asynchronous mode 49 | ``` 50 | $ python3 caption.py -t example_input/async_tune.sh 51 | ``` 52 | -------------------------------------------------------------------------------- /caption_ae/metrics/slab_mon.py: -------------------------------------------------------------------------------- 1 | #!/home/zeduoyu2/anaconda3/bin/python3 2 | 3 | import os 4 | import subprocess 5 | import time 6 | import sys 7 | import signal 8 | from queue import Queue, Empty 9 | from threading import Thread 10 | 11 | class slab_metric: 12 | def __init__(self, window_size=5) -> None: 13 | self.slab_realtime_cmd = ["sudo", "python3", "/home/yans3/bcc/tools/slabratetop.py", "-C"] 14 | self.stats = {'alloc_sum':[]} 15 | self.cnt = 0 16 | self.moving_sum = 0 17 | self.window_size = window_size 18 | for i in range(window_size): 19 | self.stats['alloc_sum'].append(0) 20 | 21 | def run_realtime(self, interval=1000, print_info=True) -> None: 22 | 23 | # define the SIGINT handler 24 | def signal_handler(sig, frame): 25 | os.killpg(os.getpgid(self.p_slab.pid), signal.SIGINT) 26 | print('[INFO] You pressed Ctrl+C!') 27 | sys.exit(0) 28 | 29 | #signal.signal(signal.SIGINT, signal_handler) 30 | print('[INFO] Press Ctrl+C to exit') 31 | 32 | cmd = self.slab_realtime_cmd 33 | print("[COMMAND]", cmd) 34 | self.p_slab = subprocess.Popen(cmd, text=True, stdout=subprocess.PIPE, preexec_fn=os.setsid) 35 | 36 | # define and start the recording threads 37 | def enqueue_output(stdout, queue): 38 | for line in stdout: 39 | queue.put(line) 40 | stdout.close() 41 | 42 | q_slab = Queue() 43 | t_slab = Thread(target=enqueue_output, args=(self.p_slab.stdout, q_slab)) 44 | t_slab.daemon = True 45 | t_slab.start() 46 | 47 | # define and start the parsing threads 48 | def catch_output(q:Queue): 49 | curr_sum = 0 50 | while(True): 51 | try: 52 | line = q.get_nowait() # or q.get(timeout=.1) 53 | except Empty: 54 | time.sleep(interval / 1000) # tune to 0.5 just in case 55 | else: # got line 56 | if line.isspace(): continue 57 | line = line.split() 58 | try: 59 | float(line[-1]) 60 | except ValueError: 61 | continue 62 | 63 | if "loadavg" in line[1]: 64 | self.moving_sum += curr_sum 65 | self.cnt += 1 66 | if self.cnt >= self.window_size: 67 | self.moving_sum -= self.stats['alloc_sum'][self.cnt % self.window_size] 68 | 69 | if print_info: 70 | print('mov', self.moving_sum, 'curr', curr_sum) 71 | print(self.stats['alloc_sum']) 72 | 73 | self.stats['alloc_sum'][self.cnt % self.window_size] = curr_sum 74 | curr_sum = 0 75 | else: 76 | curr_sum += int(line[1]) 77 | 78 | if print_info: 79 | print(line) 80 | 81 | 82 | t_catch_latency = Thread(target=catch_output, args=[q_slab]) 83 | t_catch_latency.daemon = True 84 | t_catch_latency.start() 85 | 86 | if print_info: 87 | while(True): 88 | time.sleep(1) 89 | 90 | 91 | def get_stat(self) -> int: 92 | return self.moving_sum / self.window_size 93 | 94 | if __name__ == "__main__": 95 | slab = slab_metric(1) 96 | slab.run_realtime(print_info=True) 97 | while(True): 98 | print(slab.get_stat()) 99 | time.sleep(1) 100 | -------------------------------------------------------------------------------- /caption_ae/caption_ctrl.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | from threading import Thread 4 | import math 5 | import threading 6 | import 
statistics 7 | 8 | from metrics.pcm_mon import * 9 | from metrics.vmstat_mon import * 10 | 11 | from algo import * 12 | from action import * 13 | from config import * 14 | 15 | MIN_SAMPLE_CNT = 20 16 | 17 | print("=======================") 18 | print("reset basics: top:{0}, bot_max:{1}, bot_rst:{2}, step_rst:{3}".format( 19 | IL_TOP_RESET, 20 | IL_BOT_MAX, 21 | IL_BOT_RESET, 22 | STEP_RESET 23 | )) 24 | print("=======================") 25 | 26 | class caption_ctrl: 27 | def __init__(self) -> None: 28 | self.arr = {'norm_ipc': [], 29 | 'L1.miss.lats': [], 30 | 'DDR.read.lats': []} 31 | self.prev_state = -1 32 | self.prev_step = STEP_RESET 33 | self.prev_ratio = IL_BOT_RESET 34 | 35 | def run_realtime(self, log_level=LOG_DEBUG)->None: 36 | pcm = pcm_metric() 37 | pcm_thread = Thread(target=pcm.run_realtime, args=(False,)) 38 | pcm_thread.start() 39 | reset_default() 40 | 41 | def catch_output(): 42 | cnt = 0 43 | while(True): 44 | update_metric(pcm.get_stat(WINDOW_SIZE), {}) 45 | 46 | self.arr['norm_ipc'].append(metric_dict['norm_ipc']) 47 | self.arr['L1.miss.lats'].append(metric_dict['L1.miss.lats']) 48 | self.arr['DDR.read.lats'].append(metric_dict['DDR.read.lats']) 49 | time.sleep(1) 50 | cnt += 1 51 | 52 | t = Thread(target=catch_output) 53 | t.daemon = True 54 | t.start() 55 | 56 | ''' 57 | while(True): 58 | time.sleep(1) 59 | self.prev_state += 1 60 | ''' 61 | 62 | def get_set_tune_val(self, first_time=False, model_only=False): 63 | log_action(bcolors.OKCYAN, "=============== TUNE ================ ") 64 | 65 | # get 66 | if len(self.arr['norm_ipc']) > MIN_SAMPLE_CNT: 67 | log_action(bcolors.OKCYAN, "=============== TUNE -- valid ================ ") 68 | accu_avg = 0 69 | # linear equation, SUM(coeff * mean of sample) 70 | for k, v in self.arr.items(): 71 | mean = sum(v) / len(v) 72 | accu_avg += coeff_dict[k] * mean 73 | self.arr[k] = [] 74 | print(k, mean) 75 | log_debug("=============== accu_avg -- {0} ================ ".format(accu_avg)) 76 | if model_only: 77 | return 78 | else: 79 | log_action(bcolors.OKCYAN, "=============== TUNE -- not enough samples ================ ") 80 | log_action(bcolors.OKCYAN, "=============== TUNE -- need: %d, has: %d samples ================ " % (MIN_SAMPLE_CNT, 81 | len(self.arr['norm_ipc']))) 82 | return 83 | 84 | log_action(bcolors.OKCYAN, "=============== TUNE -- algo ================ ") 85 | # first time tuning always attempt to tune toward more CXL 86 | if first_time: 87 | self.prev_state = accu_avg - 1 88 | step, ratio = algo(accu_avg, self.prev_state, self.prev_step, self.prev_ratio) 89 | 90 | log_action(bcolors.OKCYAN, "=============== TUNE -- track states ================ ") 91 | self.prev_state = accu_avg 92 | self.prev_step = step 93 | if ratio < 0: 94 | self.prev_ratio = IL_BOT_RESET 95 | else: 96 | self.prev_ratio = ratio 97 | 98 | if __name__ == "__main__": 99 | tuner = caption_ctrl() 100 | tuner.run_realtime() 101 | while(True): 102 | time.sleep(1) 103 | print(tuner.get_set_tune_val()) 104 | -------------------------------------------------------------------------------- /caption_ae/caption.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import sys 4 | from caption_ctrl import * 5 | import argparse 6 | import subprocess 7 | import threading 8 | import time 9 | 10 | MAX_TUNE_ITER = 7 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-x", "--step", help="stepping mode (10:x+=step). 
Algorithm is not applied, but the output of the estimator will be printed.", action='store_true') 14 | parser.add_argument("-s", "--sh-path", help="shell script that houses the program to be tuned. You may embed several program in this shell script") 15 | parser.add_argument("-n", "--no-tune", help="disable tuning. This will simply run the passed in shell script", action='store_true') 16 | parser.add_argument("-t", "--batch_txt", help="txt script that houses several shell scripts to be exectued. Tuning will happen whenevr a script ended its execution. You may set 'tune_mask' to enable tunning when one of the program ends") 17 | args = parser.parse_args() 18 | 19 | #print(args.echo) 20 | def exec_cmd_and_wait(sh_path): 21 | print(sh_path) 22 | subprocess.call(["sudo", "bash", sh_path]) 23 | 24 | def exec_cmd_and_wait_arg(sh_path): 25 | sh_path_arr = sh_path.split() 26 | sh_path_arr.insert(0, "bash") 27 | sh_path_arr.insert(0, "sudo") 28 | print(sh_path_arr) 29 | subprocess.call(sh_path_arr) 30 | 31 | def sync_tune(): 32 | # start monitor 33 | tuner = caption_ctrl() 34 | tuner.run_realtime() 35 | time.sleep(5) 36 | 37 | tune_iter = 0 38 | first_time = True 39 | 40 | stepping_ratio = 1 41 | 42 | try: 43 | while True: 44 | print("sync tuning iteration: %d" % (tune_iter)) 45 | 46 | # run exec 47 | exec_cmd_and_wait(args.sh_path) 48 | 49 | # tune 50 | if args.no_tune: 51 | print(" ============ no tune ============== ") 52 | if args.step is not None: 53 | print(" ============ no tune, stepping only ============== ") 54 | tuner.get_set_tune_val(first_time, model_only=True) 55 | stepping_ratio += 2 56 | set_ratio(10, stepping_ratio) 57 | else: 58 | tuner.get_set_tune_val(first_time) 59 | 60 | first_time = False 61 | time.sleep(5) 62 | tune_iter += 1 63 | 64 | if tune_iter >= MAX_TUNE_ITER: 65 | break 66 | except KeyboardInterrupt: 67 | print("ended with ctrl-c") 68 | 69 | def async_tune(): 70 | # start monitor 71 | tuner = caption_ctrl() 72 | tuner.run_realtime() 73 | time.sleep(5) 74 | 75 | tune_iter = 0 76 | first_time = True 77 | 78 | # read path 79 | # (t, sh_path) 80 | thread_arr = [] 81 | tune_mask = [] 82 | with open(args.batch_txt) as f: 83 | for line in f.readlines(): 84 | line = line.strip() 85 | print(line) 86 | t = threading.Thread(target=exec_cmd_and_wait_arg, args=(line,)) 87 | thread_arr.append((t, line)) 88 | 89 | # FIXME, set to the desired mask 90 | tune_mask.append(True) 91 | # for example: tune whenever "roms" ended 92 | #tune_mask.append("roms" in line) 93 | 94 | for t, _ in thread_arr: 95 | t.start() 96 | 97 | print("tune_mask: ", tune_mask) 98 | 99 | try: 100 | while True: 101 | print("async tuning iteration: %d" % (tune_iter)) 102 | 103 | found_end = False 104 | # This will find the targeted ending thread 105 | while found_end is False: 106 | 107 | for idx, (t, sh_path) in enumerate(thread_arr): 108 | if not t.is_alive(): 109 | print("ended %s" % sh_path) 110 | 111 | # restart 112 | new_t = threading.Thread(target=exec_cmd_and_wait_arg, args=(sh_path,)) 113 | thread_arr[idx] = (new_t, sh_path) 114 | new_t.start() 115 | 116 | # stop 117 | if tune_mask[idx]: 118 | print("tune!") 119 | found_end = True 120 | break 121 | 122 | # sleep, avoid spinning 123 | time.sleep(1) 124 | 125 | # tune 126 | if args.no_tune: 127 | print(" ============ no tune ============== ") 128 | else: 129 | tuner.get_set_tune_val(first_time) 130 | 131 | first_time = False 132 | time.sleep(5) 133 | tune_iter += 1 134 | 135 | if tune_iter >= MAX_TUNE_ITER: 136 | break 137 | 138 | except KeyboardInterrupt: 
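# NOTE: threading.Thread has no stop() method, so the t.stop() calls below will
# raise AttributeError; making the worker threads daemon threads (or join()-ing
# them) would be a cleaner way to shut down on Ctrl-C.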
139 | print("ctrl-c pressed") 140 | for t, _ in thread_arr: 141 | t.stop() 142 | print("ended with ctrl-c") 143 | 144 | if __name__ == "__main__": 145 | if args.batch_txt is not None: 146 | async_tune() 147 | else: 148 | sync_tune() 149 | 150 | -------------------------------------------------------------------------------- /caption_ae/metrics/pmu_mon.py: -------------------------------------------------------------------------------- 1 | #!/home/zeduoyu2/anaconda3/bin/python3 2 | 3 | import os 4 | import subprocess 5 | import time 6 | import sys 7 | import signal 8 | from queue import Queue, Empty 9 | from threading import Thread 10 | 11 | class pmu_metric: 12 | def __init__(self, nodes=["Backend_Bound.Memory_Bound.DRAM_Bound.MEM_Latency", "Backend_Bound.Memory_Bound.L1_Bound", 13 | "Backend_Bound.Memory_Bound.DRAM_Bound.MEM_Bandwidth"], output_path="result/pmu_tools") -> None: 14 | 15 | self.node_list = nodes 16 | self.node_names = ','.join(nodes) 17 | self.output_path = output_path 18 | self.numaRatio_cmd = "sudo sysctl -w vm.numa_tier_interleave={ratio}" 19 | self.numaRatioTop_cmd = "sudo sysctl -w vm.numa_tier_interleave_top={top}" 20 | self.numaRatioBot_cmd = "sudo sysctl -w vm.numa_tier_interleave_bot={bottom}" 21 | self.toplev_cmd = ["sudo", "/home/yans3/pmu-tools/toplev", "-x,", "-o", "{filename}", "--no-desc", "-I", "1000", "-v", "--nodes", 22 | "!" + self.node_names] 23 | self.toplev_realtime_cmd = ["sudo", "/home/yans3/pmu-tools/toplev", "--no-desc", "-I", "1000", "-v", 24 | "--nodes", "!" + self.node_names] 25 | # self.stats = {"L1_Bound":[], "BW_Bound":[], "Lat_Bound":[]} 26 | self.stats = {} 27 | for node in self.node_list: 28 | self.stats[node] = [] 29 | 30 | if not os.path.exists(self.output_path): 31 | os.mkdir(self.output_path) 32 | 33 | 34 | def set_ratio(self, top:int, bot:int) -> None: 35 | 36 | print("[INFO] Configuring NUMA interleave ratio to %d:%d (DRAM:CXL)"%(top, bot)) 37 | cmd = self.numaRatioTop_cmd.format(top=top) 38 | print("[COMMAND]", cmd) 39 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE, shell=True, text=True) 40 | ret = p.wait() 41 | if ret != 0: 42 | sys.exit(ret) 43 | out = p.communicate()[0] 44 | print("[RETURN] Output:", out) 45 | 46 | cmd = self.numaRatioBot_cmd.format(bottom=bot) 47 | print("[COMMAND]", cmd) 48 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE, shell=True, text=True) 49 | ret = p.wait() 50 | if ret != 0: 51 | sys.exit(ret) 52 | out = p.communicate()[0] 53 | print("[RETURN] Output:", out) 54 | 55 | 56 | 57 | def start_recording(self, filename:str = "pmu_result.csv") -> None: 58 | 59 | # define the SIGINT handler 60 | def signal_handler(sig, frame): 61 | os.killpg(os.getpgid(self.p_toplev.pid), signal.SIGINT) 62 | print('[INFO] You pressed Ctrl+C!') 63 | sys.exit(0) 64 | 65 | signal.signal(signal.SIGINT, signal_handler) 66 | print('[INFO] Press Ctrl+C to exit') 67 | 68 | assert filename.endswith('.csv') 69 | file_path = os.path.join(self.output_path, filename) 70 | cmd = self.toplev_cmd 71 | cmd[4] = file_path 72 | print("[COMMAND]", cmd) 73 | self.p_toplev = subprocess.Popen(cmd, text=True, preexec_fn=os.setsid) 74 | 75 | 76 | def stop_recording(self) -> None: 77 | # self.fp.close() 78 | os.killpg(os.getpgid(self.p_toplev.pid), signal.SIGINT) 79 | print("[INFO] PMU monitoring stopped.") 80 | 81 | 82 | def run_realtime(self, interval=1000, print_info=True) -> None: 83 | 84 | # define the SIGINT handler 85 | def signal_handler(sig, frame): 86 | 
os.killpg(os.getpgid(self.p_toplev.pid), signal.SIGINT) 87 | print('[INFO] You pressed Ctrl+C!') 88 | sys.exit(0) 89 | 90 | #signal.signal(signal.SIGINT, signal_handler) 91 | print('[INFO] Press Ctrl+C to exit') 92 | 93 | cmd = self.toplev_realtime_cmd 94 | cmd[4] = str(interval) 95 | print("[COMMAND]", cmd) 96 | self.p_toplev = subprocess.Popen(cmd, text=True, stderr=subprocess.PIPE, preexec_fn=os.setsid) 97 | 98 | # define and start the recording threads 99 | def enqueue_output(stdout, queue): 100 | for line in stdout: 101 | queue.put(line) 102 | stdout.close() 103 | 104 | q_toplev = Queue() 105 | t_toplev = Thread(target=enqueue_output, args=(self.p_toplev.stderr, q_toplev)) 106 | t_toplev.daemon = True 107 | t_toplev.start() 108 | 109 | # define and start the parsing threads 110 | def catch_output(q:Queue): 111 | while(True): 112 | try: 113 | line = q.get_nowait() # or q.get(timeout=.1) 114 | except Empty: 115 | time.sleep(interval / 1000) # tune to 0.5 just in case 116 | else: # got line 117 | if line.isspace(): continue 118 | line = line.split() 119 | try: 120 | float(line[0]) 121 | except ValueError: 122 | continue 123 | 124 | node = line[2] 125 | val = line[5] 126 | 127 | if print_info: 128 | print("[RESULT] {node_name:<60} {value:<4} %".format(node_name=node, value=val)) 129 | 130 | self.stats[node].append(val) 131 | 132 | t_catch_latency = Thread(target=catch_output, args=[q_toplev]) 133 | t_catch_latency.daemon = True 134 | t_catch_latency.start() 135 | 136 | if print_info: 137 | while(True): 138 | time.sleep(1) 139 | 140 | 141 | def get_stat(self, window_size:int=5) -> dict: 142 | 143 | res = {} 144 | 145 | # length = len(self.stats["L1_Bound"]) 146 | # if length >= 1: 147 | # series = np.array(self.stats["L1_Bound"][-min(length,window_size):]).astype('float') 148 | # res["L1_Bound"] = series.mean() 149 | 150 | # length = len(self.stats["BW_Bound"]) 151 | # if length >= 1: 152 | # series = np.array(self.stats["BW_Bound"][-min(length,window_size):]).astype('float') 153 | # res["BW_Bound"] = series.mean() 154 | 155 | # length = len(self.stats["Lat_Bound"]) 156 | # if length >= 1: 157 | # series = np.array(self.stats["Lat_Bound"][-min(length,window_size):]).astype('float') 158 | # res["Lat_Bound"] = series.mean() 159 | 160 | for node in self.node_list: 161 | length = len(self.stats[node]) 162 | if length >= 1: 163 | series = np.array(self.stats[node][-min(length,window_size):]).astype('float') 164 | res[node] = series.mean() 165 | 166 | return res 167 | 168 | 169 | if __name__ == "__main__": 170 | 171 | node_names = ["Backend_Bound.Memory_Bound.DRAM_Bound.MEM_Latency", "Backend_Bound.Memory_Bound.L1_Bound", 172 | "Backend_Bound.Memory_Bound.DRAM_Bound.MEM_Bandwidth", "Backend_Bound.Memory_Bound", 173 | "Backend_Bound.Memory_Bound.L2_Bound", "Backend_Bound.Memory_Bound.L3_Bound"] 174 | 175 | pmu = pmu_metric(node_names) 176 | # pmu.set_ratio(10,20) 177 | # pmu.start_recording("dlrm.csv") 178 | pmu.run_realtime(print_info=False) 179 | while(True): 180 | print(pmu.get_stat()) 181 | # print(pmu.stats) 182 | time.sleep(1) 183 | -------------------------------------------------------------------------------- /memo_ae/README.md: -------------------------------------------------------------------------------- 1 | # memo benchmark 2 | 3 | ## Setup 4 | ### Prerequisite 5 | - Must 6 | - Installing cpupower,turbostat 7 | - `sudo apt-get install -y linux-tools-$(uname -r)` 8 | - libnuma installation 9 | - `sudo apt-get install libnuma-dev` 10 | 11 | ### Clone & Build 12 | ```bash 13 | git 
clone https://github.com/ece-fast-lab/cxl_type3_tests.git 14 | cd memo_ae/src 15 | make 16 | ``` 17 | 18 | ### Get Turbo stat 19 | ```bash 20 | # under memo_ae 21 | cd ./test_cxl/ 22 | bash get_turbostat.sh 23 | ``` 24 | 25 | ### Setup `env.sh` 26 | ```bash 27 | # Use your favorite editor to open env.sh 28 | vim ../util_scripts/env.sh 29 | ``` 30 | 1. Set `CLOSEST_NODE`: The NUMA node that the CXL device is directly attached to. 31 | - Command `sudo lspci -vvv` should also show the NUMA node that a CXL device attaches to. 32 | 2. Set `CLOSEST_CORE`: This is one of the CPU cores within `CLOSEST_NODE`. 33 | - Place the first core number of the `CLOSEST_NODE` in `CLOSEST_CORE`. You may find the core range of a NUMA node using this command: `numactl -H`. 34 | - For example, if CXL is connected to NUMA node 1, please place the first CPU in `node 1 cpus: XX, XX+1` (XX) in `CLOSEST_CORE`. 35 | 3. Set `TSC_FREQ`: 36 | - This should come from the `Get Turbo stat` output in `results/turbostat.txt`; it should look like the sample below. 37 | 4. Set `NODE_MAX` 38 | - For a machine with two sockets and one CXL node, this should be set to "2" 39 | - For a machine with one socket and one CXL node, this should be set to "1" 40 | ``` 41 | Core CPU Avg_MHz Busy% Bzy_MHz TSC_MHz IPC IRQ SMI POLL C1 C1E C3 C6 POLL% C1% C1E% C3% C6% CPU%c1 CPU%c3 CPU%c6 CoreTmp CoreThr PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 PkgWatt RAMWatt PKG_% RAM_% 42 | - - 10 0.57 1753 2000 0.52 985 0 14 15 108 0 945 0.00 0.12 0.40 0.00 98.93 2.23 0.00 97.20 22 0 25 2.14 0.00 76.94 25.52 0.00 0.00 0.00 43 | 0 0 4 0.25 1802 2000 1.02 29 0 0 0 0 0 34 0.00 0.00 0.00 0.00 99.76 0.93 0.00 98.83 22 0 25 2.14 0.00 76.95 25.52 0.00 0.00 0.00 44 | ``` 45 | **Note:** 46 | 1. There should be a constant number for all cores, 2000 MHz in the example above. Please set `TSC_FREQ` (unit = MHz) to 2000 if the number is 2000. 47 | 2. In most systems, this should also be the 6th number in the second row of `results/turbostat.txt` 48 | 49 | ## Notes 50 | * memo is only tested on AVX-enabled machines 51 | * Single-op latency (`-T 0`) has much higher absolute values than block access latency (`-T 3`). 52 | * Block access defaults to issuing 16 accesses with randomly hard-coded offsets within a 64KB region. 53 | - You may play with `generate_random_inst.py` to generate a new set of random offsets. 54 | - You may change the MACROs in `src/workload.h`, `BLOCK_xN` and `*_xN_RAND_AVX512`, to see how the number of parallel issues affects the average latency of each access. 55 | * Setting the `-F` argument is critical for all latency measures. `-F` is not used in bandwidth tests. 56 | * Latency tests should always use `-t 1` for the thread count argument. 57 | 58 | ## Known issues 59 | * Random pointer chasing is NOT implemented, i.e. running with `-T 2` with `-r` 60 | - Testing with pointer chasing should always pin to a core with `-p `, which defaults to running a sequential linked-list chase with prefetching OFF on `` 61 | * The `-R` and `-o 5` options for read-write ratio are experimental. 62 | 63 | ## Arguments 64 | 65 | | Argument | Brief description | Default | Valid inputs | Note | 66 | | -------- | ----------------- | ------- | ------------ | ---- | 67 | | t | Number of testing threads | 32 | 1 - X | This should be set to 1 for all latency tests | 68 | | f | Prefetching enabled | disabled | -f | When `-p` is not specified, prefetch is NOT toggled.
When `-p` is specified, prefetching defaults to disabled | 69 | | m | Total buffer size in bytes | 2^30 | 32-bit integer | Anything larger than 2^30 should use the `-S` argument | 70 | | S | Total buffer size in GiB | 1 GiB | 1 - total memory size on a NUMA node | / | 71 | | n | Buffer NUMA node | 0 | 0 - (Number of NUMA nodes - 1) | When `-o 4` is specified, this argument is used as the source buffer of the move operation | 72 | | d | Buffer NUMA node (destination) | 0 | 0 - (Number of NUMA nodes - 1) | When `-o 4` is specified, this argument is used as the destination buffer of the move operation. Otherwise, this argument is ignored. | 73 | | s | Stall Ratio | 0 | 0 - X | This argument is used in the bandwidth test, where each block of accesses is accompanied by `-s` stall blocks. A stall block consists of 6 x 16 x 16 x 4 = 6144 `nop` instructions. | 74 | | i | Iteration | 1 | 1 - X | For bandwidth tests, each iteration monitors the number of bytes accessed across all threads in 0.5 seconds. For latency tests, each iteration is a single op / single block of accesses. For the pointer chasing test, each iteration chases through all cachelines in the specified buffer size. | 75 | | T | Type of operation | 0 | 0 - 3 |
  • 0 = single-op latency
  • 1 = bandwidth
  • 2 = pointer chasing
  • 3 = block-access latency| 76 | | p | Pin to core | -1 (do not pin to core) | 0 - number of cores | Pinning to a core affects argument `-f` | 77 | | a | Two-core bandwidth test (core a) | -1 | 0 - number of cores | When running the two-core bandwidth test, `-a` specifies the first core a thread should pin to. The prefetching policy is aligned with `-p`.| 78 | | b | Two-core bandwidth test (core b) | -1 | 0 - number of cores | (Same as `-a`; this pins the second thread)| 79 | | g | Bandwidth test access block size | 512 | 16 - (per-thread buffer size / 64) | The MACROs for bandwidth tests all cover 1024 bytes, so the smallest stepping is 16 cachelines | 80 | | r | Random bandwidth test | disabled | -r | This argument makes the start of each block of accesses advance in a somewhat random fashion; however, accesses within a block remain sequential | 81 | | o | Operation | 0 (Load) | 0 - 4 |
    • 0 = Load
    • 1 = NT-load
    • 2 = Store
    • 3 = NT-store
    • 4 = movdir64B (only in bandwidth tests)| 82 | | B | Flush before block access latency test | NOT flushed | -B | Used only in `-T 3`, this argument decides whether the 64KB region to be accessed will be flushed. | 83 | | C | Number of `nop` blocks before block access latency test | 0 | 0 - X | Used only in `-T 3`, this argument decides how many blocks of `nop` should be issued after the cacheline flushes (if there are any) and before the test begins. | 84 | | F | TSC frequency | 2GHz | X | This value should **always** be present for any latency tests. Please refer to the turbostat section for determining this value. | 85 | 86 | 87 | ## Other profiling 88 | Under `./test_cxl/` 89 | #### Block access (fast) < 5 min 90 | ``` 91 | bash test_block_access_latency.sh 92 | ``` 93 | 94 | #### Ptr chasing (fast) < 5 min 95 | ``` 96 | bash test_ptr_chase.sh 97 | ``` 98 | 99 | #### Single operation latency (fast) < 5 min 100 | ``` 101 | bash test_single_op_latency.sh 102 | ``` 103 | 104 | #### `movdir64B` bandwidth (long long) > 15 min 105 | ``` 106 | bash test_movdir_bw.sh 107 | ``` 108 | 109 | #### Sequential access bandwidth (long long) > 15 min 110 | ``` 111 | bash test_seq_bw.sh 112 | ``` 113 | 114 | #### Random access bandwidth (long long) > 30 min 115 | ``` 116 | bash test_rand_bw.sh 117 | ``` 118 | 119 | ## Results 120 | All results are under the `results` folder. 121 | 122 | 123 | ## Acknowledgement 124 | Some parts of this source code and the methodology are inspired by the marvelous work in this [publication (FAST20-Yang)](https://www.usenix.org/conference/fast20/presentation/yang) and this [repository (OptaneStudy)](https://github.com/NVSL/OptaneStudy/tree/master). 125 | 126 | -------------------------------------------------------------------------------- /caption_ae/metrics/pcm_mon.py: -------------------------------------------------------------------------------- 1 | #!/home/zeduoyu2/anaconda3/bin/python3 2 | 3 | import os 4 | import subprocess 5 | import time 6 | import sys 7 | import signal 8 | from queue import Queue, Empty 9 | from threading import Thread 10 | import re 11 | 12 | PCM_PATH = "/home/yans3/AE_root/pcm/build/bin/" 13 | 14 | class pcm_metric: 15 | def __init__(self) -> None: 16 | 17 | self.pcmLat_cmd = ["sudo", PCM_PATH + "pcm-latency"] 18 | self.pcmBw_cmd = ["sudo", PCM_PATH + "pcm-memory"] 19 | self.pcmAll_cmd = ["sudo", PCM_PATH + "pcm", "-nc", "-ns"] 20 | 21 | self.stats = { 22 | "pcm_l1miss":[], 23 | "pcm_ddrReadLat":[], 24 | "pcm_norm_ipc":[], 25 | "pcm_ipc":[]} 26 | 27 | def signal_handler(sig, frame): 28 | os.killpg(os.getpgid(self.p_latency.pid), signal.SIGINT) 29 | os.killpg(os.getpgid(self.p_all.pid), signal.SIGINT) 30 | print('[INFO] You pressed Ctrl+C!') 31 | sys.exit(0) 32 | 33 | #signal.signal(signal.SIGINT, signal_handler) 34 | print('[INFO] Press Ctrl+C to exit') 35 | 36 | def get_stat(self, window_size:int=5): 37 | res = {} 38 | # Take mean of last N samples 39 | for k, v in self.stats.items(): 40 | length = len(v) 41 | if length >= 1: 42 | last_n = v[-min(length,window_size):] 43 | res[k] = sum(last_n) / len(last_n) 44 | return res 45 | 46 | def run_realtime(self, print_info=True) -> None: 47 | 48 | def signal_handler(sig, frame): 49 | os.killpg(os.getpgid(self.p_latency.pid), signal.SIGINT) 50 | os.killpg(os.getpgid(self.p_all.pid), signal.SIGINT) 51 | print('[INFO] You pressed Ctrl+C!') 52 | sys.exit(0) 53 | 54 | #signal.signal(signal.SIGINT, signal_handler) 55 | print('[INFO] Press Ctrl+C to exit') 56 | 57 | # Start the PCM processes 58 |
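        # pcm-latency and pcm (-nc -ns) are launched below as long-running child processes in
        # their own process groups (preexec_fn=os.setsid), so they can be stopped with killpg.
        # Reader threads drain their stdout into queues, and the parser threads defined further
        # down regex-match each periodic report block to append L1 miss latency, DDR read
        # latency, and (normalized) IPC samples into self.stats.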
print("[COMMAND]", self.pcmLat_cmd) 59 | self.p_latency = subprocess.Popen(self.pcmLat_cmd, stdout=subprocess.PIPE, text=True, preexec_fn=os.setsid) 60 | time.sleep(5) 61 | 62 | print("[COMMAND]", self.pcmAll_cmd) 63 | self.p_all = subprocess.Popen(self.pcmAll_cmd, stdout=subprocess.PIPE, text=True, preexec_fn=os.setsid) 64 | 65 | 66 | # define and start the recording threads 67 | def enqueue_output(stdout, queue): 68 | for line in stdout: 69 | queue.put(line) 70 | stdout.close() 71 | 72 | q_latency = Queue() 73 | q_all = Queue() 74 | 75 | t_latency = Thread(target=enqueue_output, args=(self.p_latency.stdout, q_latency)) 76 | t_all = Thread(target=enqueue_output, args=(self.p_all.stdout, q_all)) 77 | 78 | t_latency.daemon = True 79 | t_all.daemon = True 80 | 81 | t_latency.start() 82 | t_all.start() 83 | 84 | 85 | def catch_output_all_cal(q:Queue): 86 | lines = "" 87 | result_arr = [] 88 | while(True): 89 | try: 90 | line = q.get_nowait() 91 | except Empty: 92 | time.sleep(1) # tune to 0.5 just in case 93 | else: 94 | lines += line 95 | if line == "---------------------------------------------------------------------------------------------------------------\n": 96 | norm_ipc_cal = 0 97 | ipc_cal = 0 98 | valid_cnt = 0 99 | 100 | for norm_ipc, ipc in result_arr: 101 | # XXX, hack -- norm_ipc > 0.1 kind of indicates the core is running something 102 | if True or norm_ipc > 0.1: 103 | norm_ipc_cal += norm_ipc 104 | ipc_cal += ipc 105 | valid_cnt += 1 106 | 107 | if valid_cnt > 0: 108 | norm_ipc_cal = norm_ipc_cal / valid_cnt 109 | ipc_cal = ipc_cal / valid_cnt 110 | print('valid_cnt:', valid_cnt) 111 | 112 | result_arr = [] 113 | self.stats["pcm_norm_ipc"].append(norm_ipc_cal) 114 | self.stats["pcm_ipc"].append(ipc_cal) 115 | 116 | else: 117 | line_arr = line.split() 118 | if len(line_arr) < 5: 119 | continue 120 | elif line_arr[0].isdigit(): 121 | norm_ipc = float(line_arr[2]) 122 | ipc = float(line_arr[3]) 123 | result_arr.append((norm_ipc, ipc)) 124 | ''' 125 | if int(line_arr[0]) > 7: 126 | result_arr.append((norm_ipc, ipc)) 127 | ''' 128 | 129 | # define and start the parsing threads 130 | def catch_output_all(q:Queue): 131 | lines = "" 132 | pattern_norm_ipc = re.compile(r"Instructions per nominal CPU cycle: ([0-9]+\.[0-9]+)") 133 | pattern_ipc = re.compile(r" PHYSICAL CORE IPC : ([0-9]+\.[0-9]+)") 134 | while(True): 135 | try: 136 | line = q.get_nowait() 137 | except Empty: 138 | time.sleep(1) # tune to 0.5 just in case 139 | else: 140 | lines += line 141 | if line == "---------------------------------------------------------------------------------------------------------------\n": 142 | matches = pattern_norm_ipc.findall(lines) 143 | if matches: 144 | if print_info: print("norm IPC: " + matches[0]) 145 | self.stats["pcm_norm_ipc"].append(float(matches[0])) 146 | 147 | matches = pattern_ipc.findall(lines) 148 | if matches: 149 | if print_info: print("IPC: " + matches[0]) 150 | self.stats["pcm_ipc"].append(float(matches[0])) 151 | lines = "" 152 | 153 | def catch_output_latency(q:Queue): 154 | lines = "" 155 | while(True): 156 | try: 157 | line = q.get_nowait() 158 | except Empty: 159 | time.sleep(1) # tune to 0.5 just in case 160 | else: # got line 161 | lines += line 162 | if line == "-----------------------------------------------------------------------------\n": 163 | # q_out.put(lines) 164 | pattern = re.compile(r"L1 Cache Miss Latency\(ns\) \[Adding 5 clocks for L1 Miss\]\n+Socket0: ([0-9]+\.[0-9]+)") 165 | matches = pattern.findall(lines) 166 | if matches: 167 | if print_info: 
print("[RESULT] L1 Miss Latency:" + matches[0]) 168 | self.stats["pcm_l1miss"].append(float(matches[0])) 169 | 170 | pattern = re.compile(r"DDR read Latency\(ns\)\nSocket0: ([0-9]+\.[0-9]+)\s*") 171 | matches = pattern.findall(lines) 172 | if matches: 173 | if print_info: print("[RESULT] DDR Read Latency:" + matches[0]) 174 | self.stats["pcm_ddrReadLat"].append(float(matches[0])) 175 | 176 | lines = "" 177 | 178 | t_catch_latency = Thread(target=catch_output_latency, args=[q_latency]) 179 | t_catch_all = Thread(target=catch_output_all, args=[q_all]) 180 | 181 | t_catch_latency.daemon = True 182 | t_catch_all.daemon = True 183 | 184 | t_catch_latency.start() 185 | t_catch_all.start() 186 | 187 | if print_info: 188 | while(True): 189 | time.sleep(1) # fixed BUG: used to be 'pass', which cause the utilization to be 100% 190 | 191 | 192 | if __name__ == "__main__": 193 | pcm = pcm_metric() 194 | pcm.run_realtime(print_info=False) 195 | while(True): 196 | print(pcm.get_stat()) 197 | time.sleep(1) 198 | -------------------------------------------------------------------------------- /memo_ae/src/util.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Developed by FAST Lab @ ECE-UIUC -- 2022-2023 3 | */ 4 | #include "util.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #define MAX_NUM_THREAD 128 14 | #define MAX_BUF_GB 16 15 | #define MAX_NUMA_NODE 10 16 | #define MAX_SKIP_BYTE 1024 17 | #define PREFETCH_REG_ADDR 0x1A4 18 | #define MAX_CORE_NUM 63 19 | #define FLUSH_SIZE (512 * (1 << 20)) // MB 20 | #define TSC_FREQ_GHZ 2.0 21 | 22 | char* help_str = " Usage: \n" \ 23 | "-t Number of threads.\n" \ 24 | "-f enable prefetching (when -p is not specified, prefetch is NOT toggled.) (when -p is specified, default to prefetching disabled)\n" \ 25 | "-m buffer size, in byte (!!! only with 32-bit int).\n" \ 26 | "-S buffer size, in GB.\n" \ 27 | "-n NUMA node, if op {0,1,2,3}; SRC node if op {4}.\n" \ 28 | "-d NUMA node, DST node, only used if op {4}.\n" \ 29 | "-s stall ratio -- 5 for 1:5 ratio in op:stall in bandwidth tests.\n" \ 30 | "-i number of iteration. For BW it means how many times we probe all threads.\n" \ 31 | "-T 0 stands for Latency, 1 stands for Bandwidth, 2 stands for pointer tracing, 3 stands for block access latency.\n" \ 32 | "-p Pin to cores starting at core X, default -- do not pin to core (let the Linux scheduler decides).\n" \ 33 | "-a/b pin to core a and core b. -a and -b must be used at the same time\n" \ 34 | "-g Bandwidth granularity -- batch size per each workload call, in the unit of 64B.\n" \ 35 | "-r random access in bandwidth tests (sequential by default) \n" \ 36 | "-o 0 - Read, 1 - Read Non-temporal, 2 - Write, 3 - Write Non-temporal, 4 - movdir64B, 5 - Read/Write Mixed.\n" \ 37 | "-R If chose operation as Read/Write Mixed, this argument is used to specify the read ratio. Example: 20:80.\n" \ 38 | "-B flush 64KB of data block during block latency test (default to 0)\n" \ 39 | "-C number of clear pipeline block in block latency tests (default to 0)\n" \ 40 | "-F TSC_Freq, used for calculating cycle --> ns. Unit = MHz; default to (2000MHz). 
Please check with turbostat"; 41 | 42 | void set_default_cfg(test_cfg_t* cfg) { 43 | cfg->op = READ; 44 | cfg->type = BW; 45 | cfg->num_thread = 32; 46 | cfg->total_buf_size = (1 << 30); 47 | cfg->buf_a_numa_node = 0; // src 48 | cfg->buf_b_numa_node = 0; // dst 49 | cfg->op_iter = 1; 50 | cfg->per_thread_size = cfg->total_buf_size / cfg->num_thread; 51 | cfg->starting_core = -1; 52 | cfg->random = false; 53 | cfg->prefetch_en = false; 54 | cfg->stall_ratio = 0; 55 | cfg->bw_granu = 512; 56 | cfg->core_a = -1; 57 | cfg->core_b = -1; 58 | cfg->read_ratio = 1; 59 | cfg->flush_block = 0; 60 | cfg->num_clear_pipe = 0; 61 | cfg->tsc_freq = TSC_FREQ_GHZ; 62 | } 63 | 64 | void print_cfg(test_cfg_t* cfg) { 65 | 66 | fprintf (stdout, "==========================\n"); 67 | fprintf (stdout, "num_thread: %lu\n", cfg->num_thread); 68 | fprintf (stdout, "total_buf_size: %lu\n", cfg->total_buf_size); 69 | fprintf (stdout, "buf_a_numa_node:%d\n", cfg->buf_a_numa_node); 70 | fprintf (stdout, "buf_b_numa_node:%d\n", cfg->buf_b_numa_node); 71 | fprintf (stdout, "per_thread_size:%ld\n", cfg->per_thread_size); 72 | fprintf (stdout, "op_iter: %d\n", cfg->op_iter); 73 | fprintf (stdout, "type: %d\n", cfg->type); 74 | fprintf (stdout, "op: %d\n", cfg->op); 75 | fprintf (stdout, "starting_core: %d\n", cfg->starting_core); 76 | fprintf (stdout, "random: %d\n", cfg->random); 77 | fprintf (stdout, "stall_ratio: %d\n", cfg->stall_ratio); 78 | fprintf (stdout, "bw_granu: %d\n", cfg->bw_granu); 79 | fprintf (stdout, "core_a: %d\n", cfg->core_a); 80 | fprintf (stdout, "core_b: %d\n", cfg->core_b); 81 | fprintf (stdout, "flush_block: %d\n", cfg->flush_block); 82 | fprintf (stdout, "num_clear_pipe: %d\n", cfg->num_clear_pipe); 83 | fprintf (stdout, "tsc_freq (GHz): %f\n", cfg->tsc_freq); 84 | fprintf (stdout, "==========================\n"); 85 | } 86 | 87 | 88 | 89 | int parse_arg(int argc, char*argv[], test_cfg_t* cfg) { 90 | int opt; 91 | int num; 92 | int read; 93 | int write; 94 | set_default_cfg(cfg); 95 | 96 | // TODO, parse arg for operation / type 97 | while ((opt = getopt(argc, argv, "F:C:p:a:b:t:m:S:n:d:s:i:g:T:o:R:rhfB")) != -1) { 98 | switch (opt) { 99 | case 'F': 100 | num = atoi(optarg); 101 | cfg->tsc_freq = (double)num / (double)(1000.0); 102 | break; 103 | case 'C': 104 | num = atoi(optarg); 105 | cfg->num_clear_pipe = num; 106 | break; 107 | case 'B': 108 | cfg->flush_block = 1; 109 | break; 110 | case 'a': 111 | num = atoi(optarg); 112 | if (num < 0 || num > MAX_CORE_NUM) { 113 | fprintf (stderr, "Can't start a from core: %d\n", num); 114 | return -1; 115 | } 116 | cfg->core_a = num; 117 | break; 118 | case 'b': 119 | num = atoi(optarg); 120 | if (num < 0 || num > MAX_CORE_NUM) { 121 | fprintf (stderr, "Can't start b from core: %d\n", num); 122 | return -1; 123 | } 124 | cfg->core_b = num; 125 | break; 126 | case 'p': 127 | num = atoi(optarg); 128 | if (num > MAX_CORE_NUM || num < 0) { 129 | fprintf (stderr, "Can't start from core: %d\n", num); 130 | return -1; 131 | } 132 | cfg->starting_core = num; 133 | break; 134 | case 't': 135 | num = atoi(optarg); 136 | if (num > MAX_NUM_THREAD) { 137 | fprintf (stderr, "Can't have more than %d threads, %d\n", MAX_NUM_THREAD, num); 138 | return -1; 139 | } else { 140 | cfg->num_thread = num; 141 | } 142 | break; 143 | case 'm': 144 | num = atoi(optarg); 145 | cfg->total_buf_size = num; 146 | break; 147 | 148 | case 'S': 149 | num = atoi(optarg); 150 | if (num > MAX_BUF_GB) { 151 | fprintf (stderr, "Can't have more than %d GB buf, %d\n", MAX_BUF_GB, num); 152 | 
return -1; 153 | } else { 154 | cfg->total_buf_size = ((uint64_t)num << 30); 155 | } 156 | break; 157 | 158 | case 'n': 159 | num = atoi(optarg); 160 | if (num < 0 || num > MAX_NUMA_NODE) { 161 | fprintf (stderr, "NUMA node out of range (0, %d): %d\n", MAX_NUMA_NODE, num); 162 | return -1; 163 | } else { 164 | cfg->buf_a_numa_node = num; 165 | } 166 | break; 167 | 168 | case 'd': 169 | num = atoi(optarg); 170 | if (num < 0 || num > MAX_NUMA_NODE) { 171 | fprintf (stderr, "NUMA node out of range (0, %d): %d\n", MAX_NUMA_NODE, num); 172 | return -1; 173 | } else { 174 | cfg->buf_b_numa_node = num; 175 | } 176 | break; 177 | 178 | case 's': 179 | num = atoi(optarg); 180 | if (num < 0) { 181 | fprintf (stderr, "stall ratio must be greater than 0, found: %d\n", num); 182 | return -1; 183 | } else { 184 | cfg->stall_ratio = num; 185 | } 186 | break; 187 | 188 | case 'i': 189 | num = atoi(optarg); 190 | if (num < 0) { 191 | fprintf (stderr, "iteration count must be positive: %d\n", num); 192 | return -1; 193 | } else { 194 | cfg->op_iter = num; 195 | } 196 | break; 197 | 198 | case 'T': 199 | num = atoi(optarg); 200 | if(num < 0 || num > 3){ 201 | fprintf(stderr, "type must be 0(latency clflush), 1(bandwidth), 2(pointer chasing), 3(block latency).\n"); 202 | return -1; 203 | } else { 204 | cfg->type = num; 205 | } 206 | break; 207 | 208 | case 'o': 209 | num = atoi(optarg); 210 | if(num < 0 || num > 5){ 211 | fprintf(stderr, "operation must be 0(read), 1(read non-temporal), 2(write), 3(write non-temporal), 4(movdir64B) or 5(mix RW).\n"); 212 | return -1; 213 | } else { 214 | cfg->op = num; 215 | } 216 | break; 217 | 218 | case 'R': 219 | sscanf(optarg, "%d:%d", &read, &write); 220 | if (read <= 0 || write <= 0) { 221 | fprintf(stderr, "Read/Write ratio cannot be negative numbers!\n"); 222 | } else { 223 | cfg->read_ratio = read / write; 224 | } 225 | break; 226 | 227 | case 'g': 228 | num = atoi(optarg); 229 | cfg->bw_granu = num; 230 | break; 231 | 232 | case 'r': 233 | cfg->random = true; 234 | break; 235 | 236 | case 'f': 237 | cfg->prefetch_en = true; 238 | break; 239 | 240 | case 'h': 241 | fprintf (stdout, "%s\n", help_str); 242 | return -2; 243 | break; 244 | 245 | case '?': 246 | fprintf (stderr, "Option -%c requires an argument.\n", optopt); 247 | return -1; 248 | break; 249 | 250 | default: 251 | fprintf (stderr, "default, %c, abort\n", optopt); 252 | return -1; 253 | abort(); 254 | } 255 | } 256 | 257 | if (cfg->core_a * cfg->core_b < 0) { 258 | fprintf (stderr, "found core_a: %d, core_b: %d, please set them accordingly\n", cfg->core_a, cfg->core_b); 259 | return -1; 260 | } 261 | 262 | cfg->per_thread_size = cfg->total_buf_size / cfg->num_thread; 263 | uint64_t calculated_buf_size = cfg->per_thread_size * cfg->num_thread; 264 | printf("cal: %lu vs total: %lu\n", calculated_buf_size, cfg->total_buf_size); 265 | 266 | if (calculated_buf_size != cfg->total_buf_size) { 267 | // reset per thread size to 2^12 byte aligned (avoid AVX run out of addresss) 268 | cfg->per_thread_size &= 0xFFFFFFFFFFFFF000; 269 | } 270 | 271 | // optind is for the extra arguments 272 | // which are not parsed 273 | for(; optind < argc; optind++){ 274 | printf("extra arguments: %s\n", argv[optind]); 275 | } 276 | 277 | print_cfg(cfg); 278 | return 0; 279 | } 280 | 281 | // This function returns the NUMA node that a pointer address resides on. 
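// Implementation note: move_pages() is invoked with a NULL target-node array, so the kernel
// does not migrate anything; it only reports, via the status array, the NUMA node on which
// each page of the buffer currently resides. The loop below then checks that every page is
// on the same node and returns that node id.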
282 | int get_node(void *p, uint64_t size) 283 | { 284 | int* status; 285 | void** page_arr; 286 | unsigned long page_size; 287 | unsigned long page_cnt; 288 | int ret; 289 | char* start_addr; 290 | 291 | page_size = (unsigned long)getpagesize(); 292 | page_cnt = (size / page_size); 293 | status = malloc(page_cnt * sizeof(int)); 294 | page_arr = malloc(page_cnt * sizeof(char*)); 295 | start_addr = (char*)p; 296 | 297 | fprintf(stdout, "[get_node] buf: %lx, page_size: %ld, page_cnt: %ld\n", (uint64_t)(p), page_size, page_cnt); 298 | 299 | for (unsigned long i = 0; i < page_cnt; i++) { 300 | page_arr[i] = start_addr; 301 | if (i < page_cnt) { 302 | start_addr = &(start_addr[page_size]); 303 | } 304 | } 305 | 306 | 307 | ret = move_pages(0, page_cnt, page_arr, NULL, status, 0); 308 | if (ret != 0) { 309 | fprintf(stderr, "Problem in %s line %d calling move_pages(), ret = %d\n", __FILE__,__LINE__, ret); 310 | printf("%s\n", strerror(errno)); 311 | } 312 | 313 | ret = status[0]; 314 | for (uint64_t i = 0; i < page_cnt; i++) { 315 | if (ret != status[i]) { 316 | fprintf(stderr, "found page: %lu on node: %d, different from node: %d\n", i, status[i], ret); 317 | ret = status[i]; 318 | break; 319 | } 320 | } 321 | 322 | if (ret == status[0]) { 323 | fprintf(stdout, "all pages: %lx, %lx ... are on node: %d\n", (uint64_t)(page_arr[0]), (uint64_t)(page_arr[1]), ret); 324 | } 325 | 326 | free(page_arr); 327 | free(status); 328 | return ret; 329 | } 330 | 331 | int init_buf(uint64_t size, int node, char** alloc_ptr) { 332 | char *ptr; 333 | int ret; 334 | unsigned long page_size; 335 | uint64_t page_cnt; 336 | uint64_t idx; 337 | 338 | if ((ptr = (char *)numa_alloc_onnode(size, node)) == NULL) { 339 | fprintf(stderr,"Problem in %s line %d allocating memory\n",__FILE__,__LINE__); 340 | return -1; 341 | } 342 | printf("[INFO] done alloc. Next, touch all pages\n"); 343 | // alloc is only ready when accessed 344 | page_size = (unsigned long)getpagesize(); 345 | page_cnt = (size / page_size); 346 | idx = 0; 347 | for (uint64_t i = 0; i < page_cnt; i++) { 348 | ptr[idx] = 0; 349 | idx += page_size; 350 | } 351 | printf("[INFO] done touching pages. 
Next, validate on node X\n"); 352 | ret = get_node(ptr, size); 353 | if (ret != node) { 354 | printf("ptr is on node %d, but expect node %d\n", ret, node); 355 | return -2; 356 | } 357 | printf("ptr is on node %d\n", ret); 358 | printf("allocated: %luMB\n", (size >> 20)); 359 | 360 | *alloc_ptr = ptr; 361 | 362 | return 0; 363 | } 364 | 365 | uint64_t read_MSR(int cpu){ 366 | int fd; 367 | uint64_t data; 368 | char msr_file_name[64]; 369 | 370 | sprintf(msr_file_name, "/dev/cpu/%d/msr", cpu); 371 | fd = open(msr_file_name, O_RDONLY); 372 | 373 | if (fd < 0) { 374 | if (errno == ENXIO) { 375 | fprintf(stderr, "rdmsr: No CPU %d\n", cpu); 376 | exit(2); 377 | } else if (errno == EIO) { 378 | fprintf(stderr, "rdmsr: CPU %d doesn't support MSRs\n", 379 | cpu); 380 | exit(3); 381 | } else { 382 | perror("rdmsr: open"); 383 | exit(127); 384 | } 385 | } 386 | 387 | if (pread(fd, &data, sizeof data, PREFETCH_REG_ADDR) != sizeof data) { 388 | if (errno == EIO) { 389 | fprintf(stderr, "rdmsr: CPU %d cannot read ", cpu); 390 | exit(4); 391 | } else { 392 | perror("rdmsr: pread"); 393 | exit(127); 394 | } 395 | } 396 | 397 | close(fd); 398 | 399 | return data; 400 | } 401 | 402 | void write_MSR(int cpu, uint64_t val){ 403 | int fd; 404 | char msr_file_name[64]; 405 | 406 | sprintf(msr_file_name, "/dev/cpu/%d/msr", cpu); 407 | fd = open(msr_file_name, O_WRONLY); 408 | 409 | if (fd < 0) { 410 | if (errno == ENXIO) { 411 | fprintf(stderr, "rdmsr: No CPU %d\n", cpu); 412 | exit(2); 413 | } else if (errno == EIO) { 414 | fprintf(stderr, "rdmsr: CPU %d doesn't support MSRs\n", 415 | cpu); 416 | exit(3); 417 | } else { 418 | perror("rdmsr: open"); 419 | exit(127); 420 | } 421 | } 422 | 423 | if (pwrite(fd, &val, sizeof(val), PREFETCH_REG_ADDR) != sizeof(val)){ 424 | if (errno == EIO) { 425 | fprintf(stderr, 426 | "wrmsr: CPU %d cannot set MSR ", cpu); 427 | exit(4); 428 | } else { 429 | perror("wrmsr: pwrite"); 430 | exit(127); 431 | } 432 | } 433 | 434 | close(fd); 435 | 436 | return; 437 | } 438 | 439 | void disable_prefetch(int cpu){ 440 | uint64_t val; 441 | val = read_MSR(cpu); 442 | write_MSR(cpu, val | 0xF); 443 | val = read_MSR(cpu); 444 | printf(YEL "[INFO]" RESET " CPU %d prefetch disabled. Now at 0x1A4: %lx\n", cpu, val); 445 | } 446 | 447 | void enable_prefetch(int cpu){ 448 | uint64_t val; 449 | val = read_MSR(cpu); 450 | write_MSR(cpu, val & 0xFFFFFFFFFFFFFFF0); 451 | printf(YEL "[INFO]" RESET " CPU %d prefetch enabled.\n", cpu); 452 | } 453 | 454 | // taken from https://stackoverflow.com/questions/1046714/what-is-a-good-random-number-generator-for-a-game 455 | static uint64_t y=362436069, z=521288629; 456 | uint64_t xorshf96(uint64_t* xx) { //period 2^96-1 457 | uint64_t t; 458 | uint64_t x = *xx; 459 | x ^= x << 16; 460 | x ^= x >> 5; 461 | x ^= x << 1; 462 | 463 | t = x; 464 | x = y; 465 | y = z; 466 | z = t ^ x ^ y; 467 | *xx = x; 468 | 469 | return z; 470 | } 471 | 472 | // can't use WBINVD 473 | // https://stackoverflow.com/questions/1756825/how-can-i-do-a-cpu-cache-flush-in-x86-windows 474 | // alloc large bue and read/write 475 | void flush_all_cache() { 476 | char* buf; 477 | printf(YEL "[INFO]" RESET " Flushing cache, with %d MB access ... \n", FLUSH_SIZE >> 20); 478 | 479 | buf = malloc(FLUSH_SIZE); 480 | for (int j = 0; j < 2; j++) { 481 | for (int i = 0; i < FLUSH_SIZE; i++) { 482 | buf[i] = i + 1; // make sure this is not optimized 483 | } 484 | } 485 | free(buf); 486 | printf(YEL "[INFO]" RESET " Cache flush done ... 
\n"); 487 | } 488 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. 
This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 
176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 
340 | -------------------------------------------------------------------------------- /memo_ae/src/workload.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Developed by FAST Lab @ ECE-UIUC -- 2022-2023 3 | * Some part of this file follows the methodology of FAST-20 Yang's resporitory 4 | * @ https://github.com/NVSL/OptaneStudy/tree/master 5 | */ 6 | #include "workload.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #define MIN_GRANULARITY 512 15 | 16 | // change me to use different size of AVX 17 | //#define SIZENTLD_MACRO SIZENTLD_512_AVX512 18 | #define SIZENTLD_MACRO SIZENTLD_1024_AVX512 19 | #define SIZELD_MACRO SIZELD_1024_AVX512 20 | #define SIZEST_MACRO SIZEST_1024_AVX512 21 | #define SIZEMOV_MACRO SIZEMOV_1024 22 | //#define SIZEST_MACRO SIZEST_WB_1024_AVX512 23 | 24 | 25 | /** 26 | * op_ntld 27 | * @brief Load the given size data from the memory with non-temporal hint. 28 | * @param addr the load start address 29 | * @param size the size of the memory we want to access (in byte) 30 | * @return none 31 | */ 32 | void op_ntld(char* addr, long size){ 33 | /* by default we perform load in 512 byte granularity */ 34 | /* sanity check */ 35 | if(size < MIN_GRANULARITY){ 36 | fprintf(stderr, RED "[ERROR]" RESET "op_ntld(): buffer size is smaller than %d byte.", MIN_GRANULARITY); 37 | exit(1); 38 | } 39 | /* round down to MIN_GRANULARITY */ 40 | size = size - (size % MIN_GRANULARITY); 41 | 42 | asm volatile( 43 | "mov %[addr], %%r9 \n" 44 | "xor %%r10, %%r10 \n" 45 | "LOOP_NTLD: \n" 46 | SIZENTLD_MACRO 47 | "cmp %[size], %%r10 \n" 48 | "jl LOOP_NTLD \n" 49 | : /* output */ 50 | :[size]"r"(size), [addr]"r"(addr) /* input */ 51 | :"%r9", "%r10" /* clobbered register */ 52 | ); 53 | } 54 | 55 | /** 56 | * op_ld 57 | * @brief Load the given size data from the memory with non-temporal hint. 58 | * @param addr the load start address 59 | * @param size the size of the memory we want to access (in byte) 60 | * @return none 61 | */ 62 | void op_ld(char* addr, long size){ 63 | /* by default we perform load in 512 byte granularity */ 64 | /* sanity check */ 65 | if(size < MIN_GRANULARITY){ 66 | fprintf(stderr, RED "[ERROR]" RESET "op_ld(): buffer size is smaller than %d byte.", MIN_GRANULARITY); 67 | exit(1); 68 | } 69 | /* round down to MIN_GRANULARITY*/ 70 | size = size - (size % MIN_GRANULARITY); 71 | 72 | asm volatile( 73 | "mov %[addr], %%r9 \n" 74 | "xor %%r10, %%r10 \n" 75 | "LOOP_LD: \n" 76 | SIZELD_MACRO 77 | "cmp %[size], %%r10 \n" 78 | "jl LOOP_LD \n" 79 | : /* output */ 80 | :[size]"r"(size), [addr]"r"(addr) /* input */ 81 | :"%r9", "%r10", ZMM_0_15 /* clobbered register */ 82 | ); 83 | } 84 | 85 | /** 86 | * op_ntst 87 | * @brief Store the given size data to the memory with non-temporal hint. 
88 | * @param addr the store start address 89 | * @param size the size of the memory we want to store (in byte) 90 | * @return none 91 | */ 92 | void op_ntst(char* addr, long size){ 93 | /* by default we perform load in 512 byte granularity */ 94 | /* sanity check */ 95 | if(size < MIN_GRANULARITY){ 96 | fprintf(stderr, RED "[ERROR]" RESET "op_ntst(): buffer size is smaller than %d byte.", MIN_GRANULARITY); 97 | exit(1); 98 | } 99 | /* round down to MIN_GRANULARITY*/ 100 | size = size - (size % MIN_GRANULARITY); 101 | 102 | asm volatile( 103 | "mov %[addr], %%r9 \n" 104 | "xor %%r10, %%r10 \n" 105 | "LOOP_NTST: \n" 106 | SIZENTST_1024_AVX512 107 | "cmp %[size], %%r10 \n" 108 | "jl LOOP_NTST \n" 109 | "sfence \n" 110 | : /* output */ 111 | :[size]"r"(size), [addr]"r"(addr) /* input */ 112 | :"%r9", "%r10", ZMM_0_15 /* clobbered register */ 113 | ); 114 | } 115 | 116 | /** 117 | * op_st 118 | * @brief Store the given size data to the memory with non-temporal hint. 119 | * @param addr the store start address 120 | * @param size the size of the memory we want to store (in byte) 121 | * @return none 122 | */ 123 | void op_st(char* addr, long size){ 124 | /* by default we perform load in 512 byte granularity */ 125 | /* sanity check */ 126 | if(size < MIN_GRANULARITY){ 127 | fprintf(stderr, RED "[ERROR]" RESET "op_st(): buffer size is smaller than %d byte.", MIN_GRANULARITY); 128 | exit(1); 129 | } 130 | /* round down to MIN_GRANULARITY*/ 131 | size = size - (size % MIN_GRANULARITY); 132 | 133 | asm volatile( 134 | "mov %[addr], %%r9 \n" 135 | "xor %%r10, %%r10 \n" 136 | "LOOP_ST: \n" 137 | SIZEST_MACRO 138 | "cmp %[size], %%r10 \n" 139 | "jl LOOP_ST \n" 140 | : /* output */ 141 | :[size]"r"(size), [addr]"r"(addr) /* input */ 142 | :REGISTERS, "%r10" /* clobbered register */ 143 | ); 144 | } 145 | 146 | /** 147 | * op_mixed 148 | * @brief Store the given size data to the memory with non-temporal hint. 
149 | * @param addr the store start address 150 | * @param size the total size of the memory we want to operate (in byte) 151 | * @param ratio the read to write ratio of memory accesses 152 | * @return none 153 | */ 154 | void op_mixed(char* addr, long size, int ratio){ 155 | /* by default we perform load in 512 byte granularity */ 156 | /* sanity check */ 157 | if(size < 384){ 158 | fprintf(stderr, RED "[ERROR]" RESET "op_mix(): buffer size is smaller than 384 byte."); 159 | exit(1); 160 | } 161 | /* round down to 512 */ 162 | // size = size - (size % 512); 163 | 164 | switch (ratio) 165 | { 166 | case 1: // 1R:1W 167 | asm volatile( 168 | "mov %[addr], %%r9 \n" 169 | "xor %%r10, %%r10 \n" 170 | "LOOP_MIXED1: \n" 171 | SIZE_R1W1_512 172 | "cmp %[size], %%r10 \n" 173 | "jl LOOP_MIXED1 \n" 174 | : /* output */ 175 | :[size]"r"(size), [addr]"r"(addr) /* input */ 176 | :REGISTERS, "%r10" /* clobbered register */ 177 | ); 178 | break; 179 | 180 | case 2: // 2R:1W 181 | asm volatile( 182 | "mov %[addr], %%r9 \n" 183 | "xor %%r10, %%r10 \n" 184 | "LOOP_MIXED2: \n" 185 | // SIZE_R2W1_384 186 | SIZE_R2W1_576 187 | "cmp %[size], %%r10 \n" 188 | "jl LOOP_MIXED2 \n" 189 | : /* output */ 190 | :[size]"r"(size), [addr]"r"(addr) /* input */ 191 | :REGISTERS, "%r10" /* clobbered register */ 192 | ); 193 | break; 194 | case 3: // 3R:1W 195 | asm volatile( 196 | "mov %[addr], %%r9 \n" 197 | "xor %%r10, %%r10 \n" 198 | "LOOP_MIXED3: \n" 199 | SIZE_R3W1_512 200 | "cmp %[size], %%r10 \n" 201 | "jl LOOP_MIXED3 \n" 202 | : /* output */ 203 | :[size]"r"(size), [addr]"r"(addr) /* input */ 204 | :REGISTERS, "%r10" /* clobbered register */ 205 | ); 206 | break; 207 | 208 | default: 209 | fprintf(stderr, RED "[ERROR]" RESET "op_mix(): Invalid RW ratio."); 210 | exit(1); 211 | break; 212 | } 213 | 214 | 215 | } 216 | 217 | /** 218 | * op_stall 219 | * @brief stall the core by issuing nop 220 | */ 221 | 222 | void op_stall() { 223 | asm volatile( 224 | CLEAR_PIPELINE 225 | CLEAR_PIPELINE 226 | CLEAR_PIPELINE 227 | CLEAR_PIPELINE 228 | : 229 | : 230 | : 231 | ); 232 | } 233 | 234 | /** 235 | * op_movdir64B 236 | * @brief Store the given size data to the memory with non-temporal hint. 237 | 238 | * @param addr the store start address 239 | * @param size the size of the memory we want to store (in byte) 240 | * @return none 241 | */ 242 | void op_movdir64B(char* src_addr, char* dst_addr, long size) { 243 | /* by default we perform load in 512 byte granularity */ 244 | /* sanity check */ 245 | if(size < MIN_GRANULARITY){ 246 | fprintf(stderr, RED "[ERROR]" RESET "op_st(): buffer size is smaller than %d byte.", MIN_GRANULARITY); 247 | exit(1); 248 | } 249 | /* round down to MIN_GRANULARITY */ 250 | size = size - (size % MIN_GRANULARITY); 251 | asm volatile( 252 | "mov %[src_addr], %%r9 \n" 253 | "mov %[dst_addr], %%r12 \n" 254 | "xor %%r10, %%r10 \n" 255 | "LOOP_MOV: \n" 256 | SIZEMOV_MACRO 257 | "cmp %[size], %%r10 \n" 258 | "jl LOOP_MOV \n" 259 | "sfence \n" 260 | : /* output */ 261 | :[size]"r"(size), [src_addr]"r"(src_addr), [dst_addr]"r"(dst_addr)/* input */ 262 | :REGISTERS, "%r10", "%r11", "%r12" /* clobbered register */ 263 | ); 264 | } 265 | 266 | /** 267 | * op_ntld_32B_lat 268 | * @brief measure the latency of loading 32 bytes with non-temporal hint 269 | * @param addr the memory address from where we load the 32 bytes 270 | * @return the time elapsed during load process. In unit of CPU cycles. 
271 | */ 272 | uint64_t op_ntld_32B_lat(char* addr){ 273 | uint64_t t_start = 0, t_end = 0; 274 | 275 | /* make sure the addr is 32 byte aligned */ 276 | addr = (char*)((uint64_t)addr & (~0x1F)); 277 | 278 | asm volatile( 279 | "mov %[addr], %%rsi\n" 280 | "mfence\n" 281 | FLUSH_CACHE_LINE 282 | TIMING_BEGIN 283 | "vmovntdqa 0*32(%%rsi), %%ymm0 \n" 284 | TIMING_END 285 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 286 | :[addr] "r" (addr) 287 | :REGISTERS 288 | ); 289 | 290 | return (t_end - t_start); 291 | } 292 | 293 | /** 294 | * op_ntld_64B_lat 295 | * @brief measure the latency of loading 64 bytes with non-temporal hint 296 | * @param addr the memory address from where we load the 64 bytes 297 | * @return the time elapsed during load process. In unit of CPU cycles. 298 | */ 299 | uint64_t op_ntld_64B_lat(char* addr){ 300 | uint64_t t_start = 0, t_end = 0; 301 | 302 | /* make sure address is 64byte aligned (what will happen if not?) */ 303 | addr = (char*)((uint64_t)addr & (~0x3F)); 304 | 305 | asm volatile( 306 | "mov %[addr], %%rsi\n" 307 | "mfence\n" 308 | FLUSH_CACHE_LINE 309 | TIMING_BEGIN 310 | "vmovntdqa 0*32(%%rsi), %%ymm0 \n" 311 | "vmovntdqa 1*32(%%rsi), %%ymm1 \n" 312 | TIMING_END 313 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 314 | :[addr] "r" (addr) 315 | :REGISTERS 316 | ); 317 | 318 | return (t_end - t_start); 319 | } 320 | 321 | 322 | /** 323 | * op_ntst_64B_lat 324 | * @brief measure the latency of storing 64 bytes with non-temporal hint 325 | * @param addr the memory address from where we store the 64 bytes 326 | * @return the time elapsed during store process. In unit of CPU cycles. 327 | */ 328 | uint64_t op_ntst_64B_lat(char* addr){ 329 | uint64_t t_start = 0, t_end = 0; 330 | 331 | /* make sure address is 64byte aligned (what will happen if not?) */ 332 | addr = (char*)((uint64_t)addr & (~0x3F)); 333 | 334 | asm volatile( 335 | "mov %[addr], %%rsi\n" 336 | "mfence\n" 337 | FLUSH_CACHE_LINE 338 | CLEAR_PIPELINE 339 | TIMING_BEGIN 340 | "vmovntpd %%ymm0, 0*32(%%rsi) \n" 341 | "vmovntpd %%ymm1, 1*32(%%rsi) \n" 342 | 343 | TIMING_END 344 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 345 | :[addr] "r" (addr) 346 | :REGISTERS 347 | ); 348 | 349 | return (t_end - t_start); 350 | } 351 | 352 | /** 353 | * op_ld_64B_lat 354 | * @brief measure the latency of loading 64 bytes without non-temporal hint 355 | * @param addr the memory address from where we load the 64 bytes 356 | * @return the time elapsed during load process. In unit of CPU cycles. 357 | */ 358 | uint64_t op_ld_64B_lat(char* addr){ 359 | uint64_t t_start = 0, t_end = 0; 360 | 361 | /* make sure address is 64byte aligned (what will happen if not?) */ 362 | addr = (char*)((uint64_t)addr & (~0x3F)); 363 | 364 | asm volatile( 365 | "mov %[addr], %%rsi\n" 366 | "mfence\n" 367 | FLUSH_CACHE_LINE 368 | CLEAR_PIPELINE 369 | TIMING_BEGIN 370 | "vmovdqa 0*32(%%rsi), %%ymm0 \n" 371 | "vmovdqa 1*32(%%rsi), %%ymm1 \n" 372 | TIMING_END 373 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 374 | :[addr] "r" (addr) 375 | :REGISTERS 376 | ); 377 | 378 | return (t_end - t_start); 379 | } 380 | 381 | /** 382 | * op_st_64B_lat 383 | * @brief measure the latency of storing 64 bytes without non-temporal hint 384 | * @param addr the memory address from where we store the 64 bytes 385 | * @return the time elapsed during store process. In unit of CPU cycles. 
386 | */ 387 | uint64_t op_st_64B_lat(char* addr){ 388 | uint64_t t_start = 0, t_end = 0; 389 | 390 | /* make sure address is 64byte aligned (what will happen if not?) */ 391 | addr = (char*)((uint64_t)addr & (~0x3F)); 392 | 393 | asm volatile( 394 | "mov %[addr], %%rsi\n" 395 | "mfence\n" 396 | FLUSH_CACHE_LINE 397 | CLEAR_PIPELINE 398 | TIMING_BEGIN 399 | "vmovdqa %%ymm0, 0*32(%%rsi) \n" 400 | "vmovdqa %%ymm0, 1*32(%%rsi) \n" 401 | //"sfence \n" 402 | TIMING_END 403 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 404 | :[addr] "r" (addr) 405 | :REGISTERS 406 | ); 407 | 408 | return (t_end - t_start); 409 | } 410 | 411 | /** 412 | * op_st_cl_flush_64B_lat 413 | * @brief measure the latency of storing 64 bytes & flushing the cacheline, without non-temporal hint 414 | * @param addr the memory address from where we store the 64 bytes 415 | * @return the time elapsed during store process. In unit of CPU cycles. 416 | */ 417 | uint64_t op_st_cl_flush_64B_lat(char* addr){ 418 | uint64_t t_start = 0, t_end = 0; 419 | 420 | /* make sure address is 64byte aligned (what will happen if not?) */ 421 | addr = (char*)((uint64_t)addr & (~0x3F)); 422 | 423 | asm volatile( 424 | "mov %[addr], %%rsi\n" 425 | "mfence\n" 426 | CLEAR_PIPELINE 427 | TIMING_BEGIN 428 | "vmovdqa %%ymm0, 0*32(%%rsi) \n" 429 | "vmovdqa %%ymm0, 1*32(%%rsi) \n" 430 | "clwb 0*32(%%rsi) \n" 431 | TIMING_END 432 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 433 | :[addr] "r" (addr) 434 | :REGISTERS 435 | ); 436 | return (t_end - t_start); 437 | } 438 | 439 | /** 440 | * op_st_32B_lat 441 | * @brief measure the latency of storing 32 bytes without non-temporal hint 442 | * @param addr the memory address from where we store the 32 bytes 443 | * @return the time elapsed during store process. In unit of CPU cycles. 444 | */ 445 | uint64_t op_st_32B_lat(char* addr){ 446 | uint64_t t_start = 0, t_end = 0; 447 | 448 | /* make sure address is 32 byte aligned (what will happen if not?) 
*/ 449 | addr = (char*)((uint64_t)addr & (~0x1F)); 450 | 451 | asm volatile( 452 | "mov %[addr], %%rsi\n" 453 | "mfence\n" 454 | FLUSH_CACHE_LINE 455 | TIMING_BEGIN 456 | "vmovdqa %%ymm0, 0*32(%%rsi) \n" 457 | TIMING_END 458 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 459 | :[addr] "r" (addr) 460 | :REGISTERS 461 | ); 462 | 463 | return (t_end - t_start); 464 | } 465 | 466 | uint64_t op_ptr_chase(char* addr, uint64_t num_chase_block) { 467 | uint64_t t_start = 0, t_end = 0; 468 | asm volatile( 469 | "mov %[addr], %%r11 \n" 470 | "xor %%r10, %%r10 \n" 471 | TIMING_BEGIN 472 | 473 | "LOOP_CHASE: \n" 474 | "mov (%%r11), %%r11 \n" 475 | "inc %%r10 \n" 476 | "cmp %[num_chase_block], %%r10 \n" 477 | "jl LOOP_CHASE \n" 478 | 479 | TIMING_END 480 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 481 | :[addr] "r" (addr), [num_chase_block] "r" (num_chase_block) 482 | :REGISTERS, "%r10", "%r11" 483 | ); 484 | return (t_end - t_start); 485 | } 486 | 487 | uint64_t op_stwb_block_lat(char* addr, bool flush_block, long num_clear_pipe) { 488 | uint64_t t_start = 0, t_end = 0; 489 | //assume 64KB buff 490 | asm volatile( 491 | "mov %[addr], %%r11 \n" 492 | "xor %%r10, %%r10 \n" 493 | 494 | "cmp $0x0, %[flush_block] \n" 495 | "je LOOP_BLOCK_STWB_FLUSH_DONE \n" 496 | "LOOP_BLOCK_STWB_FLUSH: \n" 497 | "clflush (%%r11, %%r10) \n" 498 | "add $0x40, %%r10 \n" 499 | "cmp $0x10000, %%r10 \n" 500 | "jl LOOP_BLOCK_STWB_FLUSH \n" 501 | "xor %%r10, %%r10 \n" 502 | "mfence \n" 503 | 504 | "LOOP_BLOCK_STWB_FLUSH_DONE: \n" 505 | 506 | "cmp %[num_clear_pipe], %%r10 \n" 507 | "je LOOP_BLOCK_STWB_START \n" 508 | CLEAR_PIPELINE_x16 509 | "add $0x1, %%r10 \n" 510 | "jmp LOOP_BLOCK_STWB_FLUSH_DONE \n" 511 | 512 | "LOOP_BLOCK_STWB_START: \n" 513 | "xor %%r10, %%r10 \n" 514 | 515 | // Test 516 | TIMING_BEGIN 517 | STWB_xN_RAND_AVX512 518 | TIMING_END 519 | 520 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 521 | :[addr] "r" (addr), [flush_block] "r" (flush_block), [num_clear_pipe] "r" (num_clear_pipe) 522 | :REGISTERS, "%r10", "%r11", ZMM_0_15 523 | ); 524 | 525 | return (t_end - t_start); 526 | } 527 | 528 | uint64_t op_ld_block_lat(char* addr, bool flush_block, long num_clear_pipe) { 529 | uint64_t t_start = 0, t_end = 0; 530 | asm volatile( 531 | "mov %[addr], %%r11 \n" 532 | "xor %%r10, %%r10 \n" 533 | 534 | // flush data 535 | "cmp $0x0, %[flush_block] \n" 536 | "je LOOP_BLOCK_LD_FLUSH_DONE \n" 537 | "LOOP_BLOCK_LD_FLUSH: \n" 538 | "clflush (%%r11, %%r10) \n" 539 | "add $0x40, %%r10 \n" 540 | "cmp $0x10000, %%r10 \n" 541 | "jl LOOP_BLOCK_LD_FLUSH \n" 542 | "xor %%r10, %%r10 \n" 543 | "mfence \n" 544 | 545 | "LOOP_BLOCK_LD_FLUSH_DONE: \n" 546 | 547 | "cmp %[num_clear_pipe], %%r10 \n" 548 | "je LOOP_BLOCK_LD_START \n" 549 | CLEAR_PIPELINE_x16 550 | "add $0x1, %%r10 \n" 551 | "jmp LOOP_BLOCK_LD_FLUSH_DONE \n" 552 | 553 | "LOOP_BLOCK_LD_START: \n" 554 | "xor %%r10, %%r10 \n" 555 | 556 | // Test 557 | TIMING_BEGIN 558 | LD_xN_RAND_AVX512 559 | TIMING_END 560 | 561 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 562 | :[addr] "r" (addr), [flush_block] "r" (flush_block), [num_clear_pipe] "r" (num_clear_pipe) 563 | :REGISTERS, "%r10", "%r11", ZMM_0_15 564 | ); 565 | return (t_end - t_start); 566 | } 567 | 568 | uint64_t op_ntld_block_lat(char* addr, bool flush_block, long num_clear_pipe) { 569 | uint64_t t_start = 0, t_end = 0; 570 | asm volatile( 571 | "mov %[addr], %%r11 \n" 572 | "xor %%r10, %%r10 \n" 573 | 574 | // flush data 575 | "cmp $0x0, %[flush_block] \n" 576 | "je LOOP_BLOCK_NTLD_FLUSH_DONE \n" 577 | 
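        // flush loop: clflush the 64KB (0x10000-byte) block at a 64-byte
        // stride, then reset %r10 and mfence so the timed non-temporal
        // loads below start from memory rather than from the cache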
"LOOP_BLOCK_NTLD_FLUSH: \n" 578 | "clflush (%%r11, %%r10) \n" 579 | "add $0x40, %%r10 \n" 580 | "cmp $0x10000, %%r10 \n" 581 | "jl LOOP_BLOCK_NTLD_FLUSH \n" 582 | "xor %%r10, %%r10 \n" 583 | "mfence \n" 584 | 585 | "LOOP_BLOCK_NTLD_FLUSH_DONE: \n" 586 | 587 | "cmp %[num_clear_pipe], %%r10 \n" 588 | "je LOOP_BLOCK_NTLD_START \n" 589 | CLEAR_PIPELINE_x16 590 | "add $0x1, %%r10 \n" 591 | "jmp LOOP_BLOCK_NTLD_FLUSH_DONE \n" 592 | 593 | "LOOP_BLOCK_NTLD_START: \n" 594 | "xor %%r10, %%r10 \n" 595 | 596 | // Test 597 | TIMING_BEGIN 598 | NTLD_xN_RAND_AVX512 599 | TIMING_END 600 | 601 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 602 | :[addr] "r" (addr), [flush_block] "r" (flush_block), [num_clear_pipe] "r" (num_clear_pipe) 603 | :REGISTERS, "%r10", "%r11", ZMM_0_15 604 | ); 605 | return (t_end - t_start); 606 | } 607 | 608 | uint64_t op_ntst_block_lat(char* addr, bool flush_block, long num_clear_pipe) { 609 | uint64_t t_start = 0, t_end = 0; 610 | asm volatile( 611 | "mov %[addr], %%r11 \n" 612 | "xor %%r10, %%r10 \n" 613 | 614 | // flush data 615 | "cmp $0x0, %[flush_block] \n" 616 | "je LOOP_BLOCK_NTST_FLUSH_DONE \n" 617 | "LOOP_BLOCK_NTST_FLUSH: \n" 618 | "clflush (%%r11, %%r10) \n" 619 | "add $0x40, %%r10 \n" 620 | "cmp $0x10000, %%r10 \n" 621 | "jl LOOP_BLOCK_NTST_FLUSH \n" 622 | "xor %%r10, %%r10 \n" 623 | "mfence \n" 624 | 625 | "LOOP_BLOCK_NTST_FLUSH_DONE: \n" 626 | 627 | "cmp %[num_clear_pipe], %%r10 \n" 628 | "je LOOP_BLOCK_NTST_START \n" 629 | CLEAR_PIPELINE_x16 630 | "add $0x1, %%r10 \n" 631 | "jmp LOOP_BLOCK_NTST_FLUSH_DONE \n" 632 | 633 | "LOOP_BLOCK_NTST_START: \n" 634 | "xor %%r10, %%r10 \n" 635 | 636 | // Test 637 | TIMING_BEGIN 638 | NTST_xN_RAND_AVX512 639 | "sfence \n" 640 | TIMING_END 641 | 642 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 643 | :[addr] "r" (addr), [flush_block] "r" (flush_block), [num_clear_pipe] "r" (num_clear_pipe) 644 | :REGISTERS, "%r10", "%r11", ZMM_0_15 645 | ); 646 | return (t_end - t_start); 647 | } 648 | 649 | void set_all_zmm(char* addr) { 650 | asm volatile( 651 | "mov %[addr], %%r9 \n" 652 | "xor %%r10, %%r10 \n" 653 | SIZELD_MACRO 654 | "mfence\n" 655 | : /* output */ 656 | :[addr]"r"(addr) /* input */ 657 | :"%r9", "%r10", REGISTERS, ZMM_0_15 /* clobbered register */ 658 | ); 659 | } 660 | 661 | void dump_zmm(char* dst, uint64_t size) { 662 | char* data_buf; 663 | posix_memalign((void**)(&data_buf), 4096, 4096); 664 | for (int i = 0; i < 4096; i++) { 665 | data_buf[i] = 0; 666 | } 667 | asm volatile( 668 | "mov %[addr], %%r9 \n" 669 | "xor %%r10, %%r10 \n" 670 | SIZEST_MACRO 671 | "mfence\n" 672 | : /* output */ 673 | :[addr]"r"(data_buf) /* input */ 674 | :"%r9", "%r10", REGISTERS, ZMM_0_15 /* clobbered register */ 675 | ); 676 | for (int i = 0; i < 1024; i++) { 677 | if (i % 64 == 0) { 678 | printf("zmm%d ", i / 64); 679 | } 680 | printf("%x", (unsigned char)data_buf[i]); 681 | if (i % 64 == 63) { 682 | printf("\n"); 683 | } 684 | } 685 | if (dst != NULL) { 686 | uint64_t copy_size = size > 1024 ? 
1024 : size; 687 | memcpy(data_buf, dst, copy_size); 688 | } 689 | free(data_buf); 690 | } 691 | -------------------------------------------------------------------------------- /memo_ae/src/test.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Developed by FAST Lab @ ECE-UIUC -- 2022-2023 3 | */ 4 | #define _GNU_SOURCE 5 | 6 | #include "test.h" 7 | #include "util.h" 8 | #include "workload.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define WAIT_SEC_US 5000000 20 | 21 | #define US_TO_S 1000000 22 | 23 | #define SET_VAL 15 24 | 25 | #define PAGE_SIZE 4096 26 | 27 | //#define CHECK_NT_ST 28 | 29 | //#define DUMP_ZMM 30 | 31 | // ================================================= 32 | // zmm test functions 33 | // ================================================= 34 | /* 35 | * These functions were used for dumping the data in 36 | * the avx zmm registers. In zmm0, zmm1, and zmm2, there 37 | * are a few bytes stayed constant despite storing to it. 38 | * Hence the weird conditions in `check_buff` 39 | * Maybe because of this: https://stackoverflow.com/questions/41819514/why-do-sse-instructions-preserve-the-upper-128-bit-of-the-ymm-registers 40 | */ 41 | void set_data_buf(char* data_buf, uint64_t size) { 42 | fprintf(stdout, "[set_data_buf] \n"); 43 | for (uint64_t i = 0; i < size; i++) { 44 | data_buf[i] = i; 45 | } 46 | } 47 | 48 | void clear_buff(char* buff, uint64_t size) { 49 | fprintf(stdout, "[clear_buff]\n"); 50 | for (uint64_t i = 0; i < size; i++) { 51 | buff[i] = 0; 52 | } 53 | } 54 | 55 | void check_buff(char* buff, uint64_t size) { 56 | fprintf(stdout, "[check_buff]\n"); 57 | uint64_t error_cnt = 0; 58 | uint64_t correct_cnt = 0; 59 | 60 | char* truth_buf; 61 | posix_memalign((void**)(&truth_buf), PAGE_SIZE, PAGE_SIZE); 62 | dump_zmm(truth_buf, 1024); 63 | int mod; 64 | for (uint64_t i = 0; i < size; i++) { 65 | mod = i % 1024; 66 | if (mod <= 0xF) continue; 67 | if (mod >= 0x40 && mod <= 0x4F) continue; 68 | if (mod >= 0x80 && mod <= 0x8F) continue; 69 | if (mod >= 0x140 && mod <= 0x14F) continue; 70 | if (buff[i] != truth_buf[mod]) { 71 | fprintf(stdout, "[check_buff] buff[%lx] != truth, found %x, expect %x\n", i, (unsigned char)buff[i], (unsigned char)truth_buf[mod]); 72 | error_cnt++; 73 | } else { 74 | correct_cnt++; 75 | } 76 | if (error_cnt > 100) { 77 | fprintf(stdout, "[check_buff] before exit with 100 error, correct_cnt %ld\n", correct_cnt); 78 | return; 79 | } 80 | } 81 | free(truth_buf); 82 | if (error_cnt == 0) { 83 | fprintf(stdout, "[check_buff] all correct! correct_cnt: %ld\n", correct_cnt); 84 | } 85 | } 86 | 87 | 88 | // ================================================= 89 | // benchmark wrapping functions 90 | // ================================================= 91 | 92 | static volatile int keepRunning = 1; 93 | void stop_threads(test_cfg_t* cfg_arr) { 94 | int num_thread; 95 | num_thread = cfg_arr[0].num_thread; 96 | fprintf(stdout, "[stop_threads]\n"); 97 | 98 | for (int i = 0; i < num_thread; i++) { 99 | cfg_arr[i].halt = 1; 100 | } 101 | } 102 | 103 | void intHandler(int dummy) { 104 | fprintf(stdout, "Ctrl-c detected, %d\n", dummy); 105 | keepRunning = 0; 106 | } 107 | 108 | /** 109 | * get_bw 110 | * @brief Read the curr_op_cnt from each thread and calculate the sum every *delay* microsecond. 111 | * @param cfg_arr array of config. 112 | * @param iter how many times we probe the bandwidth. 
113 | * @param delay interval between probes in unit of us. 114 | * @return none 115 | */ 116 | void get_bw(test_cfg_t* cfg_arr, int iter, int delay) { 117 | int num_thread; 118 | num_thread = cfg_arr[0].num_thread; 119 | uint64_t prev_cnt, curr_cnt; 120 | prev_cnt = 0; 121 | 122 | for (int j = 0; j < iter; j++) { 123 | curr_cnt = 0; 124 | for (int i = 0; i < num_thread; i++) { 125 | curr_cnt += cfg_arr[i].curr_op_cnt; 126 | } 127 | // fprintf(stdout, GRN "[get_bw] " RESET "%.1f MB/sec\n", ((double)(curr_cnt - prev_cnt) / (double)((1 << 20) * (delay / 1000000)))); 128 | /* for easier processing */ 129 | fprintf(stdout, "[get_bw] %.1f MB/sec\n", ((double)(curr_cnt - prev_cnt) / (double)((1 << 20) * (delay / US_TO_S)))); 130 | prev_cnt = curr_cnt; 131 | if (keepRunning == 0) break; 132 | 133 | usleep(delay); 134 | } 135 | } 136 | 137 | /* 138 | * wrapping: 139 | * main 140 | * run_test 141 | * thread_wrapper 142 | * lats/bw_wrapper 143 | * operation 144 | */ 145 | 146 | // spawn thread 147 | int run_test(test_cfg_t* cfg) { 148 | pthread_t* thread_arr; 149 | test_cfg_t* cfg_arr; 150 | test_cfg_t* curr_cfg; 151 | int ret, num_thread; 152 | 153 | // just in case 154 | signal(SIGINT, intHandler); 155 | 156 | // alloc 157 | num_thread = cfg->num_thread; 158 | thread_arr = malloc(num_thread * sizeof(pthread_t)); 159 | cfg_arr = malloc(num_thread * sizeof(test_cfg_t)); 160 | memset(cfg_arr, 0, num_thread * sizeof(test_cfg_t)); 161 | 162 | 163 | // clear buff 164 | #ifdef CHECK_NT_ST 165 | clear_buff(cfg->buf_a, cfg->total_buf_size); 166 | if (cfg->op == MOV) { 167 | clear_buff(cfg->buf_b, cfg->total_buf_size); 168 | } 169 | #endif 170 | 171 | // launch thread 172 | for (int i = 0; i < num_thread; i++) { 173 | curr_cfg = &(cfg_arr[i]); 174 | memcpy(curr_cfg, cfg, sizeof(test_cfg_t)); 175 | 176 | curr_cfg->thread_idx = i; 177 | curr_cfg->halt = 0; 178 | curr_cfg->curr_op_cnt = 0; 179 | 180 | curr_cfg->start_addr_a = &(curr_cfg->buf_a[i * curr_cfg->per_thread_size]); 181 | if (cfg->op == MOV) { 182 | curr_cfg->start_addr_b = &(curr_cfg->buf_b[i * curr_cfg->per_thread_size]); 183 | } 184 | ret = pthread_create(&thread_arr[i], NULL, thread_wrapper, (void*)curr_cfg); 185 | } 186 | 187 | // monitor threads 188 | switch(cfg->type) { 189 | case LATS_CLFLUSH: 190 | // do nothing, latency is monitored within a single thread 191 | break; 192 | case BW: 193 | get_bw(cfg_arr, cfg->op_iter, WAIT_SEC_US); 194 | break; 195 | case LATS_CHASE: 196 | // do nothing, latency is monitored within a single thread 197 | break; 198 | case BLOCK_LATS: 199 | // do nothing, latency is monitored within a single thread 200 | break; 201 | default: 202 | fprintf(stderr, "unknown type, thread idx: %d\n", cfg->thread_idx); 203 | } 204 | 205 | if (cfg->type == BW) { 206 | stop_threads(cfg_arr); 207 | } 208 | 209 | // join threads 210 | for (int i = 0; i < num_thread; i++) { 211 | ret = pthread_join(thread_arr[i], NULL); 212 | } 213 | 214 | free(cfg_arr); 215 | free(thread_arr); 216 | return ret; 217 | } 218 | 219 | void print_lats(test_cfg_t* cfg, uint64_t min, uint64_t max, uint64_t sum, uint64_t num_chase_block) { 220 | uint64_t avg_cycle = sum / cfg->op_iter; 221 | printf(GRN "[RESULT]" RESET " Max latency: %.1f, Min latency: %.1f\n", 1.0*max/cfg->tsc_freq, 1.0*min/cfg->tsc_freq); 222 | printf(GRN "[RESULT]" RESET " Max cycle : %lu, Min cycle : %lu, Avg cycle: %lu\n", max, min, avg_cycle); 223 | printf(GRN "[RESULT]" RESET " Thread %d average latency among %d iterations: %.1fns (assume %fGHz)\n", cfg->thread_idx, \ 224 | 
cfg->op_iter, 1.0 / cfg->tsc_freq * avg_cycle, cfg->tsc_freq); 225 | if (num_chase_block != 0) { 226 | printf(RED "[RESULT]" RESET "chase/block_lats average cycle among %d iterations: %.1fcycles\n", cfg->op_iter, 1.0 * avg_cycle / num_chase_block); 227 | printf(RED "[RESULT]" RESET " chase/block_lats average latency among %d iterations: %.1fns (assume %fGHz)\n", cfg->op_iter, 1.0 / cfg->tsc_freq * avg_cycle / num_chase_block, cfg->tsc_freq); 228 | } 229 | } 230 | 231 | int comp(const void* elem1, const void* elem2){ 232 | uint64_t f = *((uint64_t*)elem1); 233 | uint64_t s = *((uint64_t*)elem2); 234 | if (f > s) return 1; 235 | if (f < s) return -1; 236 | return 0; 237 | } 238 | 239 | void print_lats_median(test_cfg_t* cfg, uint64_t* result) { 240 | qsort(result, cfg->op_iter, sizeof(*result), comp); 241 | uint64_t median = result[cfg->op_iter / 2]; 242 | printf(RED "[RESULT]" RESET " Median latency among %d iterations: %.1fns (assume %fGHz)\n", cfg->op_iter, 1.0 / cfg->tsc_freq * median, cfg->tsc_freq); 243 | } 244 | 245 | void init_ptr_buf_random(test_cfg_t* cfg) { 246 | // FIXME -- this is NOT implemented 247 | printf(YEL "[INFO]" RESET " Random pointer chasing is NOT implemented. Building the ptr array in order\n"); 248 | chase_t* curr_ptr; 249 | chase_t* next_ptr; 250 | uint64_t num_chase_block; 251 | 252 | curr_ptr = (chase_t*)cfg->start_addr_a; 253 | num_chase_block = cfg->total_buf_size / 64; 254 | 255 | for (uint64_t i = 0; i < num_chase_block - 1; i++) { 256 | next_ptr = &(curr_ptr[1]); 257 | curr_ptr->ptr_arr[0] = next_ptr; 258 | curr_ptr = next_ptr; 259 | } 260 | curr_ptr->ptr_arr[0] = (chase_t*)cfg->start_addr_a; 261 | } 262 | 263 | uint64_t init_ptr_buf(test_cfg_t* cfg) { 264 | 265 | chase_t* curr_ptr; 266 | chase_t* next_ptr; 267 | uint64_t num_chase_block; 268 | 269 | printf(YEL "[INFO]" RESET " building pointer chasing link list, block size: %ld bytes ... 
\n", sizeof(chase_t)); 270 | 271 | if (cfg->random) { 272 | init_ptr_buf_random(cfg); 273 | 274 | } else { 275 | curr_ptr = (chase_t*)cfg->start_addr_a; 276 | num_chase_block = cfg->total_buf_size / sizeof(chase_t); 277 | 278 | for (uint64_t i = 0; i < num_chase_block - 1; i++) { 279 | //for (uint64_t i = 0; i < 5; i++) { 280 | next_ptr = &(curr_ptr[1]); 281 | curr_ptr->ptr_arr[0] = next_ptr; 282 | curr_ptr = next_ptr; 283 | } 284 | curr_ptr->ptr_arr[0] = (chase_t*)cfg->start_addr_a; 285 | } 286 | 287 | printf(YEL "[INFO]" RESET " num blocks: %lu \n", num_chase_block); 288 | printf(YEL "[INFO]" RESET " Chase confirm: start_addr: 0x%lx, first chase addr: 0x%lx\n", 289 | (uint64_t)cfg->start_addr_a, 290 | (uint64_t)(&((chase_t*)(cfg->start_addr_a))->ptr_arr[0])); 291 | 292 | curr_ptr = (chase_t*)cfg->start_addr_a; 293 | printf(YEL "[INFO]" RESET " Chase confirm: next_addr: 0x%lx, second chase addr: 0x%lx\n", 294 | (uint64_t)(&curr_ptr[1]), 295 | (uint64_t)(((chase_t*)(cfg->start_addr_a))->ptr_arr[0])); 296 | 297 | return num_chase_block; 298 | } 299 | 300 | void set_prefetching(int starting_core, bool prefetch_en, int core_num) { 301 | if (starting_core >= 0) { 302 | if (prefetch_en) { 303 | enable_prefetch(core_num); 304 | } else { 305 | disable_prefetch(core_num); 306 | } 307 | } 308 | } 309 | 310 | void restore_prefetching(int starting_core, bool prefetch_en, int core_num) { 311 | if (starting_core >= 0) { 312 | // restore to enable prefetching 313 | if (!prefetch_en) { 314 | enable_prefetch(core_num); 315 | } 316 | } 317 | } 318 | 319 | 320 | void lats_chase_wrapper(test_cfg_t* cfg) { 321 | uint64_t result, latency_sum = 0; 322 | uint64_t min, max; 323 | uint64_t num_chase_block; 324 | int core_num = cfg->thread_idx + cfg->starting_core; 325 | 326 | if (cfg->start_addr_a == NULL) { 327 | printf(RED "[ERROR]" RESET " init_ptr_buf, found null buf addr\n"); 328 | return; 329 | } 330 | 331 | num_chase_block = init_ptr_buf(cfg); 332 | 333 | set_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 334 | 335 | cfg->op_iter += 1; // for warm up 336 | for (int i = 0; i < cfg->op_iter; i++) { 337 | switch (cfg->op) { 338 | default: 339 | result = op_ptr_chase(cfg->start_addr_a, num_chase_block); 340 | break; 341 | } 342 | if (i >= 1) { 343 | latency_sum += result; 344 | printf("result = %lu\n", result); 345 | } else { 346 | printf("warmup = %lu\n", result); 347 | } 348 | if (i == 1) { 349 | min = result; 350 | max = result; 351 | } else { 352 | if (min < result) min = result; 353 | if (max > result) max = result; 354 | } 355 | } 356 | 357 | restore_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 358 | 359 | cfg->op_iter -= 1; // remove warm up for average 360 | print_lats(cfg, min, max, latency_sum, num_chase_block); 361 | } 362 | 363 | /* 364 | * This function tests multi-operation latency. 365 | * The scheme here goes as: 366 | * flush cacheline 367 | * mfence 368 | * issue many nop 369 | * mark time1 370 | * issue X ops 371 | * mark time2 372 | * 373 | * In most cases, the latency goes down as more ops 374 | * are issued in parallel. 
375 | */ 376 | void block_lats_wrapper(test_cfg_t* cfg) { 377 | uint64_t result, latency_sum = 0; 378 | uint64_t min, max; 379 | int offset; /* measure the latency op_iter times and take average */ 380 | int core_num = cfg->thread_idx + cfg->starting_core; 381 | 382 | uint64_t* result_buff; 383 | result_buff = malloc(sizeof(uint64_t) * cfg->op_iter); 384 | 385 | set_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 386 | 387 | flush_all_cache(); 388 | for (int i = 0; i < cfg->op_iter; i++) { 389 | offset = rand() % cfg->total_buf_size & ~(0xFFFF); 390 | switch (cfg->op) { 391 | case READ: 392 | result = op_ld_block_lat(cfg->start_addr_a + offset, cfg->flush_block, cfg->num_clear_pipe); 393 | break; 394 | case READ_NT: 395 | result = op_ntld_block_lat(cfg->start_addr_a + offset, cfg->flush_block, cfg->num_clear_pipe); 396 | break; 397 | case WRITE: 398 | result = op_stwb_block_lat(cfg->start_addr_a + offset, cfg->flush_block, cfg->num_clear_pipe); 399 | break; 400 | case WRITE_NT: 401 | result = op_ntst_block_lat(cfg->start_addr_a + offset, cfg->flush_block, cfg->num_clear_pipe); 402 | break; 403 | default: 404 | printf(RED "[ERROR]" RESET "bad cfg->op\n"); 405 | goto out; 406 | } 407 | if (i == 0) { 408 | min = result; 409 | max = result; 410 | } else { 411 | min = (result < min) ? result : min; 412 | max = (result > max) ? result : max; 413 | } 414 | latency_sum += result; 415 | result_buff[i] = result / BLOCK_xN; 416 | } 417 | print_lats(cfg, min, max, latency_sum, BLOCK_xN); 418 | print_lats_median(cfg, result_buff); 419 | 420 | out: 421 | restore_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 422 | free(result_buff); 423 | } 424 | 425 | /* 426 | * This function tests single operation latency. 427 | * The scheme here goes as: 428 | * flush cacheline 429 | * issue many nop 430 | * mark time1 431 | * issue 1 op 432 | * mark time2 433 | * 434 | * In most cases, the latency here is very high, 435 | * and the actual interpretation of this latency 436 | * may vary. 437 | */ 438 | void lats_clflush_wrapper(test_cfg_t* cfg) { 439 | 440 | uint64_t result, latency_sum = 0; 441 | uint64_t min, max; 442 | int offset; /* measure the latency op_iter times and take average */ 443 | int core_num = cfg->thread_idx + cfg->starting_core; 444 | 445 | uint64_t* result_buff; 446 | result_buff = malloc(sizeof(uint64_t) * cfg->op_iter); 447 | 448 | set_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 449 | 450 | flush_all_cache(); 451 | 452 | switch (cfg->op) 453 | { 454 | case READ: 455 | for (int i = 0; i < cfg->op_iter; i++){ 456 | offset = rand() % cfg->total_buf_size; 457 | result = op_ld_64B_lat(cfg->start_addr_a + offset); 458 | latency_sum += result; 459 | if (i == 0){ 460 | min = result; 461 | max = result; 462 | }else{ 463 | min = (result < min) ? result : min; 464 | max = (result > max) ? result : max; 465 | } 466 | result_buff[i] = result; 467 | } 468 | break; 469 | 470 | case READ_NT: 471 | for (int i = 0; i < cfg->op_iter; i++){ 472 | offset = rand() % cfg->total_buf_size; 473 | result = op_ntld_64B_lat(cfg->start_addr_a + offset); 474 | latency_sum += result; 475 | if (i == 0){ 476 | min = result; 477 | max = result; 478 | }else{ 479 | min = (result < min) ? result : min; 480 | max = (result > max) ? 
result : max; 481 | } 482 | result_buff[i] = result; 483 | } 484 | break; 485 | 486 | case WRITE: 487 | for (int i = 0; i < cfg->op_iter; i++){ 488 | offset = rand() % cfg->total_buf_size; 489 | result = op_st_cl_flush_64B_lat(cfg->start_addr_a + offset); 490 | latency_sum += result; 491 | if (i == 0){ 492 | min = result; 493 | max = result; 494 | }else{ 495 | min = (result < min) ? result : min; 496 | max = (result > max) ? result : max; 497 | } 498 | result_buff[i] = result; 499 | } 500 | break; 501 | 502 | case WRITE_NT: 503 | for (int i = 0; i < cfg->op_iter; i++){ 504 | offset = rand() % cfg->total_buf_size; 505 | result = op_ntst_64B_lat(cfg->start_addr_a + offset); 506 | latency_sum += result; 507 | if (i == 0){ 508 | min = result; 509 | max = result; 510 | }else{ 511 | min = (result < min) ? result : min; 512 | max = (result > max) ? result : max; 513 | } 514 | result_buff[i] = result; 515 | } 516 | break; 517 | 518 | default: 519 | break; 520 | } 521 | print_lats(cfg, min, max, latency_sum, 0); 522 | print_lats_median(cfg, result_buff); 523 | 524 | restore_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 525 | 526 | free(result_buff); 527 | return; 528 | } 529 | 530 | void bw_wrapper(test_cfg_t* cfg) { 531 | const uint64_t fixed_step = cfg->bw_granu << 6; 532 | // random steps will be aligned by fixed steps 533 | const uint64_t align_mask = (~(fixed_step - 1)); 534 | // random steps will be multiple of fix steps 535 | const uint64_t step_bound_mask = ~(align_mask << 6); 536 | 537 | cfg->curr_op_cnt = 0; 538 | char* src = cfg->start_addr_a; 539 | char* dst = cfg->start_addr_b; 540 | uint64_t rand_offset = rand(); 541 | uint64_t curr_step = fixed_step; 542 | uint64_t counter = 0; 543 | int stall_cnt; 544 | int core_num = cfg->thread_idx + cfg->starting_core; 545 | int rw_ratio = cfg->read_ratio; // rw_ratio can be 1, 2, 3. 546 | int mixed_switch = 0; 547 | 548 | set_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 549 | 550 | if (cfg->op == MOV) { 551 | printf("src: 0x%lx, dst: 0x%lx\n", (uint64_t)src, (uint64_t)dst); 552 | } 553 | 554 | /* sanity check */ 555 | if (cfg->op == MIXED && rw_ratio == 2 && fixed_step != 384) { 556 | printf("[WARNING] You are using a BW_granu other than 384 for R2W1 mixed bandwidth test. 
Data will be skewed.\n"); 557 | } 558 | 559 | #ifdef DUMP_ZMM 560 | char* data_buf; 561 | 562 | // set all zmm 563 | posix_memalign((void**)(&data_buf), PAGE_SIZE, PAGE_SIZE); 564 | set_data_buf(data_buf, PAGE_SIZE); 565 | set_all_zmm(data_buf); 566 | dump_zmm(NULL, 0); 567 | free(data_buf); 568 | #endif 569 | 570 | 571 | while (1) { 572 | if(counter + fixed_step > cfg->per_thread_size){ 573 | //fprintf(stdout, "reach end, reset\n"); 574 | counter = 0; 575 | src = cfg->start_addr_a; 576 | dst = cfg->start_addr_b; 577 | } 578 | switch(cfg->op) { 579 | case WRITE: 580 | op_st(src, fixed_step); 581 | break; 582 | 583 | case WRITE_NT: 584 | op_ntst(src, fixed_step); 585 | break; 586 | 587 | case READ: 588 | op_ld(src, fixed_step); 589 | break; 590 | 591 | case READ_NT: 592 | op_ntld(src, fixed_step); 593 | break; 594 | 595 | case MOV: 596 | op_movdir64B(src, dst, fixed_step); 597 | break; 598 | 599 | case MIXED: 600 | // op_mixed(src, fixed_step, rw_ratio); 601 | if (mixed_switch == rw_ratio){ 602 | op_st(src, fixed_step); 603 | mixed_switch = 0; 604 | } else { 605 | op_ld(src, fixed_step); 606 | mixed_switch += 1; 607 | } 608 | break; 609 | 610 | default: 611 | fprintf(stderr, "unknown op, thread idx: %d\n", cfg->thread_idx); 612 | goto out; 613 | } 614 | // ==================================== 615 | // Stepping, rand/seq 616 | // ==================================== 617 | // increment number of byte operated on 618 | cfg->curr_op_cnt += fixed_step; 619 | 620 | /* update the address of interest */ 621 | if (cfg->random) { 622 | curr_step = (curr_step ^ xorshf96(&rand_offset)) & align_mask; 623 | curr_step &= step_bound_mask; 624 | //fprintf(stdout, "curr_step: 0x%lx\n", curr_step); 625 | } else { 626 | curr_step = fixed_step; 627 | } 628 | counter += curr_step; 629 | src += curr_step; 630 | dst += curr_step; 631 | 632 | // ==================================== 633 | // Stalling 634 | // ==================================== 635 | // create artificial stalling if desired 636 | stall_cnt = 0; 637 | while (stall_cnt < cfg->stall_ratio) { 638 | op_stall(); 639 | stall_cnt++; 640 | } 641 | 642 | if (cfg->halt) { 643 | fprintf(stdout, "thread idx: %d end\n", cfg->thread_idx); 644 | #ifdef CHECK_NT_ST 645 | if (cfg->op == WRITE_NT) { 646 | check_buff(cfg->buf_a, cfg->total_buf_size); 647 | } 648 | #endif 649 | break; 650 | } 651 | } 652 | out: 653 | restore_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 654 | } 655 | 656 | // taken from https://stackoverflow.com/questions/1407786/how-to-set-cpu-affinity-of-a-particular-pthread 657 | int stick_this_thread_to_core(int core_id) { 658 | int num_cores = sysconf(_SC_NPROCESSORS_ONLN); 659 | if (core_id < 0 || core_id >= num_cores) 660 | return EINVAL; 661 | 662 | cpu_set_t cpuset; 663 | CPU_ZERO(&cpuset); 664 | CPU_SET(core_id, &cpuset); 665 | 666 | pthread_t current_thread = pthread_self(); 667 | return pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset); 668 | } 669 | 670 | // dispatch to different workload wrappers 671 | void* thread_wrapper(void* arg) { 672 | test_cfg_t* cfg; 673 | cfg = (test_cfg_t*)arg; 674 | fprintf(stdout, "thread %d created.\n", cfg->thread_idx); 675 | 676 | int res; 677 | if (cfg->core_a >= 0 && cfg->core_b >= 0) { 678 | if (cfg->thread_idx == 0) { 679 | cfg->starting_core = cfg->core_a; // to make prefetch pin to correct core 680 | 681 | } else if (cfg->thread_idx == 1) { 682 | cfg->starting_core = cfg->core_b; // to make prefetch pin to correct core 683 | 684 | } else { 685 | printf(RED "[ERROR]" 
RESET "more than 2 thread in testing pinning to core a, b\n"); 686 | return NULL; 687 | } 688 | } 689 | 690 | if (cfg->starting_core >= 0) { 691 | res = stick_this_thread_to_core(cfg->thread_idx + cfg->starting_core); 692 | } else { 693 | printf(YEL "[INFO]" RESET " core pinning is not specified. Prefetching options will be ignored\n"); 694 | } 695 | 696 | if(res != 0){ 697 | printf(RED "[ERROR]" RESET " Thread affinity set failure.\n"); 698 | return NULL; 699 | } 700 | 701 | switch(cfg->type) { 702 | case LATS_CLFLUSH: 703 | lats_clflush_wrapper(cfg); 704 | break; 705 | case BW: 706 | bw_wrapper(cfg); 707 | break; 708 | case LATS_CHASE: 709 | lats_chase_wrapper(cfg); 710 | break; 711 | case BLOCK_LATS: 712 | block_lats_wrapper(cfg); 713 | break; 714 | default: 715 | fprintf(stderr, "unkown type, thread idx: %d\n", cfg->thread_idx); 716 | } 717 | return NULL; 718 | } 719 | 720 | -------------------------------------------------------------------------------- /memo_ae/src/workload.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Developed by FAST Lab @ ECE-UIUC -- 2022-2023 3 | * Some part of this file follows the methodology of FAST-20 Yang's resporitory 4 | * @ https://github.com/NVSL/OptaneStudy/tree/master 5 | */ 6 | #ifndef WORKLOAD_H 7 | #define WORKLOAD_H 8 | 9 | #include 10 | #include 11 | 12 | /* 13 | #define BLOCK_xN 1 14 | #define LD_xN_RAND_AVX512 LD_x1_RAND_AVX512 15 | #define STWB_xN_RAND_AVX512 STWB_x1_RAND_AVX512 16 | #define NTLD_xN_RAND_AVX512 NTLD_x1_RAND_AVX512 17 | #define NTST_xN_RAND_AVX512 NTST_x1_RAND_AVX512 18 | */ 19 | 20 | /* 21 | #define BLOCK_xN 8 22 | #define LD_xN_RAND_AVX512 LD_x8_RAND_AVX512 23 | #define STWB_xN_RAND_AVX512 STWB_x8_RAND_AVX512 24 | #define NTLD_xN_RAND_AVX512 NTLD_x8_RAND_AVX512 25 | #define NTST_xN_RAND_AVX512 NTST_x8_RAND_AVX512 26 | */ 27 | 28 | #define BLOCK_xN 16 29 | #define LD_xN_RAND_AVX512 LD_x16_RAND_AVX512 30 | #define STWB_xN_RAND_AVX512 STWB_x16_RAND_AVX512 31 | #define NTLD_xN_RAND_AVX512 NTLD_x16_RAND_AVX512 32 | #define NTST_xN_RAND_AVX512 NTST_x16_RAND_AVX512 33 | //#define LD_xN_RAND_AVX512 LD_LFENCE_x16_RAND_AVX512 34 | //#define STWB_xN_RAND_AVX512 STWB_SFENCE_x16_RAND_AVX512 35 | //#define NTLD_xN_RAND_AVX512 NTLD_LFENCE_x16_RAND_AVX512 36 | //#define NTST_xN_RAND_AVX512 NTST_SFENCE_x16_RAND_AVX512 37 | 38 | /* 39 | #define BLOCK_xN 32 40 | #define LD_xN_RAND_AVX512 LD_x32_RAND_AVX512 41 | #define STWB_xN_RAND_AVX512 STWB_x32_RAND_AVX512 42 | #define NTLD_xN_RAND_AVX512 NTLD_x32_RAND_AVX512 43 | #define NTST_xN_RAND_AVX512 NTST_x32_RAND_AVX512 44 | */ 45 | 46 | void op_ntld(char* addr, long size); 47 | 48 | void op_ld(char* addr, long size); 49 | 50 | void op_ntst(char* addr, long size); 51 | 52 | void op_st(char* addr, long size); 53 | 54 | void op_stall(); 55 | 56 | void op_movdir64B(char* src_addr, char* dst_addr, long size); 57 | 58 | void op_mixed(char* addr, long size, int ratio); 59 | 60 | uint64_t op_ntld_32B_lat(char* addr); 61 | 62 | uint64_t op_ntld_64B_lat(char* addr); 63 | 64 | uint64_t op_ntst_64B_lat(char* addr); 65 | 66 | uint64_t op_ld_64B_lat(char* addr); 67 | 68 | uint64_t op_st_64B_lat(char* addr); 69 | 70 | uint64_t op_st_cl_flush_64B_lat(char* addr); 71 | 72 | uint64_t op_st_32B_lat(char* addr); 73 | 74 | uint64_t op_ptr_chase(char* addr, uint64_t num_chase_block); 75 | 76 | uint64_t op_ld_block_lat(char* addr, bool flush_block, long num_clear_pipe); 77 | 78 | uint64_t op_ntld_block_lat(char* addr, bool flush_block, long num_clear_pipe); 79 | 80 | 
uint64_t op_stwb_block_lat(char* addr, bool flush_block, long num_clear_pipe); 81 | 82 | uint64_t op_ntst_block_lat(char* addr, bool flush_block, long num_clear_pipe); 83 | 84 | void set_all_zmm(char* addr); 85 | 86 | void dump_zmm(char* dst, uint64_t size); 87 | 88 | /* Assembly to perform non-temporal load */ 89 | #define SIZENTLD_64_AVX512 \ 90 | "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n" \ 91 | "add $0x40, %%r10 \n" 92 | 93 | #define SIZENTLD_128_AVX512 \ 94 | "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n" \ 95 | "vmovntdqa 0x40(%%r9, %%r10), %%zmm1 \n" \ 96 | "add $0x80, %%r10 \n" 97 | 98 | #define SIZENTLD_256_AVX512 \ 99 | "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n" \ 100 | "vmovntdqa 0x40(%%r9, %%r10), %%zmm1 \n" \ 101 | "vmovntdqa 0x80(%%r9, %%r10), %%zmm2 \n" \ 102 | "vmovntdqa 0xc0(%%r9, %%r10), %%zmm3 \n" \ 103 | "add $0x100, %%r10 \n" 104 | 105 | #define SIZENTLD_512_AVX512 \ 106 | "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n" \ 107 | "vmovntdqa 0x40(%%r9, %%r10), %%zmm1 \n" \ 108 | "vmovntdqa 0x80(%%r9, %%r10), %%zmm2 \n" \ 109 | "vmovntdqa 0xc0(%%r9, %%r10), %%zmm3 \n" \ 110 | "vmovntdqa 0x100(%%r9, %%r10), %%zmm4 \n" \ 111 | "vmovntdqa 0x140(%%r9, %%r10), %%zmm5 \n" \ 112 | "vmovntdqa 0x180(%%r9, %%r10), %%zmm6 \n" \ 113 | "vmovntdqa 0x1c0(%%r9, %%r10), %%zmm7 \n" \ 114 | "add $0x200, %%r10 \n" 115 | 116 | #define SIZENTLD_1024_AVX512 \ 117 | "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n" \ 118 | "vmovntdqa 0x40(%%r9, %%r10), %%zmm1 \n" \ 119 | "vmovntdqa 0x80(%%r9, %%r10), %%zmm2 \n" \ 120 | "vmovntdqa 0xc0(%%r9, %%r10), %%zmm3 \n" \ 121 | "vmovntdqa 0x100(%%r9, %%r10), %%zmm4 \n" \ 122 | "vmovntdqa 0x140(%%r9, %%r10), %%zmm5 \n" \ 123 | "vmovntdqa 0x180(%%r9, %%r10), %%zmm6 \n" \ 124 | "vmovntdqa 0x1c0(%%r9, %%r10), %%zmm7 \n" \ 125 | "vmovntdqa 0x200(%%r9, %%r10), %%zmm8 \n" \ 126 | "vmovntdqa 0x240(%%r9, %%r10), %%zmm9 \n" \ 127 | "vmovntdqa 0x280(%%r9, %%r10), %%zmm10 \n" \ 128 | "vmovntdqa 0x2c0(%%r9, %%r10), %%zmm11 \n" \ 129 | "vmovntdqa 0x300(%%r9, %%r10), %%zmm12 \n" \ 130 | "vmovntdqa 0x340(%%r9, %%r10), %%zmm13 \n" \ 131 | "vmovntdqa 0x380(%%r9, %%r10), %%zmm14 \n" \ 132 | "vmovntdqa 0x3c0(%%r9, %%r10), %%zmm15 \n" \ 133 | "add $0x400, %%r10 \n" 134 | 135 | #define NTLD_x1_RAND_AVX512 \ 136 | "vmovntdqa 0xd6c0(%%r11, %%r10), %%zmm0 \n" 137 | 138 | #define NTLD_x8_RAND_AVX512 \ 139 | "vmovntdqa 0xd6c0(%%r11, %%r10), %%zmm0 \n" \ 140 | "vmovntdqa 0xb680(%%r11, %%r10), %%zmm1 \n" \ 141 | "vmovntdqa 0x7040(%%r11, %%r10), %%zmm2 \n" \ 142 | "vmovntdqa 0x36c0(%%r11, %%r10), %%zmm3 \n" \ 143 | "vmovntdqa 0x3b80(%%r11, %%r10), %%zmm4 \n" \ 144 | "vmovntdqa 0x9340(%%r11, %%r10), %%zmm5 \n" \ 145 | "vmovntdqa 0x9ec0(%%r11, %%r10), %%zmm6 \n" \ 146 | "vmovntdqa 0x6e80(%%r11, %%r10), %%zmm7 \n" 147 | 148 | #define NTLD_x16_RAND_AVX512 \ 149 | "vmovntdqa 0xc840(%%r11, %%r10), %%zmm0 \n" \ 150 | "vmovntdqa 0xf180(%%r11, %%r10), %%zmm1 \n" \ 151 | "vmovntdqa 0xce40(%%r11, %%r10), %%zmm2 \n" \ 152 | "vmovntdqa 0x300(%%r11, %%r10), %%zmm3 \n" \ 153 | "vmovntdqa 0x6d40(%%r11, %%r10), %%zmm4 \n" \ 154 | "vmovntdqa 0xa440(%%r11, %%r10), %%zmm5 \n" \ 155 | "vmovntdqa 0xa9c0(%%r11, %%r10), %%zmm6 \n" \ 156 | "vmovntdqa 0xe980(%%r11, %%r10), %%zmm7 \n" \ 157 | "vmovntdqa 0xc940(%%r11, %%r10), %%zmm8 \n" \ 158 | "vmovntdqa 0x8200(%%r11, %%r10), %%zmm9 \n" \ 159 | "vmovntdqa 0xbac0(%%r11, %%r10), %%zmm10 \n" \ 160 | "vmovntdqa 0x8940(%%r11, %%r10), %%zmm11 \n" \ 161 | "vmovntdqa 0xe700(%%r11, %%r10), %%zmm12 \n" \ 162 | "vmovntdqa 0xe100(%%r11, %%r10), %%zmm13 \n" \ 163 | "vmovntdqa 0x8f40(%%r11, %%r10), %%zmm14 
\n" \ 164 | "vmovntdqa 0xf2c0(%%r11, %%r10), %%zmm15 \n" 165 | 166 | #define NTLD_x32_RAND_AVX512 \ 167 | "vmovntdqa 0x3d80(%%r11, %%r10), %%zmm0 \n" \ 168 | "vmovntdqa 0x1780(%%r11, %%r10), %%zmm1 \n" \ 169 | "vmovntdqa 0x4700(%%r11, %%r10), %%zmm2 \n" \ 170 | "vmovntdqa 0xb980(%%r11, %%r10), %%zmm3 \n" \ 171 | "vmovntdqa 0xaa00(%%r11, %%r10), %%zmm4 \n" \ 172 | "vmovntdqa 0xad00(%%r11, %%r10), %%zmm5 \n" \ 173 | "vmovntdqa 0x9a40(%%r11, %%r10), %%zmm6 \n" \ 174 | "vmovntdqa 0x5300(%%r11, %%r10), %%zmm7 \n" \ 175 | "vmovntdqa 0x7d40(%%r11, %%r10), %%zmm8 \n" \ 176 | "vmovntdqa 0xf480(%%r11, %%r10), %%zmm9 \n" \ 177 | "vmovntdqa 0x9480(%%r11, %%r10), %%zmm10 \n" \ 178 | "vmovntdqa 0xbd80(%%r11, %%r10), %%zmm11 \n" \ 179 | "vmovntdqa 0x3fc0(%%r11, %%r10), %%zmm12 \n" \ 180 | "vmovntdqa 0xcdc0(%%r11, %%r10), %%zmm13 \n" \ 181 | "vmovntdqa 0x480(%%r11, %%r10), %%zmm14 \n" \ 182 | "vmovntdqa 0xb400(%%r11, %%r10), %%zmm15 \n" \ 183 | "vmovntdqa 0xb500(%%r11, %%r10), %%zmm16 \n" \ 184 | "vmovntdqa 0x49c0(%%r11, %%r10), %%zmm17 \n" \ 185 | "vmovntdqa 0x3380(%%r11, %%r10), %%zmm18 \n" \ 186 | "vmovntdqa 0x36c0(%%r11, %%r10), %%zmm19 \n" \ 187 | "vmovntdqa 0x14c0(%%r11, %%r10), %%zmm20 \n" \ 188 | "vmovntdqa 0xcc80(%%r11, %%r10), %%zmm21 \n" \ 189 | "vmovntdqa 0xb600(%%r11, %%r10), %%zmm22 \n" \ 190 | "vmovntdqa 0x6840(%%r11, %%r10), %%zmm23 \n" \ 191 | "vmovntdqa 0x6c80(%%r11, %%r10), %%zmm24 \n" \ 192 | "vmovntdqa 0x2c0(%%r11, %%r10), %%zmm25 \n" \ 193 | "vmovntdqa 0x62c0(%%r11, %%r10), %%zmm26 \n" \ 194 | "vmovntdqa 0x79c0(%%r11, %%r10), %%zmm27 \n" \ 195 | "vmovntdqa 0xfe40(%%r11, %%r10), %%zmm28 \n" \ 196 | "vmovntdqa 0xc200(%%r11, %%r10), %%zmm29 \n" \ 197 | "vmovntdqa 0x58c0(%%r11, %%r10), %%zmm30 \n" \ 198 | "vmovntdqa 0x9b40(%%r11, %%r10), %%zmm31 \n" 199 | 200 | /* Assembly to perform non-temporal store */ 201 | #define SIZENTST_64_AVX512 \ 202 | "vmovntdq %%zmm0, 0x0(%%r9, %%r10) \n" \ 203 | "add $0x40, %%r10 \n" 204 | 205 | #define SIZENTST_128_AVX512 \ 206 | "vmovntdq %%zmm0, 0x0(%%r9, %%r10) \n" \ 207 | "vmovntdq %%zmm0, 0x40(%%r9, %%r10) \n" \ 208 | "add $0x80, %%r10 \n" 209 | 210 | #define SIZENTST_256_AVX512 \ 211 | "vmovntdq %%zmm0, 0x0(%%r9, %%r10) \n" \ 212 | "vmovntdq %%zmm0, 0x40(%%r9, %%r10) \n" \ 213 | "vmovntdq %%zmm0, 0x80(%%r9, %%r10) \n" \ 214 | "vmovntdq %%zmm0, 0xc0(%%r9, %%r10) \n" \ 215 | "add $0x100, %%r10 \n" 216 | 217 | #define SIZENTST_512_AVX512 \ 218 | "vmovntdq %%zmm0, 0x0(%%r9, %%r10) \n" \ 219 | "vmovntdq %%zmm0, 0x40(%%r9, %%r10) \n" \ 220 | "vmovntdq %%zmm0, 0x80(%%r9, %%r10) \n" \ 221 | "vmovntdq %%zmm0, 0xc0(%%r9, %%r10) \n" \ 222 | "vmovntdq %%zmm0, 0x100(%%r9, %%r10) \n" \ 223 | "vmovntdq %%zmm0, 0x140(%%r9, %%r10) \n" \ 224 | "vmovntdq %%zmm0, 0x180(%%r9, %%r10) \n" \ 225 | "vmovntdq %%zmm0, 0x1c0(%%r9, %%r10) \n" \ 226 | "add $0x200, %%r10 \n" 227 | 228 | #define SIZENTST_1024_AVX512 \ 229 | "vmovntdq %%zmm0, 0x0(%%r9, %%r10) \n" \ 230 | "vmovntdq %%zmm1, 0x40(%%r9, %%r10) \n" \ 231 | "vmovntdq %%zmm2, 0x80(%%r9, %%r10) \n" \ 232 | "vmovntdq %%zmm3, 0xc0(%%r9, %%r10) \n" \ 233 | "vmovntdq %%zmm4, 0x100(%%r9, %%r10) \n" \ 234 | "vmovntdq %%zmm5, 0x140(%%r9, %%r10) \n" \ 235 | "vmovntdq %%zmm6, 0x180(%%r9, %%r10) \n" \ 236 | "vmovntdq %%zmm7, 0x1c0(%%r9, %%r10) \n" \ 237 | "vmovntdq %%zmm8, 0x200(%%r9, %%r10) \n" \ 238 | "vmovntdq %%zmm9, 0x240(%%r9, %%r10) \n" \ 239 | "vmovntdq %%zmm10, 0x280(%%r9, %%r10) \n" \ 240 | "vmovntdq %%zmm11, 0x2c0(%%r9, %%r10) \n" \ 241 | "vmovntdq %%zmm12, 0x300(%%r9, %%r10) \n" \ 242 | "vmovntdq %%zmm13, 0x340(%%r9, 
%%r10) \n" \ 243 | "vmovntdq %%zmm14, 0x380(%%r9, %%r10) \n" \ 244 | "vmovntdq %%zmm15, 0x3c0(%%r9, %%r10) \n" \ 245 | "add $0x400, %%r10 \n" 246 | 247 | #define NTST_x1_RAND_AVX512 \ 248 | "vmovntdq %%zmm0, 0x9680(%%r11, %%r10) \n" 249 | 250 | #define NTST_x8_RAND_AVX512 \ 251 | "vmovntdq %%zmm0, 0x9680(%%r11, %%r10) \n" \ 252 | "vmovntdq %%zmm1, 0x15c0(%%r11, %%r10) \n" \ 253 | "vmovntdq %%zmm2, 0x4a80(%%r11, %%r10) \n" \ 254 | "vmovntdq %%zmm3, 0xb800(%%r11, %%r10) \n" \ 255 | "vmovntdq %%zmm4, 0x9700(%%r11, %%r10) \n" \ 256 | "vmovntdq %%zmm5, 0x2000(%%r11, %%r10) \n" \ 257 | "vmovntdq %%zmm6, 0x8d40(%%r11, %%r10) \n" \ 258 | "vmovntdq %%zmm7, 0xb640(%%r11, %%r10) \n" 259 | 260 | #define NTST_x16_RAND_AVX512 \ 261 | "vmovntdq %%zmm0, 0x3680(%%r11, %%r10) \n" \ 262 | "vmovntdq %%zmm1, 0x4140(%%r11, %%r10) \n" \ 263 | "vmovntdq %%zmm2, 0x2cc0(%%r11, %%r10) \n" \ 264 | "vmovntdq %%zmm3, 0x28c0(%%r11, %%r10) \n" \ 265 | "vmovntdq %%zmm4, 0x8440(%%r11, %%r10) \n" \ 266 | "vmovntdq %%zmm5, 0xec40(%%r11, %%r10) \n" \ 267 | "vmovntdq %%zmm6, 0x1080(%%r11, %%r10) \n" \ 268 | "vmovntdq %%zmm7, 0x6e00(%%r11, %%r10) \n" \ 269 | "vmovntdq %%zmm8, 0x3300(%%r11, %%r10) \n" \ 270 | "vmovntdq %%zmm9, 0xef80(%%r11, %%r10) \n" \ 271 | "vmovntdq %%zmm10, 0xb900(%%r11, %%r10) \n" \ 272 | "vmovntdq %%zmm11, 0x2280(%%r11, %%r10) \n" \ 273 | "vmovntdq %%zmm12, 0x85c0(%%r11, %%r10) \n" \ 274 | "vmovntdq %%zmm13, 0x240(%%r11, %%r10) \n" \ 275 | "vmovntdq %%zmm14, 0x40c0(%%r11, %%r10) \n" \ 276 | "vmovntdq %%zmm15, 0x3100(%%r11, %%r10) \n" 277 | 278 | #define NTST_x32_RAND_AVX512 \ 279 | "vmovntdq %%zmm0, 0x4240(%%r11, %%r10) \n" \ 280 | "vmovntdq %%zmm1, 0x6400(%%r11, %%r10) \n" \ 281 | "vmovntdq %%zmm2, 0xe4c0(%%r11, %%r10) \n" \ 282 | "vmovntdq %%zmm3, 0xf200(%%r11, %%r10) \n" \ 283 | "vmovntdq %%zmm4, 0xc400(%%r11, %%r10) \n" \ 284 | "vmovntdq %%zmm5, 0x9e80(%%r11, %%r10) \n" \ 285 | "vmovntdq %%zmm6, 0xaf80(%%r11, %%r10) \n" \ 286 | "vmovntdq %%zmm7, 0xb380(%%r11, %%r10) \n" \ 287 | "vmovntdq %%zmm8, 0xc7c0(%%r11, %%r10) \n" \ 288 | "vmovntdq %%zmm9, 0x65c0(%%r11, %%r10) \n" \ 289 | "vmovntdq %%zmm10, 0x5b40(%%r11, %%r10) \n" \ 290 | "vmovntdq %%zmm11, 0x8640(%%r11, %%r10) \n" \ 291 | "vmovntdq %%zmm12, 0x67c0(%%r11, %%r10) \n" \ 292 | "vmovntdq %%zmm13, 0xaa80(%%r11, %%r10) \n" \ 293 | "vmovntdq %%zmm14, 0x7640(%%r11, %%r10) \n" \ 294 | "vmovntdq %%zmm15, 0x6d40(%%r11, %%r10) \n" \ 295 | "vmovntdq %%zmm16, 0x1400(%%r11, %%r10) \n" \ 296 | "vmovntdq %%zmm17, 0x3fc0(%%r11, %%r10) \n" \ 297 | "vmovntdq %%zmm18, 0x6640(%%r11, %%r10) \n" \ 298 | "vmovntdq %%zmm19, 0x1f40(%%r11, %%r10) \n" \ 299 | "vmovntdq %%zmm20, 0x3a00(%%r11, %%r10) \n" \ 300 | "vmovntdq %%zmm21, 0x1080(%%r11, %%r10) \n" \ 301 | "vmovntdq %%zmm22, 0x9c0(%%r11, %%r10) \n" \ 302 | "vmovntdq %%zmm23, 0xf80(%%r11, %%r10) \n" \ 303 | "vmovntdq %%zmm24, 0xcb00(%%r11, %%r10) \n" \ 304 | "vmovntdq %%zmm25, 0x7e80(%%r11, %%r10) \n" \ 305 | "vmovntdq %%zmm26, 0x99c0(%%r11, %%r10) \n" \ 306 | "vmovntdq %%zmm27, 0x680(%%r11, %%r10) \n" \ 307 | "vmovntdq %%zmm28, 0x12c0(%%r11, %%r10) \n" \ 308 | "vmovntdq %%zmm29, 0x2880(%%r11, %%r10) \n" \ 309 | "vmovntdq %%zmm30, 0xd140(%%r11, %%r10) \n" \ 310 | "vmovntdq %%zmm31, 0xf400(%%r11, %%r10) \n" 311 | 312 | /* temporal load */ 313 | #define SIZELD_1024_AVX512 \ 314 | "vmovdqa64 0x0(%%r9, %%r10), %%zmm0 \n" \ 315 | "vmovdqa64 0x40(%%r9, %%r10), %%zmm1 \n" \ 316 | "vmovdqa64 0x80(%%r9, %%r10), %%zmm2 \n" \ 317 | "vmovdqa64 0xc0(%%r9, %%r10), %%zmm3 \n" \ 318 | "vmovdqa64 0x100(%%r9, %%r10), %%zmm4 \n" \ 319 | 
"vmovdqa64 0x140(%%r9, %%r10), %%zmm5 \n" \ 320 | "vmovdqa64 0x180(%%r9, %%r10), %%zmm6 \n" \ 321 | "vmovdqa64 0x1c0(%%r9, %%r10), %%zmm7 \n" \ 322 | "vmovdqa64 0x200(%%r9, %%r10), %%zmm8 \n" \ 323 | "vmovdqa64 0x240(%%r9, %%r10), %%zmm9 \n" \ 324 | "vmovdqa64 0x280(%%r9, %%r10), %%zmm10 \n" \ 325 | "vmovdqa64 0x2c0(%%r9, %%r10), %%zmm11 \n" \ 326 | "vmovdqa64 0x300(%%r9, %%r10), %%zmm12 \n" \ 327 | "vmovdqa64 0x340(%%r9, %%r10), %%zmm13 \n" \ 328 | "vmovdqa64 0x380(%%r9, %%r10), %%zmm14 \n" \ 329 | "vmovdqa64 0x3c0(%%r9, %%r10), %%zmm15 \n" \ 330 | "add $0x400, %%r10 \n" 331 | 332 | #define LD_x1_RAND_AVX512 \ 333 | "vmovdqa64 0x4140(%%r11, %%r10), %%zmm0 \n" 334 | 335 | #define LD_x8_RAND_AVX512 \ 336 | "vmovdqa64 0x4140(%%r11, %%r10), %%zmm0 \n" \ 337 | "vmovdqa64 0xf340(%%r11, %%r10), %%zmm1 \n" \ 338 | "vmovdqa64 0x2640(%%r11, %%r10), %%zmm2 \n" \ 339 | "vmovdqa64 0x1000(%%r11, %%r10), %%zmm3 \n" \ 340 | "vmovdqa64 0xda40(%%r11, %%r10), %%zmm4 \n" \ 341 | "vmovdqa64 0x5200(%%r11, %%r10), %%zmm5 \n" \ 342 | "vmovdqa64 0x180(%%r11, %%r10), %%zmm6 \n" \ 343 | "vmovdqa64 0xf3c0(%%r11, %%r10), %%zmm7 \n" 344 | 345 | #define LD_LFENCE_x16_RAND_AVX512 \ 346 | "vmovdqa64 0x2a80(%%r11, %%r10), %%zmm0 \n lfence \n" \ 347 | "vmovdqa64 0x680(%%r11, %%r10), %%zmm1 \n lfence \n" \ 348 | "vmovdqa64 0x8500(%%r11, %%r10), %%zmm2 \n lfence \n" \ 349 | "vmovdqa64 0x8980(%%r11, %%r10), %%zmm3 \n lfence \n" \ 350 | "vmovdqa64 0x6d40(%%r11, %%r10), %%zmm4 \n lfence \n" \ 351 | "vmovdqa64 0xf7c0(%%r11, %%r10), %%zmm5 \n lfence \n" \ 352 | "vmovdqa64 0x4640(%%r11, %%r10), %%zmm6 \n lfence \n" \ 353 | "vmovdqa64 0x1480(%%r11, %%r10), %%zmm7 \n lfence \n" \ 354 | "vmovdqa64 0x2f00(%%r11, %%r10), %%zmm8 \n lfence \n" \ 355 | "vmovdqa64 0x15c0(%%r11, %%r10), %%zmm9 \n lfence \n" \ 356 | "vmovdqa64 0xf100(%%r11, %%r10), %%zmm10 \n lfence \n" \ 357 | "vmovdqa64 0x66c0(%%r11, %%r10), %%zmm11 \n lfence \n" \ 358 | "vmovdqa64 0xe240(%%r11, %%r10), %%zmm12 \n lfence \n" \ 359 | "vmovdqa64 0xf480(%%r11, %%r10), %%zmm13 \n lfence \n" \ 360 | "vmovdqa64 0x84c0(%%r11, %%r10), %%zmm14 \n lfence \n" \ 361 | "vmovdqa64 0xe480(%%r11, %%r10), %%zmm15 \n lfence \n" 362 | 363 | #define LD_x16_RAND_AVX512 \ 364 | "vmovdqa64 0xc300(%%r11, %%r10), %%zmm0 \n" \ 365 | "vmovdqa64 0xda00(%%r11, %%r10), %%zmm1 \n" \ 366 | "vmovdqa64 0x1980(%%r11, %%r10), %%zmm2 \n" \ 367 | "vmovdqa64 0xddc0(%%r11, %%r10), %%zmm3 \n" \ 368 | "vmovdqa64 0xaa00(%%r11, %%r10), %%zmm4 \n" \ 369 | "vmovdqa64 0x5540(%%r11, %%r10), %%zmm5 \n" \ 370 | "vmovdqa64 0x6740(%%r11, %%r10), %%zmm6 \n" \ 371 | "vmovdqa64 0x5a80(%%r11, %%r10), %%zmm7 \n" \ 372 | "vmovdqa64 0xa680(%%r11, %%r10), %%zmm8 \n" \ 373 | "vmovdqa64 0xdb00(%%r11, %%r10), %%zmm9 \n" \ 374 | "vmovdqa64 0x3340(%%r11, %%r10), %%zmm10 \n" \ 375 | "vmovdqa64 0x7e40(%%r11, %%r10), %%zmm11 \n" \ 376 | "vmovdqa64 0x3600(%%r11, %%r10), %%zmm12 \n" \ 377 | "vmovdqa64 0x5080(%%r11, %%r10), %%zmm13 \n" \ 378 | "vmovdqa64 0x6e00(%%r11, %%r10), %%zmm14 \n" \ 379 | "vmovdqa64 0x1540(%%r11, %%r10), %%zmm15 \n" 380 | 381 | #define LD_x32_RAND_AVX512 \ 382 | "vmovdqa64 0x7b40(%%r11, %%r10), %%zmm0 \n" \ 383 | "vmovdqa64 0x7640(%%r11, %%r10), %%zmm1 \n" \ 384 | "vmovdqa64 0xdf00(%%r11, %%r10), %%zmm2 \n" \ 385 | "vmovdqa64 0xdb40(%%r11, %%r10), %%zmm3 \n" \ 386 | "vmovdqa64 0xb6c0(%%r11, %%r10), %%zmm4 \n" \ 387 | "vmovdqa64 0x6980(%%r11, %%r10), %%zmm5 \n" \ 388 | "vmovdqa64 0xf280(%%r11, %%r10), %%zmm6 \n" \ 389 | "vmovdqa64 0x3dc0(%%r11, %%r10), %%zmm7 \n" \ 390 | "vmovdqa64 0x6d80(%%r11, %%r10), %%zmm8 \n" \ 391 
| "vmovdqa64 0xf580(%%r11, %%r10), %%zmm9 \n" \ 392 | "vmovdqa64 0xf300(%%r11, %%r10), %%zmm10 \n" \ 393 | "vmovdqa64 0x3140(%%r11, %%r10), %%zmm11 \n" \ 394 | "vmovdqa64 0x8980(%%r11, %%r10), %%zmm12 \n" \ 395 | "vmovdqa64 0xecc0(%%r11, %%r10), %%zmm13 \n" \ 396 | "vmovdqa64 0xc5c0(%%r11, %%r10), %%zmm14 \n" \ 397 | "vmovdqa64 0x1e40(%%r11, %%r10), %%zmm15 \n" \ 398 | "vmovdqa64 0xf3c0(%%r11, %%r10), %%zmm16 \n" \ 399 | "vmovdqa64 0xe800(%%r11, %%r10), %%zmm17 \n" \ 400 | "vmovdqa64 0x2200(%%r11, %%r10), %%zmm18 \n" \ 401 | "vmovdqa64 0x66c0(%%r11, %%r10), %%zmm19 \n" \ 402 | "vmovdqa64 0xc00(%%r11, %%r10), %%zmm20 \n" \ 403 | "vmovdqa64 0x2bc0(%%r11, %%r10), %%zmm21 \n" \ 404 | "vmovdqa64 0x6a80(%%r11, %%r10), %%zmm22 \n" \ 405 | "vmovdqa64 0x94c0(%%r11, %%r10), %%zmm23 \n" \ 406 | "vmovdqa64 0xbec0(%%r11, %%r10), %%zmm24 \n" \ 407 | "vmovdqa64 0xcdc0(%%r11, %%r10), %%zmm25 \n" \ 408 | "vmovdqa64 0xf80(%%r11, %%r10), %%zmm26 \n" \ 409 | "vmovdqa64 0xc000(%%r11, %%r10), %%zmm27 \n" \ 410 | "vmovdqa64 0x4340(%%r11, %%r10), %%zmm28 \n" \ 411 | "vmovdqa64 0x4640(%%r11, %%r10), %%zmm29 \n" \ 412 | "vmovdqa64 0xcc0(%%r11, %%r10), %%zmm30 \n" \ 413 | "vmovdqa64 0x6b40(%%r11, %%r10), %%zmm31 \n" 414 | 415 | #define STWB_x1_RAND_AVX512 \ 416 | "vmovdqa64 %%zmm0, 0xe80(%%r11, %%r10) \n clwb 0xe80(%%r11, %%r10) \n" 417 | 418 | #define STWB_x8_RAND_AVX512 \ 419 | "vmovdqa64 %%zmm0, 0xe80(%%r11, %%r10) \n clwb 0xe80(%%r11, %%r10) \n" \ 420 | "vmovdqa64 %%zmm1, 0xe4c0(%%r11, %%r10) \n clwb 0xe4c0(%%r11, %%r10) \n" \ 421 | "vmovdqa64 %%zmm2, 0x4780(%%r11, %%r10) \n clwb 0x4780(%%r11, %%r10) \n" \ 422 | "vmovdqa64 %%zmm3, 0xc240(%%r11, %%r10) \n clwb 0xc240(%%r11, %%r10) \n" \ 423 | "vmovdqa64 %%zmm4, 0x2e00(%%r11, %%r10) \n clwb 0x2e00(%%r11, %%r10) \n" \ 424 | "vmovdqa64 %%zmm5, 0xf4c0(%%r11, %%r10) \n clwb 0xf4c0(%%r11, %%r10) \n" \ 425 | "vmovdqa64 %%zmm6, 0xe5c0(%%r11, %%r10) \n clwb 0xe5c0(%%r11, %%r10) \n" \ 426 | "vmovdqa64 %%zmm7, 0x7040(%%r11, %%r10) \n clwb 0x7040(%%r11, %%r10) \n" 427 | 428 | //#define STWB_SFENCE_x16_RAND_AVX512 429 | 430 | #define STWB_x16_RAND_AVX512 \ 431 | "vmovdqa64 %%zmm0, 0x28c0(%%r11, %%r10) \n clwb 0x28c0(%%r11, %%r10) \n" \ 432 | "vmovdqa64 %%zmm1, 0xc880(%%r11, %%r10) \n clwb 0xc880(%%r11, %%r10) \n" \ 433 | "vmovdqa64 %%zmm2, 0x3cc0(%%r11, %%r10) \n clwb 0x3cc0(%%r11, %%r10) \n" \ 434 | "vmovdqa64 %%zmm3, 0xdd40(%%r11, %%r10) \n clwb 0xdd40(%%r11, %%r10) \n" \ 435 | "vmovdqa64 %%zmm4, 0x6bc0(%%r11, %%r10) \n clwb 0x6bc0(%%r11, %%r10) \n" \ 436 | "vmovdqa64 %%zmm5, 0xe600(%%r11, %%r10) \n clwb 0xe600(%%r11, %%r10) \n" \ 437 | "vmovdqa64 %%zmm6, 0x1c0(%%r11, %%r10) \n clwb 0x1c0(%%r11, %%r10) \n" \ 438 | "vmovdqa64 %%zmm7, 0xf540(%%r11, %%r10) \n clwb 0xf540(%%r11, %%r10) \n" \ 439 | "vmovdqa64 %%zmm8, 0x11c0(%%r11, %%r10) \n clwb 0x11c0(%%r11, %%r10) \n" \ 440 | "vmovdqa64 %%zmm9, 0xb000(%%r11, %%r10) \n clwb 0xb000(%%r11, %%r10) \n" \ 441 | "vmovdqa64 %%zmm10, 0x3f80(%%r11, %%r10) \n clwb 0x3f80(%%r11, %%r10) \n" \ 442 | "vmovdqa64 %%zmm11, 0x5c40(%%r11, %%r10) \n clwb 0x5c40(%%r11, %%r10) \n" \ 443 | "vmovdqa64 %%zmm12, 0xed00(%%r11, %%r10) \n clwb 0xed00(%%r11, %%r10) \n" \ 444 | "vmovdqa64 %%zmm13, 0xd600(%%r11, %%r10) \n clwb 0xd600(%%r11, %%r10) \n" \ 445 | "vmovdqa64 %%zmm14, 0x4c80(%%r11, %%r10) \n clwb 0x4c80(%%r11, %%r10) \n" \ 446 | "vmovdqa64 %%zmm15, 0xb280(%%r11, %%r10) \n clwb 0xb280(%%r11, %%r10) \n" 447 | 448 | /* temporal store */ 449 | #define STWB_x32_RAND_AVX512 \ 450 | "vmovdqa64 %%zmm0, 0x9c0(%%r11, %%r10) \n clwb 0x9c0(%%r11, %%r10) \n" \ 
451 | "vmovdqa64 %%zmm1, 0x3b40(%%r11, %%r10) \n clwb 0x3b40(%%r11, %%r10) \n" \ 452 | "vmovdqa64 %%zmm2, 0xe540(%%r11, %%r10) \n clwb 0xe540(%%r11, %%r10) \n" \ 453 | "vmovdqa64 %%zmm3, 0xe180(%%r11, %%r10) \n clwb 0xe180(%%r11, %%r10) \n" \ 454 | "vmovdqa64 %%zmm4, 0x2b80(%%r11, %%r10) \n clwb 0x2b80(%%r11, %%r10) \n" \ 455 | "vmovdqa64 %%zmm5, 0xa380(%%r11, %%r10) \n clwb 0xa380(%%r11, %%r10) \n" \ 456 | "vmovdqa64 %%zmm6, 0x9ac0(%%r11, %%r10) \n clwb 0x9ac0(%%r11, %%r10) \n" \ 457 | "vmovdqa64 %%zmm7, 0xd500(%%r11, %%r10) \n clwb 0xd500(%%r11, %%r10) \n" \ 458 | "vmovdqa64 %%zmm8, 0x51c0(%%r11, %%r10) \n clwb 0x51c0(%%r11, %%r10) \n" \ 459 | "vmovdqa64 %%zmm9, 0x99c0(%%r11, %%r10) \n clwb 0x99c0(%%r11, %%r10) \n" \ 460 | "vmovdqa64 %%zmm10, 0xacc0(%%r11, %%r10) \n clwb 0xacc0(%%r11, %%r10) \n" \ 461 | "vmovdqa64 %%zmm11, 0x4900(%%r11, %%r10) \n clwb 0x4900(%%r11, %%r10) \n" \ 462 | "vmovdqa64 %%zmm12, 0x3540(%%r11, %%r10) \n clwb 0x3540(%%r11, %%r10) \n" \ 463 | "vmovdqa64 %%zmm13, 0x8ac0(%%r11, %%r10) \n clwb 0x8ac0(%%r11, %%r10) \n" \ 464 | "vmovdqa64 %%zmm14, 0x2580(%%r11, %%r10) \n clwb 0x2580(%%r11, %%r10) \n" \ 465 | "vmovdqa64 %%zmm15, 0xc5c0(%%r11, %%r10) \n clwb 0xc5c0(%%r11, %%r10) \n" \ 466 | "vmovdqa64 %%zmm16, 0xfd40(%%r11, %%r10) \n clwb 0xfd40(%%r11, %%r10) \n" \ 467 | "vmovdqa64 %%zmm17, 0xac40(%%r11, %%r10) \n clwb 0xac40(%%r11, %%r10) \n" \ 468 | "vmovdqa64 %%zmm18, 0x1240(%%r11, %%r10) \n clwb 0x1240(%%r11, %%r10) \n" \ 469 | "vmovdqa64 %%zmm19, 0xa00(%%r11, %%r10) \n clwb 0xa00(%%r11, %%r10) \n" \ 470 | "vmovdqa64 %%zmm20, 0x53c0(%%r11, %%r10) \n clwb 0x53c0(%%r11, %%r10) \n" \ 471 | "vmovdqa64 %%zmm21, 0xcd00(%%r11, %%r10) \n clwb 0xcd00(%%r11, %%r10) \n" \ 472 | "vmovdqa64 %%zmm22, 0xbac0(%%r11, %%r10) \n clwb 0xbac0(%%r11, %%r10) \n" \ 473 | "vmovdqa64 %%zmm23, 0x2500(%%r11, %%r10) \n clwb 0x2500(%%r11, %%r10) \n" \ 474 | "vmovdqa64 %%zmm24, 0xd300(%%r11, %%r10) \n clwb 0xd300(%%r11, %%r10) \n" \ 475 | "vmovdqa64 %%zmm25, 0xba40(%%r11, %%r10) \n clwb 0xba40(%%r11, %%r10) \n" \ 476 | "vmovdqa64 %%zmm26, 0xf500(%%r11, %%r10) \n clwb 0xf500(%%r11, %%r10) \n" \ 477 | "vmovdqa64 %%zmm27, 0x2080(%%r11, %%r10) \n clwb 0x2080(%%r11, %%r10) \n" \ 478 | "vmovdqa64 %%zmm28, 0xf2c0(%%r11, %%r10) \n clwb 0xf2c0(%%r11, %%r10) \n" \ 479 | "vmovdqa64 %%zmm29, 0xa980(%%r11, %%r10) \n clwb 0xa980(%%r11, %%r10) \n" \ 480 | "vmovdqa64 %%zmm30, 0x8880(%%r11, %%r10) \n clwb 0x8880(%%r11, %%r10) \n" \ 481 | "vmovdqa64 %%zmm31, 0x54c0(%%r11, %%r10) \n clwb 0x54c0(%%r11, %%r10) \n" 482 | 483 | #define SIZESTWB_1024_AVX512 \ 484 | "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ 485 | "clwb 0x0(%%r9, %%r10) \n" \ 486 | "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ 487 | "clwb 0x40(%%r9, %%r10) \n" \ 488 | "vmovdqa64 %%zmm0, 0x80(%%r9, %%r10) \n" \ 489 | "clwb 0x80(%%r9, %%r10) \n" \ 490 | "vmovdqa64 %%zmm0, 0xc0(%%r9, %%r10) \n" \ 491 | "clwb 0xc0(%%r9, %%r10) \n" \ 492 | "vmovdqa64 %%zmm0, 0x100(%%r9, %%r10) \n" \ 493 | "clwb 0x100(%%r9, %%r10) \n" \ 494 | "vmovdqa64 %%zmm0, 0x140(%%r9, %%r10) \n" \ 495 | "clwb 0x140(%%r9, %%r10) \n" \ 496 | "vmovdqa64 %%zmm0, 0x180(%%r9, %%r10) \n" \ 497 | "clwb 0x180(%%r9, %%r10) \n" \ 498 | "vmovdqa64 %%zmm0, 0x1c0(%%r9, %%r10) \n" \ 499 | "clwb 0x1c0(%%r9, %%r10) \n" \ 500 | "vmovdqa64 %%zmm0, 0x200(%%r9, %%r10) \n" \ 501 | "clwb 0x200(%%r9, %%r10) \n" \ 502 | "vmovdqa64 %%zmm0, 0x240(%%r9, %%r10) \n" \ 503 | "clwb 0x240(%%r9, %%r10) \n" \ 504 | "vmovdqa64 %%zmm0, 0x280(%%r9, %%r10) \n" \ 505 | "clwb 0x280(%%r9, %%r10) \n" \ 506 | "vmovdqa64 %%zmm0, 
0x2c0(%%r9, %%r10) \n" \ 507 | "clwb 0x2c0(%%r9, %%r10) \n" \ 508 | "vmovdqa64 %%zmm0, 0x300(%%r9, %%r10) \n" \ 509 | "clwb 0x300(%%r9, %%r10) \n" \ 510 | "vmovdqa64 %%zmm0, 0x340(%%r9, %%r10) \n" \ 511 | "clwb 0x340(%%r9, %%r10) \n" \ 512 | "vmovdqa64 %%zmm0, 0x380(%%r9, %%r10) \n" \ 513 | "clwb 0x380(%%r9, %%r10) \n" \ 514 | "vmovdqa64 %%zmm0, 0x3c0(%%r9, %%r10) \n" \ 515 | "clwb 0x3c0(%%r9, %%r10) \n" \ 516 | "add $0x400, %%r10 \n" 517 | 518 | #define SIZEST_1024_AVX512 \ 519 | "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ 520 | "vmovdqa64 %%zmm1, 0x40(%%r9, %%r10) \n" \ 521 | "vmovdqa64 %%zmm2, 0x80(%%r9, %%r10) \n" \ 522 | "vmovdqa64 %%zmm3, 0xc0(%%r9, %%r10) \n" \ 523 | "vmovdqa64 %%zmm4, 0x100(%%r9, %%r10) \n" \ 524 | "vmovdqa64 %%zmm5, 0x140(%%r9, %%r10) \n" \ 525 | "vmovdqa64 %%zmm6, 0x180(%%r9, %%r10) \n" \ 526 | "vmovdqa64 %%zmm7, 0x1c0(%%r9, %%r10) \n" \ 527 | "vmovdqa64 %%zmm8, 0x200(%%r9, %%r10) \n" \ 528 | "vmovdqa64 %%zmm9, 0x240(%%r9, %%r10) \n" \ 529 | "vmovdqa64 %%zmm10, 0x280(%%r9, %%r10) \n" \ 530 | "vmovdqa64 %%zmm11, 0x2c0(%%r9, %%r10) \n" \ 531 | "vmovdqa64 %%zmm12, 0x300(%%r9, %%r10) \n" \ 532 | "vmovdqa64 %%zmm13, 0x340(%%r9, %%r10) \n" \ 533 | "vmovdqa64 %%zmm14, 0x380(%%r9, %%r10) \n" \ 534 | "vmovdqa64 %%zmm15, 0x3c0(%%r9, %%r10) \n" \ 535 | "add $0x400, %%r10 \n" 536 | 537 | /* perform movdir64B */ 538 | #define SIZEMOV_1024 \ 539 | "movdir64b 0x0(%%r9, %%r10), %%r12 \n" \ 540 | "add $0x40, %%r12 \n" \ 541 | "movdir64b 0x40(%%r9, %%r10), %%r12\n" \ 542 | "add $0x40, %%r12 \n" \ 543 | "movdir64b 0x80(%%r9, %%r10), %%r12\n" \ 544 | "add $0x40, %%r12 \n" \ 545 | "movdir64b 0xc0(%%r9, %%r10), %%r12\n" \ 546 | "add $0x40, %%r12 \n" \ 547 | "movdir64b 0x100(%%r9, %%r10), %%r12\n" \ 548 | "add $0x40, %%r12 \n" \ 549 | "movdir64b 0x140(%%r9, %%r10), %%r12\n" \ 550 | "add $0x40, %%r12 \n" \ 551 | "movdir64b 0x180(%%r9, %%r10), %%r12\n" \ 552 | "add $0x40, %%r12 \n" \ 553 | "movdir64b 0x1c0(%%r9, %%r10), %%r12\n" \ 554 | "add $0x40, %%r12 \n" \ 555 | "movdir64b 0x200(%%r9, %%r10), %%r12\n" \ 556 | "add $0x40, %%r12 \n" \ 557 | "movdir64b 0x240(%%r9, %%r10), %%r12\n" \ 558 | "add $0x40, %%r12 \n" \ 559 | "movdir64b 0x280(%%r9, %%r10), %%r12\n" \ 560 | "add $0x40, %%r12 \n" \ 561 | "movdir64b 0x2c0(%%r9, %%r10), %%r12\n" \ 562 | "add $0x40, %%r12 \n" \ 563 | "movdir64b 0x300(%%r9, %%r10), %%r12\n" \ 564 | "add $0x40, %%r12 \n" \ 565 | "movdir64b 0x340(%%r9, %%r10), %%r12\n" \ 566 | "add $0x40, %%r12 \n" \ 567 | "movdir64b 0x380(%%r9, %%r10), %%r12\n" \ 568 | "add $0x40, %%r12 \n" \ 569 | "movdir64b 0x3c0(%%r9, %%r10), %%r12\n" \ 570 | "add $0x40, %%r12 \n" \ 571 | "add $0x400, %%r10 \n" \ 572 | 573 | /* Mixed read and write */ 574 | /* try using the same dest reg. Assign some value to zmm0 for storing. 
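   (e.g. SIZE_R2W1_384 below loads 4 x 64B into zmm0-zmm3 and stores zmm0
    back to two of those lines with clwb, advancing %r10 by 0x180 = 384
    bytes per step; hence the 384-byte BW_granu warning in bw_wrapper for
    the R2W1 mixed test)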
*/ 575 | #define SIZE_R1W1_512 \ 576 | "vmovdqa64 0x0(%%r9, %%r10), %%zmm0 \n" \ 577 | "vmovdqa64 0x40(%%r9, %%r10), %%zmm1 \n" \ 578 | "vmovdqa64 0x80(%%r9, %%r10), %%zmm2 \n" \ 579 | "vmovdqa64 0xc0(%%r9, %%r10), %%zmm3 \n" \ 580 | "" \ 581 | "vmovq %1, %%xmm0 \n" \ 582 | "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ 583 | "clwb 0x0(%%r9, %%r10) \n" \ 584 | "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ 585 | "clwb 0x40(%%r9, %%r10) \n" \ 586 | "vmovdqa64 %%zmm0, 0x80(%%r9, %%r10) \n" \ 587 | "clwb 0x80(%%r9, %%r10) \n" \ 588 | "vmovdqa64 %%zmm0, 0xc0(%%r9, %%r10) \n" \ 589 | "clwb 0xc0(%%r9, %%r10) \n" \ 590 | "add $0x200, %%r10 \n" \ 591 | 592 | 593 | #define SIZE_R2W1_576 \ 594 | "vmovdqa64 0x0(%%r9, %%r10), %%zmm0 \n" \ 595 | "vmovdqa64 0x40(%%r9, %%r10), %%zmm0 \n" \ 596 | "vmovdqa64 0x80(%%r9, %%r10), %%zmm0 \n" \ 597 | "vmovdqa64 0xc0(%%r9, %%r10), %%zmm0 \n" \ 598 | "vmovdqa64 0x100(%%r9, %%r10), %%zmm0 \n" \ 599 | "vmovdqa64 0x140(%%r9, %%r10), %%zmm0 \n" \ 600 | "vmovq %1, %%xmm1 \n" \ 601 | "vmovdqa64 %%zmm1, 0x0(%%r9, %%r10) \n" \ 602 | "clwb 0x0(%%r9, %%r10) \n" \ 603 | "vmovdqa64 %%zmm1, 0x40(%%r9, %%r10) \n" \ 604 | "clwb 0x40(%%r9, %%r10) \n" \ 605 | "vmovdqa64 %%zmm1, 0x80(%%r9, %%r10) \n" \ 606 | "clwb 0x80(%%r9, %%r10) \n" \ 607 | "add $0x180, %%r10 \n" \ 608 | 609 | 610 | #define SIZE_R2W1_384 \ 611 | "vmovdqa64 0x0(%%r9, %%r10), %%zmm0 \n" \ 612 | "vmovdqa64 0x40(%%r9, %%r10), %%zmm1 \n" \ 613 | "vmovdqa64 0x80(%%r9, %%r10), %%zmm2 \n" \ 614 | "vmovdqa64 0xc0(%%r9, %%r10), %%zmm3 \n" \ 615 | "vmovq %1, %%xmm0 \n" \ 616 | "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ 617 | "clwb 0x0(%%r9, %%r10) \n" \ 618 | "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ 619 | "clwb 0x40(%%r9, %%r10) \n" \ 620 | "add $0x180, %%r10 \n" \ 621 | 622 | 623 | #define SIZE_R3W1_512 \ 624 | "vmovdqa64 0x0(%%r9, %%r10), %%zmm0 \n" \ 625 | "vmovdqa64 0x40(%%r9, %%r10), %%zmm1 \n" \ 626 | "vmovdqa64 0x80(%%r9, %%r10), %%zmm2 \n" \ 627 | "vmovdqa64 0xc0(%%r9, %%r10), %%zmm3 \n" \ 628 | "vmovdqa64 0x100(%%r9, %%r10), %%zmm4 \n" \ 629 | "vmovdqa64 0x140(%%r9, %%r10), %%zmm5 \n" \ 630 | "" \ 631 | "vmovq %1, %%xmm0 \n" \ 632 | "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ 633 | "clwb 0x0(%%r9, %%r10) \n" \ 634 | "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ 635 | "clwb 0x40(%%r9, %%r10) \n" \ 636 | "add $0x200, %%r10 \n" \ 637 | 638 | /* snippets for latency measuring */ 639 | 640 | /* Assembly instructions utilize the following registers: 641 | * rsi: memory address 642 | * rax, rdx, rcx, r8d and r9d: timing 643 | * rdx: populating cache-lines 644 | * ymm0: streaming instructions 645 | */ 646 | #define REGISTERS "rsi", "rax", "rdx", "rcx", "r8", "r9", "ymm0" 647 | 648 | #define REGISTERS_AND_ZMM "rsi", "rax", "rdx", "rcx", "r8", "r9", \ 649 | 650 | #define ZMM_0_15 "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15" 651 | 652 | 653 | /* rdtscp: reads current timestamp to EDX:EAX and also sets ECX 654 | * higher 32-bits of RAX, RDX and RCX are cleared 655 | * 656 | * r9d = old EDX 657 | * r8d = old EAX 658 | * Here is what we do to compute t_start and t_end: 659 | * - RDX holds t_end 660 | * - RAX holds t_start 661 | */ 662 | 663 | /** Douglas: read this blog for more info about timing 664 | * http://sites.utexas.edu/jdm4372/2018/07/23/comments-on-timing-short-code-sections-on-intel-processors/ 665 | */ 666 | #define TIMING_BEGIN "rdtscp \n" \ 667 | "lfence \n" \ 668 | "mov %%edx, %%r9d \n" \ 669 | "mov %%eax, %%r8d \n" 670 | 671 | #define 
TIMING_END "mfence \n" \ 672 | "rdtscp \n" \ 673 | "lfence \n" \ 674 | "shl $32, %%rdx \n" \ 675 | "or %%rax, %%rdx \n" \ 676 | "mov %%r9d, %%eax \n" \ 677 | "shl $32, %%rax \n" \ 678 | "or %%r8, %%rax \n" \ 679 | "mov %%rax, %[t_start] \n" \ 680 | "mov %%rdx, %[t_end] \n" 681 | 682 | #define FLUSH_64K_BLOCK \ 683 | "LOOP_64K_BLOCK_FLUSH: \n" \ 684 | "clflush (%%r11, %%r10) \n" \ 685 | "add $0x40, %%r10 \n" \ 686 | "cmp $0x10000, %%r10 \n" \ 687 | "jl LOOP_64K_BLOCK_FLUSH\n" \ 688 | "xor %%r10, %%r10 \n" \ 689 | "mfence \n" 690 | 691 | 692 | #define FLUSH_CACHE_LINE "clflush 0*32(%%rsi) \n" \ 693 | "clflush 2*32(%%rsi) \n" \ 694 | "clflush 4*32(%%rsi) \n" \ 695 | "clflush 6*32(%%rsi) \n" \ 696 | "mfence \n" 697 | 698 | 699 | #define CLEAR_PIPELINE "nop \nnop \nnop \nnop \nnop \nnop \n" \ 700 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 701 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 702 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 703 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 704 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 705 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 706 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 707 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 708 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 709 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 710 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 711 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 712 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 713 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 714 | "nop \nnop \nnop \nnop \nnop \nnop \n" 715 | 716 | #define CLEAR_PIPELINE_x16 CLEAR_PIPELINE \ 717 | CLEAR_PIPELINE \ 718 | CLEAR_PIPELINE \ 719 | CLEAR_PIPELINE \ 720 | CLEAR_PIPELINE \ 721 | CLEAR_PIPELINE \ 722 | CLEAR_PIPELINE \ 723 | CLEAR_PIPELINE \ 724 | CLEAR_PIPELINE \ 725 | CLEAR_PIPELINE \ 726 | CLEAR_PIPELINE \ 727 | CLEAR_PIPELINE \ 728 | CLEAR_PIPELINE \ 729 | CLEAR_PIPELINE \ 730 | CLEAR_PIPELINE \ 731 | CLEAR_PIPELINE 732 | 733 | #endif // WORKLOAD_H 734 | --------------------------------------------------------------------------------