├── util_scripts ├── flush_page_cache.sh ├── check_cpu_freq.sh ├── lock_cpu_freq.sh ├── unlock_cpu_freq.sh ├── env.sh ├── reconfig_all.sh ├── config_all.sh ├── hyperthread_ctrl.sh └── numa_balance_ctrl.sh ├── caption_ae ├── example_input │ ├── single_mlc.sh │ ├── async_tune.txt │ └── sync_tune.sh ├── config.py ├── action.py ├── metrics │ ├── vmstat_mon.py │ ├── slab_mon.py │ ├── pmu_mon.py │ └── pcm_mon.py ├── algo.py ├── README.md ├── caption_ctrl.py └── caption.py ├── memo_ae ├── app │ └── mlc_linux │ │ └── place_holder.txt ├── src │ ├── test.h │ ├── Makefile │ ├── main.c │ ├── util.h │ ├── util.c │ ├── workload.c │ ├── test.c │ └── workload.h ├── evaluation │ ├── figure_4a.sh │ ├── figure_4b.sh │ └── figure_3.sh ├── generate_random_inst.py ├── test_cxl │ ├── test_single_op_latency.sh │ ├── test_movdir_bw.sh │ ├── test_seq_bw.sh │ ├── test_ptr_chase.sh │ ├── test_block_access_latency.sh │ └── test_rand_bw.sh └── README.md ├── README.md └── LICENSE /util_scripts/flush_page_cache.sh: -------------------------------------------------------------------------------- 1 | sudo sh -c "echo 3 > /proc/sys/vm/drop_caches" 2 | -------------------------------------------------------------------------------- /caption_ae/example_input/single_mlc.sh: -------------------------------------------------------------------------------- 1 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 2 | -------------------------------------------------------------------------------- /util_scripts/check_cpu_freq.sh: -------------------------------------------------------------------------------- 1 | sudo cpupower --cpu all frequency-info | grep "current CPU frequency" 2 | -------------------------------------------------------------------------------- /memo_ae/app/mlc_linux/place_holder.txt: -------------------------------------------------------------------------------- 1 | This is a place holder file. 2 | Please place the Intel MLC binary in the same directory. 
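For example, a typical setup looks like the following (the tarball name and layout here are only an assumption -- they depend on the MLC version you download from Intel's site):
  tar -xf mlc_v3.xx.tgz
  cp Linux/mlc ./mlc
  chmod +x ./mlc
The scripts in memo_ae and caption_ae expect the binary at memo_ae/app/mlc_linux/mlc.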
3 | -------------------------------------------------------------------------------- /util_scripts/lock_cpu_freq.sh: -------------------------------------------------------------------------------- 1 | sudo cpupower --cpu all frequency-set --freq 2100MHz 2 | sudo sh -c 'echo 0 > /sys/devices/system/cpu/cpufreq/boost' 3 | 4 | -------------------------------------------------------------------------------- /caption_ae/example_input/async_tune.txt: -------------------------------------------------------------------------------- 1 | ./single_mlc.sh 2 | ./single_mlc.sh 3 | ./single_mlc.sh 4 | ./single_mlc.sh 5 | ./single_mlc.sh 6 | ./single_mlc.sh 7 | -------------------------------------------------------------------------------- /util_scripts/unlock_cpu_freq.sh: -------------------------------------------------------------------------------- 1 | #sudo cpupower --cpu all frequency-set --governor osndemand 2 | sudo cpupower --cpu all frequency-set --governor ondemand 3 | 4 | -------------------------------------------------------------------------------- /util_scripts/env.sh: -------------------------------------------------------------------------------- 1 | export CLOSEST_CORE=0 2 | export CLOSEST_NODE=0 3 | export NODE_MAX=2 4 | export TSC_FREQ=2100 5 | 6 | echo "closest node to CXL=${CLOSEST_NODE}" 7 | echo "closest core to CXL=${CLOSEST_CORE}" 8 | echo "TSC_FREQ: $TSC_FREQ (Mhz)" 9 | -------------------------------------------------------------------------------- /util_scripts/reconfig_all.sh: -------------------------------------------------------------------------------- 1 | #!bin/bash 2 | 3 | SETUP_SCRIPT_DIR=./ 4 | 5 | # set 6 | bash $SETUP_SCRIPT_DIR/unlock_cpu_freq.sh 7 | bash $SETUP_SCRIPT_DIR/check_cpu_freq.sh 8 | bash $SETUP_SCRIPT_DIR/hyperthread_ctrl.sh 1 9 | bash $SETUP_SCRIPT_DIR/numa_balance_ctrl.sh 1 10 | -------------------------------------------------------------------------------- /util_scripts/config_all.sh: -------------------------------------------------------------------------------- 1 | #!bin/bash 2 | 3 | SETUP_SCRIPT_DIR=$(cd $(dirname $0) && pwd) 4 | 5 | # set 6 | bash $SETUP_SCRIPT_DIR/lock_cpu_freq.sh 7 | bash $SETUP_SCRIPT_DIR/check_cpu_freq.sh 8 | bash $SETUP_SCRIPT_DIR/hyperthread_ctrl.sh 0 9 | bash $SETUP_SCRIPT_DIR/numa_balance_ctrl.sh 0 10 | 11 | sudo systemctl stop numad 12 | -------------------------------------------------------------------------------- /memo_ae/src/test.h: -------------------------------------------------------------------------------- 1 | #ifndef TEST_H 2 | #define TEST_H 3 | #include "util.h" 4 | #include 5 | 6 | // spawn thread 7 | int run_test(test_cfg_t* cfg); 8 | 9 | // dispatch to different workload wrappers 10 | void* thread_wrapper(void* arg); 11 | 12 | void get_bw(test_cfg_t* cfg_arr, int iter, int delay); 13 | 14 | #endif // TEST_H 15 | -------------------------------------------------------------------------------- /util_scripts/hyperthread_ctrl.sh: -------------------------------------------------------------------------------- 1 | #!bin/bash 2 | 3 | if [ $1 -eq 1 ]; 4 | then 5 | echo "enable hyperthreading" 6 | sudo sh -c "echo on > /sys/devices/system/cpu/smt/control" 7 | fi 8 | 9 | if [ $1 -eq 0 ]; 10 | then 11 | echo "disable hyperthreading" 12 | sudo sh -c "echo off > /sys/devices/system/cpu/smt/control" 13 | fi 14 | 15 | -------------------------------------------------------------------------------- /util_scripts/numa_balance_ctrl.sh: -------------------------------------------------------------------------------- 
1 | #!bin/bash 2 | 3 | if [ $1 -eq 2 ]; 4 | then 5 | echo "enable numa balance in tpp mode" 6 | sudo sh -c "echo 2 > /proc/sys/kernel/numa_balancing" 7 | fi 8 | 9 | if [ $1 -eq 1 ]; 10 | then 11 | echo "enable numa balance" 12 | sudo sh -c "echo 1 > /proc/sys/kernel/numa_balancing" 13 | fi 14 | 15 | if [ $1 -eq 0 ]; 16 | then 17 | echo "disable numa balance" 18 | sudo sh -c "echo 0 > /proc/sys/kernel/numa_balancing" 19 | fi 20 | 21 | -------------------------------------------------------------------------------- /memo_ae/src/Makefile: -------------------------------------------------------------------------------- 1 | CC=gcc 2 | CFLAGS=-I. -W -Wall -Wextra -Wuninitialized -Wstrict-aliasing 3 | DEPS=util.h test.h workload.h 4 | OBJ=util.o test.o workload.o main.o 5 | LDLIBS=-lpthread -lnuma -lm 6 | 7 | .PHONY: all 8 | all: cxlMemTest 9 | 10 | %.o: %.c $(DEPS) 11 | $(CC) -c -o $@ $< $(CFLAGS) 12 | 13 | cxlMemTest: $(OBJ) 14 | $(CC) -o $@ $^ $(CFLAGS) $(LDLIBS) 15 | 16 | .PHONY: clean 17 | clean: 18 | $(RM) *~ *.o cxlMemTest 19 | 20 | debug: CFLAGS+=-g 21 | debug: cxlMemTest 22 | 23 | -------------------------------------------------------------------------------- /memo_ae/evaluation/figure_4a.sh: -------------------------------------------------------------------------------- 1 | bash ../../util_scripts/config_all.sh 2 | source ../../util_scripts/env.sh 3 | 4 | # config for SPR1 5 | CXL_NODE="2" 6 | NUMA_REMOTE_NODE="1" 7 | CLOSEST_CORE_S="0-31" 8 | 9 | # arg1: mem node 10 | # arg2: cores 11 | test_mlc_bw() { 12 | echo "running mlc, node: ${1}, cores: ${2} ..." 13 | sudo numactl --membind=$1 ../app/mlc_linux/mlc --peak_injection_bandwidth -k$2 -b104858 > ../results/figure_4a/c${2}_m${1}.txt 14 | 15 | echo "done!" 16 | } 17 | 18 | 19 | mkdir -p ../results/figure_4a 20 | echo "testing with MLC peak injection bw... 
" 21 | 22 | test_mlc_bw $CXL_NODE $CLOSEST_CORE_S 23 | test_mlc_bw $NUMA_REMOTE_NODE $CLOSEST_CORE_S 24 | -------------------------------------------------------------------------------- /caption_ae/example_input/sync_tune.sh: -------------------------------------------------------------------------------- 1 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 2 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 3 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 4 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 5 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 6 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 7 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 8 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 9 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 10 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 11 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 12 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 13 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 14 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 15 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 16 | ../../memo_ae/app/mlc_linux/mlc --latency_matrix 17 | -------------------------------------------------------------------------------- /memo_ae/generate_random_inst.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | CNT=16 4 | RANGE_LOW=0 5 | RANGE_HIGH=1024 6 | # NTLD 7 | #inst_base = '"vmovntdqa {0}(%%r11, %%r10), %%zmm{1} \\n" \\' 8 | # NTST 9 | #inst_base = '"vmovntdq %%zmm{1}, {0}(%%r11, %%r10) \\n" \\' 10 | # LD 11 | #inst_base = '"vmovdqa64 {0}(%%r11, %%r10), %%zmm{1} \\n lfence \\n" \\' 12 | #inst_base = '"vmovdqa64 {0}(%%r11, %%r10), %%zmm{1} \\n \\' 13 | # ST + WB 14 | #inst_base = '"vmovdqa64 %%zmm{1}, {0}(%%r11, %%r10) \\n clwb {0}(%%r11, %%r10) \\n" \\' 15 | # ST 16 | inst_base = '"vmovdqa64 %%zmm{1}, {0}(%%r11, %%r10) \\n" \\' 17 | 18 | seen_set = set() 19 | 20 | for i in range(CNT): 21 | curr_offset = random.randrange(RANGE_LOW, RANGE_HIGH) 22 | while curr_offset in seen_set: 23 | curr_offset = random.randrange(RANGE_LOW, RANGE_HIGH) 24 | 25 | seen_set.add(curr_offset) 26 | 27 | print(inst_base.format(hex(curr_offset << 6), i)) 28 | -------------------------------------------------------------------------------- /memo_ae/test_cxl/test_single_op_latency.sh: -------------------------------------------------------------------------------- 1 | ITERATION=10000 2 | 3 | bash ../../util_scripts/config_all.sh 4 | source ../../util_scripts/env.sh 5 | 6 | test_single_op_lats() { 7 | echo "[INFO] Test started" 8 | for ((k=0;k<=$NODE_MAX;k=k+1)); do 9 | for ((j=0;j<4;j=j+1)); do # op 10 | FOLDER_NAME=single_op_n${k} 11 | CURR_RESULT_PATH=../results/$FOLDER_NAME 12 | mkdir -p $CURR_RESULT_PATH 13 | 14 | echo "[TEST] op: $j node: $k......" 
15 | LINE=`sudo ../src/cxlMemTest -p $CLOSEST_CORE -t 1 -S 1 -n $k -T 0 -o $j -i $ITERATION -F $TSC_FREQ | awk '/Median latency/ {print}'` 16 | echo $LINE 17 | 18 | LATS=`echo $LINE | awk '{print $(NF-2)}' | grep -Eo '[+-]?[0-9]+([.][0-9]+)?'` 19 | echo $LATS >> $CURR_RESULT_PATH/single_op_lats_n${k}.txt 20 | echo $LATS 21 | done 22 | done 23 | } 24 | 25 | test_single_op_lats 26 | -------------------------------------------------------------------------------- /memo_ae/test_cxl/test_movdir_bw.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NUM_THREADS=32 3 | STEP=2 4 | ITERATION=3 5 | OP_MAX=3 6 | 7 | bash ../../util_scripts/config_all.sh 8 | source ../../util_scripts/env.sh 9 | 10 | FOLDER_NAME=movdir_bw_test 11 | CURR_RESULT_PATH=../results/$FOLDER_NAME 12 | mkdir -p $CURR_RESULT_PATH 13 | 14 | for ((src=0;src<=$NODE_MAX;src++)); do 15 | for ((dst=0;dst<=$NODE_MAX;dst++)); do 16 | for ((i=0;i<=$NUM_THREADS;i=i+$STEP)); do 17 | if [ $i == 0 ];then 18 | continue 19 | fi 20 | echo "[TEST] src: $src dst: $dst, num_thread: $i......" 21 | THROUGHPUT=`sudo ../src/cxlMemTest -p $CLOSEST_CORE -f -t $i -S 6 -n $src -d $dst -T 1 -o 4 -i $ITERATION | awk '/get_bw/ {print}'` 22 | BW=`echo $THROUGHPUT | awk '{print $(NF-1)}'` 23 | echo $BW >> $CURR_RESULT_PATH/s${src}_d${dst}.txt 24 | echo $THROUGHPUT 25 | done 26 | done 27 | done 28 | -------------------------------------------------------------------------------- /memo_ae/test_cxl/test_seq_bw.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NUM_THREADS=32 3 | STEP=2 4 | ITERATION=3 5 | OP_MAX=3 6 | 7 | bash ../../util_scripts/config_all.sh 8 | source ../../util_scripts/env.sh 9 | 10 | for ((j=0;j<=$NODE_MAX;j++)); do 11 | FOLDER_NAME=seq_bw_${j}_test 12 | CURR_RESULT_PATH=../results/$FOLDER_NAME 13 | mkdir -p $CURR_RESULT_PATH 14 | 15 | for ((k=0;k<=$OP_MAX;k++)); do 16 | 17 | for ((i=0;i<=$NUM_THREADS;i=i+$STEP)); do 18 | if [ $i == 0 ];then 19 | continue 20 | fi 21 | 22 | echo "[TEST] node: $j, op: $k, num_thread: $i......" 23 | THROUGHPUT=`sudo ../src/cxlMemTest -p $CLOSEST_CORE -f -t $i -S 6 -n $j -d $j -T 1 -o $k -i $ITERATION | awk '/get_bw/ {print}'` 24 | BW=`echo $THROUGHPUT | awk '{print $(NF-1)}'` 25 | echo $BW >> $CURR_RESULT_PATH/seq_bw_$k.txt 26 | echo $THROUGHPUT 27 | done 28 | done 29 | done 30 | -------------------------------------------------------------------------------- /memo_ae/test_cxl/test_ptr_chase.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | STEPS=19 3 | 4 | bash ../../util_scripts/config_all.sh 5 | source ../../util_scripts/env.sh 6 | 7 | for ((j=0;j<=$NODE_MAX;j++)); do 8 | FOLDER_NAME=chase_${j}_test 9 | mkdir -p ../results/$FOLDER_NAME 10 | 11 | # Start testing 12 | echo "[INFO] Test started" 13 | size=4096 14 | iter=500 15 | # max = 2^12 * 2^18 = 2^30 ~ 4GB 16 | for ((i=0;i<$STEPS;i++)); do 17 | if [ $((i%3)) -eq 1 ]; 18 | then 19 | iter=$((iter / 2)) 20 | fi 21 | echo -n "[TEST] test $i, iteration $iter......" 
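# The next command invokes memo's pointer-chase mode: -T 2 walks a linked list over a
# $size-byte buffer (-m) on node $j with a single thread (-t 1), pinned by -p to
# $CLOSEST_CORE (which, with no -f given, also leaves the hardware prefetcher disabled),
# while -F supplies the TSC frequency used to convert cycles into time.
# See the Arguments table in ../README.md for the full flag reference.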
22 | LATENCY=`sudo ../src/cxlMemTest -t 1 -m $size -o 0 -T 2 -n $j -i $iter -p $CLOSEST_CORE -F $TSC_FREQ | awk '/chase\/block_lats average/ {print $8}' | tail -n1 | grep -Eo '[+-]?[0-9]+([.][0-9]+)?'` 23 | echo $LATENCY >> ../results/$FOLDER_NAME/ptr_chase_lat_vs_size.txt 24 | echo "$LATENCY" 25 | 26 | size=$((size*2)) 27 | done 28 | done 29 | -------------------------------------------------------------------------------- /memo_ae/test_cxl/test_block_access_latency.sh: -------------------------------------------------------------------------------- 1 | ITERATION=10000 2 | 3 | bash ../../util_scripts/config_all.sh 4 | source ../../util_scripts/env.sh 5 | 6 | test_block_lats() { 7 | echo "[INFO] Test started" 8 | 9 | for ((k=0;k<=$NODE_MAX;k=k+1)); do # node 10 | for ((j=0;j<4;j=j+1)); do # op 11 | FOLDER_NAME=block_lats_n${k} 12 | CURR_RESULT_PATH=../results/$FOLDER_NAME 13 | mkdir -p $CURR_RESULT_PATH 14 | 15 | #echo "[TEST] $i $j $k......" 16 | echo "[TEST] op: $j node: $k......" 17 | LINE=`sudo ../src/cxlMemTest -p $CLOSEST_CORE -t 1 -S 1 -n $k -T 3 -o $j -i $ITERATION -B -F $TSC_FREQ | awk '/Median latency/ {print}'` 18 | echo $LINE 19 | 20 | LATS=`echo $LINE | awk '{print $(NF-2)}' | grep -Eo '[+-]?[0-9]+([.][0-9]+)?'` 21 | echo $LATS >> $CURR_RESULT_PATH/block_lats_n${k}.txt 22 | echo $LATS 23 | done 24 | done 25 | } 26 | 27 | test_block_lats 28 | -------------------------------------------------------------------------------- /memo_ae/test_cxl/test_rand_bw.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NUM_THREADS=32 3 | STEP=2 4 | ITERATION=3 5 | OP_MAX=3 6 | 7 | THREAD_CNT=(1 2 4 6 8) 8 | 9 | bash ../../util_scripts/config_all.sh 10 | source ../../util_scripts/env.sh 11 | 12 | for ((k=0;k<=$NODE_MAX;k++)); do # k node 13 | 14 | for ((op=0;op<=$OP_MAX;op++)); do 15 | 16 | for i in ${THREAD_CNT[@]}; do # i thread 17 | FOLDER_NAME=rand_bw_${k}_test 18 | batch_size=16 # 19 | echo "[INFO] ====> new core: ${i} <==== " 20 | 21 | for ((j=0;j<7;j=j+1)); do # j blocksize 22 | CURR_RESULT_PATH=../results/$FOLDER_NAME/ 23 | mkdir -p $CURR_RESULT_PATH 24 | 25 | echo "[TEST] node: $k, op: $op, thread: ${i}, batch_size: ${batch_size} ......" 
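# The next command runs memo's bandwidth test with randomized block placement: -T 1 selects
# the bandwidth mode, -o $op picks the operation (0=load, 1=nt-load, 2=store, 3=nt-store),
# -g sets the access block size in cachelines (doubled each inner pass), and -r makes
# successive blocks start at pseudo-random offsets while accesses inside a block stay
# sequential; -f keeps prefetching on and -S 6 uses a 6 GiB buffer (see ../README.md).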
26 | THROUGHPUT=`sudo ../src/cxlMemTest -p $CLOSEST_CORE -f -t $i -S 6 -n $k -T 1 -o $op -i $ITERATION -g $batch_size -r | awk '/get_bw/ {print}'` 27 | BW=`echo $THROUGHPUT | awk '{print $(NF-1)}'` 28 | echo $BW >> $CURR_RESULT_PATH/${i}_${op}.txt 29 | echo $THROUGHPUT 30 | batch_size=$((batch_size*2)) 31 | done 32 | done 33 | done 34 | done 35 | -------------------------------------------------------------------------------- /memo_ae/evaluation/figure_4b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NUM_THREADS=32 3 | STEP=2 4 | ITERATION=3 5 | OP_MAX=3 6 | 7 | bash ../../util_scripts/config_all.sh 8 | source ../../util_scripts/env.sh 9 | FOLDER_NAME="figure_4b" 10 | 11 | # ======================================= 12 | # test 2-32 thread BW, stepping = 2 13 | # ======================================= 14 | 15 | for ((j=0;j<=$NODE_MAX;j++)); do 16 | CURR_RESULT_PATH=../results/$FOLDER_NAME 17 | mkdir -p $CURR_RESULT_PATH 18 | 19 | for ((k=0;k<=$OP_MAX;k++)); do 20 | 21 | for ((i=0;i<=$NUM_THREADS;i=i+$STEP)); do 22 | if [ $i == 0 ];then 23 | continue 24 | fi 25 | 26 | # A - B - C 27 | # B is the closest node 28 | # A-B, B-B, B-C are sufficient to show Local-NUMA, Local-Local, Local-CXL 29 | 30 | echo "[TEST] node: $j, op: $k, num_thread: $i......" 31 | THROUGHPUT=`sudo ../src/cxlMemTest -t $i -S 6 -n $j -d $j -T 1 -o $k -i $ITERATION -p $CLOSEST_CORE -f | awk '/get_bw/ {print}'` 32 | BW=`echo $THROUGHPUT | awk '{print $(NF-1)}'` 33 | echo $BW >> $CURR_RESULT_PATH/seq_bw_op${k}_core${CLOSEST_CORE}_mem${j}.txt 34 | echo $THROUGHPUT 35 | done 36 | done 37 | done 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cxl\_type3\_tests 2 | This repository contains two parts: 3 | 1. **memo** -- a versatile benchmark for CXL-related memory behaviors and characterizations. 4 | 2. **caption** -- a performance tuning (based on memory page allocation) tool to maximize the system memory bandwidth utilazation in a CXL-enabled system. 5 | 6 | They are the correpsonding artifacts of the paper `Demystifying CXL Memory with True CXL-Ready Systems and CXL Memory Devices (MICRO'23)`, the first research work of CXL memory characterization based on real CXL hardware devices. 7 | 8 | For the artifact evaluation configurations, please refer to [link to config repo](https://github.com/ece-fast-lab/cxl_type3_tests_ae) 9 | 10 | ### Contact 11 | 12 | For any questions, please :e-mail: . 13 | 14 | Thank you! 
:wink: 15 | 16 | 17 | ## [Related Publication](https://doi.org/10.1145/3613424.3614256) 18 | 19 | 20 | ```bibtex 21 | @inproceedings {sun-memo, 22 | author = {Sun, Yan and Yuan, Yifan and Yu, Zeduo and Kuper, Reese and Song, Chihun and Huang, Jinghan and Ji, Houxiang and Agarwal, Siddharth and Lou, Jiaqi and Jeong, Ipoom and Wang, Ren and Ahn, Jung Ho and Xu, Tianyin and Kim, Nam Sung}, 23 | title = {Demystifying {CXL} memory with genuine {CXL}-ready systems and devices}, 24 | booktitle = {Proceedings of the 48th IEEE/ACM International Symposium on Microarchitecture (MICRO'23)}, 25 | year = {2023}, 26 | } 27 | ``` 28 | -------------------------------------------------------------------------------- /memo_ae/evaluation/figure_3.sh: -------------------------------------------------------------------------------- 1 | ITERATION=10000 2 | 3 | bash ../../util_scripts/config_all.sh 4 | source ../../util_scripts/env.sh 5 | 6 | test_block_lats() { 7 | echo "[INFO] Test started" 8 | echo "CLOSEST_NODE: $CLOSEST_NODE" 9 | echo "CLOSEST_CORE: $CLOSEST_CORE" 10 | echo "TSC_FREQ: $TSC_FREQ" 11 | FOLDER_NAME="figure_3_memo" 12 | 13 | for ((k=0;k<=$NODE_MAX;k=k+1)); do # node 14 | for ((j=0;j<4;j=j+1)); do # op 15 | CURR_RESULT_PATH=../results/$FOLDER_NAME 16 | mkdir -p $CURR_RESULT_PATH 17 | 18 | #echo "[TEST] $i $j $k......" 19 | echo "[TEST] op: $j node: $k, core: $CLOSEST_CORE......" 20 | LINE=`sudo ../src/cxlMemTest -p $CLOSEST_CORE -t 1 -S 1 -n $k -T 3 -o $j -i $ITERATION -B -F $TSC_FREQ | awk '/Median latency/ {print}'` 21 | echo $LINE 22 | 23 | LATS=`echo $LINE | awk '{print $(NF-2)}' | grep -Eo '[+-]?[0-9]+([.][0-9]+)?'` 24 | echo $LATS >> $CURR_RESULT_PATH/block_lats_n${k}.txt 25 | echo $LATS 26 | done 27 | done 28 | } 29 | 30 | test_mlc_lats() { 31 | echo "running mlc ..." 32 | sudo ../app/mlc_linux/mlc --latency_matrix > ../results/figure_3_mlc/mlc.txt 33 | 34 | echo "done!" 35 | } 36 | 37 | mkdir -p ../results/figure_3_mlc 38 | echo "testing with MLC ... " 39 | test_mlc_lats 40 | 41 | echo "testing with MEMO ... 
" 42 | test_block_lats 43 | -------------------------------------------------------------------------------- /caption_ae/config.py: -------------------------------------------------------------------------------- 1 | class bcolors: 2 | HEADER = '\033[95m' 3 | OKBLUE = '\033[94m' 4 | OKCYAN = '\033[96m' 5 | OKGREEN = '\033[92m' 6 | WARNING = '\033[93m' 7 | FAIL = '\033[91m' 8 | ENDC = '\033[0m' 9 | BOLD = '\033[1m' 10 | UNDERLINE = '\033[4m' 11 | 12 | LOG_NONE = 0 13 | LOG_ACTION = LOG_NONE + 1 14 | LOG_DEBUG = LOG_ACTION + 1 15 | LOG_METRIC = LOG_DEBUG + 1 16 | 17 | DO_LOG = LOG_DEBUG 18 | ACTION_ENABLE = True 19 | #ACTION_ENABLE = False 20 | 21 | IL_TOP_RESET = 10 22 | IL_BOT_MAX = IL_TOP_RESET * 2 23 | IL_BOT_RESET = 1 24 | STEP_RESET = 3 25 | MIN_STEP = 1 26 | 27 | 28 | WINDOW_SIZE=5 29 | 30 | # Config #1 31 | RESET_THRESHOLD = 500 32 | TUNE_TRESHOLD = 0.2 33 | IDLE_THRESHOLD = 70000 34 | ALLOC_THRESHOLD = 150000 35 | ALLOC_DROP_THRESHOLD = 50000 36 | 37 | scale_dict = { } 38 | 39 | metric_dict = { 40 | 'norm_ipc': 0, 41 | 'L1.miss.lats': 0, 42 | 'DDR.read.lats': 0, 43 | } 44 | 45 | # Model from R-studio 46 | #norm_ipc 99.55281 11.24329 8.854 2.35e-08 *** 47 | #l1_lat -0.04686 0.01539 -3.045 0.00639 ** 48 | #ddr_lat -0.48751 0.14351 -3.397 0.00286 ** 49 | coeff_dict = { 50 | 'norm_ipc': 99.55281, 51 | 'L1.miss.lats': -0.04686, 52 | 'DDR.read.lats': -0.48751, 53 | } 54 | 55 | pmu_translateion = { } 56 | 57 | pcm_translateion = { 58 | 'norm_ipc': 'pcm_norm_ipc', 59 | 'L1.miss.lats': 'pcm_l1miss', 60 | 'DDR.read.lats': 'pcm_ddrReadLat', 61 | } 62 | -------------------------------------------------------------------------------- /caption_ae/action.py: -------------------------------------------------------------------------------- 1 | from config import * 2 | import subprocess 3 | 4 | def log_action(color, log): 5 | if DO_LOG >= LOG_ACTION: 6 | print("[ACTION] === " + color + log + bcolors.ENDC + " ===") 7 | pass 8 | def log_metric(tag, log): 9 | log = str(log) 10 | log = tag + " " + log 11 | if DO_LOG >= LOG_METRIC: 12 | print("[METRIC] === " + bcolors.UNDERLINE + log + bcolors.ENDC + " ===") 13 | pass 14 | pass 15 | def log_debug(log): 16 | log = str(log) 17 | if DO_LOG >= LOG_DEBUG: 18 | print("[DEBUG] === " + bcolors.BOLD + log + bcolors.ENDC + " ===") 19 | pass 20 | 21 | def update_metric(pcm_dict, pmu_dict): 22 | #if len(pcm_dict) == 0 or len(pmu_dict) == 0: 23 | # return 24 | for i, (v, k) in enumerate(pcm_translateion.items()): 25 | if k not in pcm_dict: 26 | continue 27 | val = pcm_dict[k] 28 | if 'log' in v: 29 | val = math.log10(val) 30 | if "2)2" in v: 31 | val = val * val 32 | metric_dict[v] = val 33 | log_metric(v, val) 34 | 35 | for i, (v, k) in enumerate(pmu_translateion.items()): 36 | if k not in pmu_dict: 37 | continue 38 | val = pmu_dict[k] 39 | if 'log' in v: 40 | val = math.log10(val) 41 | if "2)2" in v: 42 | val = val * val 43 | metric_dict[v] = pmu_dict[k] 44 | log_metric(v, val) 45 | 46 | def set_ratio(top, bot): 47 | if ACTION_ENABLE: 48 | subprocess.run(['sudo','sysctl','-w','vm.numa_tier_interleave_top='+str(top)]) 49 | subprocess.run(['sudo','sysctl','-w','vm.numa_tier_interleave_bot='+str(bot)]) 50 | 51 | def reset_default(): 52 | set_ratio(IL_TOP_RESET, IL_BOT_RESET) 53 | -------------------------------------------------------------------------------- /memo_ae/src/main.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Developed by FAST Lab @ ECE-UIUC -- 2022-2023 3 | */ 4 | #include "util.h" 5 | #include "test.h" 6 
| #include 7 | #include 8 | #include 9 | #include 10 | 11 | int main(int argc, char*argv[]) { 12 | int ret; 13 | test_cfg_t* cfg; 14 | cfg = malloc(sizeof(test_cfg_t)); 15 | 16 | ret = parse_arg(argc, argv, cfg); 17 | if (ret < 0) { 18 | if (ret == -1) { 19 | printf("BAD parse_arg\n"); 20 | } 21 | goto out; 22 | } 23 | 24 | ret = init_buf(cfg->total_buf_size, cfg->buf_a_numa_node, &(cfg->buf_a)); 25 | if (ret < 0) { 26 | if (ret == -1) { 27 | printf("BAD init_buf buf_a, fail to alloc\n"); 28 | goto out; 29 | } else { // already alloc, needs to free 30 | printf("BAD init_buf buf_a, alloc strange\n"); 31 | goto out1; 32 | } 33 | } 34 | 35 | if (cfg->op == MOV) { 36 | ret = init_buf(cfg->total_buf_size, cfg->buf_b_numa_node, &(cfg->buf_b)); 37 | if (ret < 0) { 38 | if (ret == -1) { 39 | printf("BAD init_buf buf_b, fail to alloc\n"); 40 | goto out1; // free buf_a 41 | } else { // already alloc, needs to free 42 | printf("BAD init_buf buf_b, alloc strange\n"); 43 | goto out2; // free buf_b then buf_a 44 | } 45 | } 46 | } 47 | 48 | ret = run_test(cfg); 49 | 50 | ret = get_node(cfg->buf_a, cfg->total_buf_size); 51 | printf("end, buf_a is on node %d\n", ret); 52 | 53 | if (cfg->op == MOV) { 54 | ret = get_node(cfg->buf_b, cfg->total_buf_size); 55 | printf("end, buf_b is on node %d\n", ret); 56 | } 57 | 58 | out2: 59 | numa_free(cfg->buf_b, cfg->total_buf_size); 60 | out1: 61 | numa_free(cfg->buf_a, cfg->total_buf_size); 62 | out: 63 | free(cfg); 64 | return 0; 65 | } 66 | -------------------------------------------------------------------------------- /caption_ae/metrics/vmstat_mon.py: -------------------------------------------------------------------------------- 1 | #!/home/zeduoyu2/anaconda3/bin/python3 2 | 3 | import os 4 | import subprocess 5 | import time 6 | import sys 7 | import signal 8 | from queue import Queue, Empty 9 | from threading import Thread 10 | import re 11 | import time 12 | 13 | class vmstat_metric: 14 | def __init__(self) -> None: 15 | self.cnt = 0 16 | self.val = 0 17 | 18 | def run_realtime(self, interval=1000, print_info=True) -> None: 19 | # define and start the parsing threads 20 | def catch_output(): 21 | prev_val = 0 22 | curr_val = 0 23 | while(True): 24 | output = subprocess.check_output("cat /proc/vmstat", shell=True) 25 | output = output.splitlines() 26 | target_line = output[66] 27 | target_line = target_line.decode("utf-8") 28 | curr_val = int(target_line.split()[-1]) 29 | ''' 30 | for idx, line in enumerate(output): 31 | line = line.decode("utf-8") 32 | if "pgalloc_normal" not in line: 33 | continue 34 | else: 35 | curr_val = int(line.split()[-1]) 36 | print(idx) 37 | break 38 | ''' 39 | 40 | self.val = curr_val - prev_val 41 | prev_val = curr_val 42 | time.sleep(1) 43 | 44 | t = Thread(target=catch_output) 45 | t.daemon = True 46 | t.start() 47 | 48 | if print_info: 49 | while(True): 50 | time.sleep(1) 51 | 52 | 53 | def get_stat(self) -> int: 54 | return self.val 55 | 56 | if __name__ == "__main__": 57 | vmstat = vmstat_metric() 58 | vmstat.run_realtime(print_info=False) 59 | while(True): 60 | print(vmstat.get_stat()) 61 | time.sleep(1) 62 | -------------------------------------------------------------------------------- /memo_ae/src/util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Developed by FAST Lab @ ECE-UIUC -- 2022-2023 3 | */ 4 | #ifndef UTIL_H 5 | #define UTIL_H 6 | 7 | #include 8 | #include 9 | 10 | #define DEBUG 1 11 | #define debug_print(fmt, ...) 
\ 12 | do { if (DEBUG) fprintf(stderr, "%s:%d:%s(): " fmt, __FILE__, \ 13 | __LINE__, __func__, __VA_ARGS__); } while (0) 14 | 15 | 16 | /* text color */ 17 | #define RED "\x1B[31m" 18 | #define GRN "\x1B[32m" 19 | #define YEL "\x1B[33m" 20 | #define BLU "\x1B[34m" 21 | #define MAG "\x1B[35m" 22 | #define CYN "\x1B[36m" 23 | #define WHT "\x1B[37m" 24 | #define RESET "\x1B[0m" 25 | 26 | typedef struct chase_struct chase_t; 27 | 28 | struct chase_struct { 29 | // 64-bit addr, 64 * 64 = 512 bit per cacheline 30 | chase_t* ptr_arr[8]; 31 | }; 32 | 33 | typedef enum test_op { 34 | READ, 35 | READ_NT, 36 | WRITE, 37 | WRITE_NT, 38 | MOV, 39 | MIXED /* mix read and write */ 40 | } test_op_t; 41 | 42 | typedef enum test_type { 43 | LATS_CLFLUSH, 44 | BW, 45 | LATS_CHASE, 46 | BLOCK_LATS 47 | } test_type_t; 48 | 49 | typedef struct test_cfg { 50 | // overall 51 | uint64_t num_thread; 52 | uint64_t total_buf_size; 53 | int buf_a_numa_node; 54 | int buf_b_numa_node; 55 | char* buf_a; 56 | char* buf_b; 57 | bool prefetch_en; 58 | int bw_granu; // number of cache line (n * 64B) 59 | double tsc_freq; // GHz 60 | 61 | // thread 62 | int thread_idx; 63 | int core_a; 64 | int core_b; 65 | char* start_addr_a; 66 | char* start_addr_b; 67 | uint64_t per_thread_size; // num byte per thread 68 | int op_iter; 69 | test_type_t type; 70 | test_op_t op; 71 | int starting_core; 72 | bool random; 73 | int stall_ratio; 74 | int read_ratio; /* computed by (read / write) */ 75 | bool flush_block; 76 | int num_clear_pipe; 77 | 78 | // monitoring 79 | volatile uint64_t curr_op_cnt; 80 | 81 | // thread sync 82 | volatile int halt; 83 | 84 | } test_cfg_t; 85 | 86 | int parse_arg(int argc, char*argv[], test_cfg_t* cfg); 87 | 88 | int get_node(void* p, uint64_t size); 89 | 90 | int init_buf(uint64_t size, int node, char** alloc_ptr); 91 | 92 | uint64_t read_MSR(int cpu); 93 | 94 | void write_MSR(int cpu, uint64_t val); 95 | 96 | void disable_prefetch(int cpu); 97 | 98 | void enable_prefetch(int cpu); 99 | 100 | uint64_t xorshf96(uint64_t* x); 101 | 102 | void flush_all_cache(); 103 | 104 | #endif // UTIL_H 105 | -------------------------------------------------------------------------------- /caption_ae/algo.py: -------------------------------------------------------------------------------- 1 | from action import * 2 | from config import * 3 | 4 | # Input (tracked by the caller) 5 | # curr state 6 | # prev state 7 | # prev stepping 8 | # bot ratio 9 | # Output (returned to the caller) 10 | # curr stepping 11 | # new bot ratio 12 | 13 | # In this implementation, the top is fixed to some ratio 14 | # The bot is tunned in some range of value 15 | 16 | def algo(dynamic_state, static_state, prev_step, bot_ratio): 17 | diff = dynamic_state - static_state 18 | abs_diff = abs(diff) 19 | log_debug("PRE: diff:{0}, prev_step:{1}, bot_ratio:{2}".format(diff, prev_step, bot_ratio)) 20 | log_debug("PRE: dyn:{0}, stc:{1}".format(dynamic_state, static_state)) 21 | 22 | # ================================== pre-condition 23 | if dynamic_state > IDLE_THRESHOLD: 24 | log_action(bcolors.OKGREEN, "Pass -- idle") 25 | return MIN_STEP, bot_ratio 26 | elif abs_diff < TUNE_TRESHOLD: 27 | log_action(bcolors.OKGREEN, "Pass -- stable") 28 | set_ratio(IL_TOP_RESET, bot_ratio) 29 | if prev_step < 0: 30 | return -MIN_STEP , bot_ratio 31 | else: 32 | return MIN_STEP , bot_ratio 33 | elif abs_diff > RESET_THRESHOLD: 34 | log_action(bcolors.WARNING, "Reset") 35 | reset_default() 36 | return STEP_RESET, -1 37 | 38 | # ================================== step 39 
| curr_step = prev_step 40 | log_action(bcolors.OKBLUE, "Tune") 41 | if diff > 0: # gets better 42 | log_action(bcolors.OKGREEN, "better") 43 | curr_step = prev_step 44 | else: # gets worse 45 | curr_step = -prev_step / 2 # apply reversed half step 46 | log_action(bcolors.OKCYAN, "worse, pre-bound step = " + str(curr_step)) 47 | 48 | # ================================== bound step 49 | if curr_step < MIN_STEP and curr_step > -MIN_STEP: 50 | if curr_step < 0: 51 | curr_step = -MIN_STEP 52 | else: 53 | curr_step = MIN_STEP 54 | curr_step = int(curr_step) 55 | log_debug("post-bound step = " + str(curr_step)) 56 | 57 | ## ================================== bound ratio 58 | bot_ratio += curr_step 59 | if bot_ratio <= 1: # cap at ddr:cxl = 10:1 60 | log_action(bcolors.WARNING, "lower bound: bot{0}, step{1}".format(bot_ratio, curr_step)) 61 | bot_ratio = 1 62 | elif bot_ratio >= (IL_BOT_MAX): 63 | log_action(bcolors.WARNING, "upper bound: bot{0}, step{1}".format(bot_ratio, curr_step)) 64 | bot_ratio = IL_BOT_MAX 65 | 66 | # ================================== set 67 | bot_ratio = int(bot_ratio) 68 | set_ratio(IL_TOP_RESET, bot_ratio) 69 | 70 | log_debug("POST: curr_step:{0}, prev_step:{1}, bot_ratio:{2}".format(curr_step, prev_step, bot_ratio)) 71 | log_debug("POST: dynamic_state:{0}, static_state:{1}, bot_ratio:{2}".format(dynamic_state, static_state, bot_ratio)) 72 | return curr_step, bot_ratio 73 | 74 | # Linear function 75 | def calculate_state(): 76 | ret = 0 77 | for i, (k, v) in enumerate(coeff_dict.items()): 78 | # Log/square applied in the translation 79 | mul = v * metric_dict[k] 80 | ret += mul 81 | #print(mul, v, metric_dict[k]) 82 | return ret 83 | -------------------------------------------------------------------------------- /caption_ae/README.md: -------------------------------------------------------------------------------- 1 | # Caption 2 | 3 | ## Setup 4 | ### Prerequisite 5 | - Must 6 | - Python 3 7 | - Linux Kernel with N:M interleaving [patch](https://lore.kernel.org/linux-mm/YqD0%2FtzFwXvJ1gK6@cmpxchg.org/T/) 8 | + The patch added a tunable parameter (numa\_tier\_interleave) in `vm_table` in `kernel/sysctl.c` 9 | + In our case, we use two parameters to control the top and bot ratio independently. 10 | * `numa_tier_interleave_top` for top tier 11 | * `numa_tier_interleave_bot` for bot tier 12 | + The rest of the patch is applied without any modification 13 | - Intel PCM 14 | + Please follow [intel-pcm](https://github.com/intel/pcm) to clone and build PCM. 15 | + Please update the `PCM_PATH` in `metrics/pcm_mon.py` to your pcm binary path. 16 | 17 | ### Clone 18 | ```bash 19 | $ git clone https://github.com/ece-fast-lab/cxl_type3_tests.git 20 | $ cd caption_ae 21 | ``` 22 | 23 | ## Notes 24 | * The interleaving ratio is applied to `libnuma`, `numactl --interleave` calls for memory interleaving **allocations**. Therefore, the ratio is only applied upon new memory allocations. This is orthogonal to works on memory migration. 25 | * Currently, `Caption` assumes major memory allocation happens when the application launches, and thus, the tuning happesns at the end of each iteration of an application and before its next launch. 26 | * Caption is independent of application output and only monitors system performance counters. However, in some cases, it may be desirable to have application output as a feed back on the direction of tunning. We leave the enhancement of the monitoring scheme as a future work. 
27 | * Although `IPC` alone may seem sufficient, `L1 latency` and `DDR latency` are there to assist the model in identifying subtle changes in the application's performance. 28 | 29 | ## Known issues 30 | * If the tuning time interval is too small, `Caption` may not be able to capture enough information about the system state. 31 | * If the tuning stepping is too small, `Caption` may not be able to correctly identify whether the tuning direction is correct -- i.e. the performance difference is too subtle. 32 | 33 | ## Arguments 34 | | Argument | Brief description | Default | Valid inputs | Note | 35 | | -------- | ----------------- | ------- | ------------ | ---- | 36 | | h | Help message generated by `argparse` | - | - | - | 37 | | x | Stepping mode | - | - | Test the Caption model by simply iterating through the interleaving ratio. By default, this will iterate from DDR:CXL = 10:1 and increase the CXL ratio by 2, i.e. 10:3, 10:5 ...| 38 | | n | No tune | - | - | This is used for monitoring the model output at a fixed interleaving ratio. The tuning algorithm is not applied in this case.| 39 | | s | Test synchronous mode, shell script path | - | String, path to a shell script | The script may contain multiple programs, where each program may execute in the background with '&'. In this case, tuning happens when the shell script exits. We provide an example script in the `example_input` folder.| 40 | | t | Test asynchronous mode, txt file path | - | String, path to a txt file | The txt file may contain multiple shell scripts. Each script will be executed by the python program in a separate thread. In this case, tuning happens whenever one of the shell scripts exits. You may change the tuning mask (`tune_mask` in `caption.py`) to enable tuning only when a specific thread (script) ends.
We provide an example txt in the `example_input`.| 41 | 42 | ## Example usage 43 | ### Syncrhnous mode 44 | ``` 45 | $ python3 caption.py -s example_input/sync_tune.sh 46 | ``` 47 | 48 | ### Asynchronous mode 49 | ``` 50 | $ python3 caption.py -t example_input/async_tune.sh 51 | ``` 52 | -------------------------------------------------------------------------------- /caption_ae/metrics/slab_mon.py: -------------------------------------------------------------------------------- 1 | #!/home/zeduoyu2/anaconda3/bin/python3 2 | 3 | import os 4 | import subprocess 5 | import time 6 | import sys 7 | import signal 8 | from queue import Queue, Empty 9 | from threading import Thread 10 | 11 | class slab_metric: 12 | def __init__(self, window_size=5) -> None: 13 | self.slab_realtime_cmd = ["sudo", "python3", "/home/yans3/bcc/tools/slabratetop.py", "-C"] 14 | self.stats = {'alloc_sum':[]} 15 | self.cnt = 0 16 | self.moving_sum = 0 17 | self.window_size = window_size 18 | for i in range(window_size): 19 | self.stats['alloc_sum'].append(0) 20 | 21 | def run_realtime(self, interval=1000, print_info=True) -> None: 22 | 23 | # define the SIGINT handler 24 | def signal_handler(sig, frame): 25 | os.killpg(os.getpgid(self.p_slab.pid), signal.SIGINT) 26 | print('[INFO] You pressed Ctrl+C!') 27 | sys.exit(0) 28 | 29 | #signal.signal(signal.SIGINT, signal_handler) 30 | print('[INFO] Press Ctrl+C to exit') 31 | 32 | cmd = self.slab_realtime_cmd 33 | print("[COMMAND]", cmd) 34 | self.p_slab = subprocess.Popen(cmd, text=True, stdout=subprocess.PIPE, preexec_fn=os.setsid) 35 | 36 | # define and start the recording threads 37 | def enqueue_output(stdout, queue): 38 | for line in stdout: 39 | queue.put(line) 40 | stdout.close() 41 | 42 | q_slab = Queue() 43 | t_slab = Thread(target=enqueue_output, args=(self.p_slab.stdout, q_slab)) 44 | t_slab.daemon = True 45 | t_slab.start() 46 | 47 | # define and start the parsing threads 48 | def catch_output(q:Queue): 49 | curr_sum = 0 50 | while(True): 51 | try: 52 | line = q.get_nowait() # or q.get(timeout=.1) 53 | except Empty: 54 | time.sleep(interval / 1000) # tune to 0.5 just in case 55 | else: # got line 56 | if line.isspace(): continue 57 | line = line.split() 58 | try: 59 | float(line[-1]) 60 | except ValueError: 61 | continue 62 | 63 | if "loadavg" in line[1]: 64 | self.moving_sum += curr_sum 65 | self.cnt += 1 66 | if self.cnt >= self.window_size: 67 | self.moving_sum -= self.stats['alloc_sum'][self.cnt % self.window_size] 68 | 69 | if print_info: 70 | print('mov', self.moving_sum, 'curr', curr_sum) 71 | print(self.stats['alloc_sum']) 72 | 73 | self.stats['alloc_sum'][self.cnt % self.window_size] = curr_sum 74 | curr_sum = 0 75 | else: 76 | curr_sum += int(line[1]) 77 | 78 | if print_info: 79 | print(line) 80 | 81 | 82 | t_catch_latency = Thread(target=catch_output, args=[q_slab]) 83 | t_catch_latency.daemon = True 84 | t_catch_latency.start() 85 | 86 | if print_info: 87 | while(True): 88 | time.sleep(1) 89 | 90 | 91 | def get_stat(self) -> int: 92 | return self.moving_sum / self.window_size 93 | 94 | if __name__ == "__main__": 95 | slab = slab_metric(1) 96 | slab.run_realtime(print_info=True) 97 | while(True): 98 | print(slab.get_stat()) 99 | time.sleep(1) 100 | -------------------------------------------------------------------------------- /caption_ae/caption_ctrl.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | from threading import Thread 4 | import math 5 | import threading 6 | import 
statistics 7 | 8 | from metrics.pcm_mon import * 9 | from metrics.vmstat_mon import * 10 | 11 | from algo import * 12 | from action import * 13 | from config import * 14 | 15 | MIN_SAMPLE_CNT = 20 16 | 17 | print("=======================") 18 | print("reset basics: top:{0}, bot_max:{1}, bot_rst:{2}, step_rst:{3}".format( 19 | IL_TOP_RESET, 20 | IL_BOT_MAX, 21 | IL_BOT_RESET, 22 | STEP_RESET 23 | )) 24 | print("=======================") 25 | 26 | class caption_ctrl: 27 | def __init__(self) -> None: 28 | self.arr = {'norm_ipc': [], 29 | 'L1.miss.lats': [], 30 | 'DDR.read.lats': []} 31 | self.prev_state = -1 32 | self.prev_step = STEP_RESET 33 | self.prev_ratio = IL_BOT_RESET 34 | 35 | def run_realtime(self, log_level=LOG_DEBUG)->None: 36 | pcm = pcm_metric() 37 | pcm_thread = Thread(target=pcm.run_realtime, args=(False,)) 38 | pcm_thread.start() 39 | reset_default() 40 | 41 | def catch_output(): 42 | cnt = 0 43 | while(True): 44 | update_metric(pcm.get_stat(WINDOW_SIZE), {}) 45 | 46 | self.arr['norm_ipc'].append(metric_dict['norm_ipc']) 47 | self.arr['L1.miss.lats'].append(metric_dict['L1.miss.lats']) 48 | self.arr['DDR.read.lats'].append(metric_dict['DDR.read.lats']) 49 | time.sleep(1) 50 | cnt += 1 51 | 52 | t = Thread(target=catch_output) 53 | t.daemon = True 54 | t.start() 55 | 56 | ''' 57 | while(True): 58 | time.sleep(1) 59 | self.prev_state += 1 60 | ''' 61 | 62 | def get_set_tune_val(self, first_time=False, model_only=False): 63 | log_action(bcolors.OKCYAN, "=============== TUNE ================ ") 64 | 65 | # get 66 | if len(self.arr['norm_ipc']) > MIN_SAMPLE_CNT: 67 | log_action(bcolors.OKCYAN, "=============== TUNE -- valid ================ ") 68 | accu_avg = 0 69 | # linear equation, SUM(coeff * mean of sample) 70 | for k, v in self.arr.items(): 71 | mean = sum(v) / len(v) 72 | accu_avg += coeff_dict[k] * mean 73 | self.arr[k] = [] 74 | print(k, mean) 75 | log_debug("=============== accu_avg -- {0} ================ ".format(accu_avg)) 76 | if model_only: 77 | return 78 | else: 79 | log_action(bcolors.OKCYAN, "=============== TUNE -- not enough samples ================ ") 80 | log_action(bcolors.OKCYAN, "=============== TUNE -- need: %d, has: %d samples ================ " % (MIN_SAMPLE_CNT, 81 | len(self.arr['norm_ipc']))) 82 | return 83 | 84 | log_action(bcolors.OKCYAN, "=============== TUNE -- algo ================ ") 85 | # first time tuning always attempt to tune toward more CXL 86 | if first_time: 87 | self.prev_state = accu_avg - 1 88 | step, ratio = algo(accu_avg, self.prev_state, self.prev_step, self.prev_ratio) 89 | 90 | log_action(bcolors.OKCYAN, "=============== TUNE -- track states ================ ") 91 | self.prev_state = accu_avg 92 | self.prev_step = step 93 | if ratio < 0: 94 | self.prev_ratio = IL_BOT_RESET 95 | else: 96 | self.prev_ratio = ratio 97 | 98 | if __name__ == "__main__": 99 | tuner = caption_ctrl() 100 | tuner.run_realtime() 101 | while(True): 102 | time.sleep(1) 103 | print(tuner.get_set_tune_val()) 104 | -------------------------------------------------------------------------------- /caption_ae/caption.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import sys 4 | from caption_ctrl import * 5 | import argparse 6 | import subprocess 7 | import threading 8 | import time 9 | 10 | MAX_TUNE_ITER = 7 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-x", "--step", help="stepping mode (10:x+=step). 
Algorithm is not applied, but the output of the estimator will be printed.", action='store_true') 14 | parser.add_argument("-s", "--sh-path", help="shell script that houses the program to be tuned. You may embed several program in this shell script") 15 | parser.add_argument("-n", "--no-tune", help="disable tuning. This will simply run the passed in shell script", action='store_true') 16 | parser.add_argument("-t", "--batch_txt", help="txt script that houses several shell scripts to be exectued. Tuning will happen whenevr a script ended its execution. You may set 'tune_mask' to enable tunning when one of the program ends") 17 | args = parser.parse_args() 18 | 19 | #print(args.echo) 20 | def exec_cmd_and_wait(sh_path): 21 | print(sh_path) 22 | subprocess.call(["sudo", "bash", sh_path]) 23 | 24 | def exec_cmd_and_wait_arg(sh_path): 25 | sh_path_arr = sh_path.split() 26 | sh_path_arr.insert(0, "bash") 27 | sh_path_arr.insert(0, "sudo") 28 | print(sh_path_arr) 29 | subprocess.call(sh_path_arr) 30 | 31 | def sync_tune(): 32 | # start monitor 33 | tuner = caption_ctrl() 34 | tuner.run_realtime() 35 | time.sleep(5) 36 | 37 | tune_iter = 0 38 | first_time = True 39 | 40 | stepping_ratio = 1 41 | 42 | try: 43 | while True: 44 | print("sync tuning iteration: %d" % (tune_iter)) 45 | 46 | # run exec 47 | exec_cmd_and_wait(args.sh_path) 48 | 49 | # tune 50 | if args.no_tune: 51 | print(" ============ no tune ============== ") 52 | if args.step is not None: 53 | print(" ============ no tune, stepping only ============== ") 54 | tuner.get_set_tune_val(first_time, model_only=True) 55 | stepping_ratio += 2 56 | set_ratio(10, stepping_ratio) 57 | else: 58 | tuner.get_set_tune_val(first_time) 59 | 60 | first_time = False 61 | time.sleep(5) 62 | tune_iter += 1 63 | 64 | if tune_iter >= MAX_TUNE_ITER: 65 | break 66 | except KeyboardInterrupt: 67 | print("ended with ctrl-c") 68 | 69 | def async_tune(): 70 | # start monitor 71 | tuner = caption_ctrl() 72 | tuner.run_realtime() 73 | time.sleep(5) 74 | 75 | tune_iter = 0 76 | first_time = True 77 | 78 | # read path 79 | # (t, sh_path) 80 | thread_arr = [] 81 | tune_mask = [] 82 | with open(args.batch_txt) as f: 83 | for line in f.readlines(): 84 | line = line.strip() 85 | print(line) 86 | t = threading.Thread(target=exec_cmd_and_wait_arg, args=(line,)) 87 | thread_arr.append((t, line)) 88 | 89 | # FIXME, set to the desired mask 90 | tune_mask.append(True) 91 | # for example: tune whenever "roms" ended 92 | #tune_mask.append("roms" in line) 93 | 94 | for t, _ in thread_arr: 95 | t.start() 96 | 97 | print("tune_mask: ", tune_mask) 98 | 99 | try: 100 | while True: 101 | print("async tuning iteration: %d" % (tune_iter)) 102 | 103 | found_end = False 104 | # This will find the targeted ending thread 105 | while found_end is False: 106 | 107 | for idx, (t, sh_path) in enumerate(thread_arr): 108 | if not t.is_alive(): 109 | print("ended %s" % sh_path) 110 | 111 | # restart 112 | new_t = threading.Thread(target=exec_cmd_and_wait_arg, args=(sh_path,)) 113 | thread_arr[idx] = (new_t, sh_path) 114 | new_t.start() 115 | 116 | # stop 117 | if tune_mask[idx]: 118 | print("tune!") 119 | found_end = True 120 | break 121 | 122 | # sleep, avoid spinning 123 | time.sleep(1) 124 | 125 | # tune 126 | if args.no_tune: 127 | print(" ============ no tune ============== ") 128 | else: 129 | tuner.get_set_tune_val(first_time) 130 | 131 | first_time = False 132 | time.sleep(5) 133 | tune_iter += 1 134 | 135 | if tune_iter >= MAX_TUNE_ITER: 136 | break 137 | 138 | except KeyboardInterrupt: 
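# NOTE: threading.Thread has no stop() method, so the t.stop() calls below will
# raise AttributeError; making the worker threads daemon threads (or join()-ing
# them) would be a cleaner way to shut down on Ctrl-C.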
139 | print("ctrl-c pressed") 140 | for t, _ in thread_arr: 141 | t.stop() 142 | print("ended with ctrl-c") 143 | 144 | if __name__ == "__main__": 145 | if args.batch_txt is not None: 146 | async_tune() 147 | else: 148 | sync_tune() 149 | 150 | -------------------------------------------------------------------------------- /caption_ae/metrics/pmu_mon.py: -------------------------------------------------------------------------------- 1 | #!/home/zeduoyu2/anaconda3/bin/python3 2 | 3 | import os 4 | import subprocess 5 | import time 6 | import sys 7 | import signal 8 | from queue import Queue, Empty 9 | from threading import Thread 10 | 11 | class pmu_metric: 12 | def __init__(self, nodes=["Backend_Bound.Memory_Bound.DRAM_Bound.MEM_Latency", "Backend_Bound.Memory_Bound.L1_Bound", 13 | "Backend_Bound.Memory_Bound.DRAM_Bound.MEM_Bandwidth"], output_path="result/pmu_tools") -> None: 14 | 15 | self.node_list = nodes 16 | self.node_names = ','.join(nodes) 17 | self.output_path = output_path 18 | self.numaRatio_cmd = "sudo sysctl -w vm.numa_tier_interleave={ratio}" 19 | self.numaRatioTop_cmd = "sudo sysctl -w vm.numa_tier_interleave_top={top}" 20 | self.numaRatioBot_cmd = "sudo sysctl -w vm.numa_tier_interleave_bot={bottom}" 21 | self.toplev_cmd = ["sudo", "/home/yans3/pmu-tools/toplev", "-x,", "-o", "{filename}", "--no-desc", "-I", "1000", "-v", "--nodes", 22 | "!" + self.node_names] 23 | self.toplev_realtime_cmd = ["sudo", "/home/yans3/pmu-tools/toplev", "--no-desc", "-I", "1000", "-v", 24 | "--nodes", "!" + self.node_names] 25 | # self.stats = {"L1_Bound":[], "BW_Bound":[], "Lat_Bound":[]} 26 | self.stats = {} 27 | for node in self.node_list: 28 | self.stats[node] = [] 29 | 30 | if not os.path.exists(self.output_path): 31 | os.mkdir(self.output_path) 32 | 33 | 34 | def set_ratio(self, top:int, bot:int) -> None: 35 | 36 | print("[INFO] Configuring NUMA interleave ratio to %d:%d (DRAM:CXL)"%(top, bot)) 37 | cmd = self.numaRatioTop_cmd.format(top=top) 38 | print("[COMMAND]", cmd) 39 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE, shell=True, text=True) 40 | ret = p.wait() 41 | if ret != 0: 42 | sys.exit(ret) 43 | out = p.communicate()[0] 44 | print("[RETURN] Output:", out) 45 | 46 | cmd = self.numaRatioBot_cmd.format(bottom=bot) 47 | print("[COMMAND]", cmd) 48 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE, shell=True, text=True) 49 | ret = p.wait() 50 | if ret != 0: 51 | sys.exit(ret) 52 | out = p.communicate()[0] 53 | print("[RETURN] Output:", out) 54 | 55 | 56 | 57 | def start_recording(self, filename:str = "pmu_result.csv") -> None: 58 | 59 | # define the SIGINT handler 60 | def signal_handler(sig, frame): 61 | os.killpg(os.getpgid(self.p_toplev.pid), signal.SIGINT) 62 | print('[INFO] You pressed Ctrl+C!') 63 | sys.exit(0) 64 | 65 | signal.signal(signal.SIGINT, signal_handler) 66 | print('[INFO] Press Ctrl+C to exit') 67 | 68 | assert filename.endswith('.csv') 69 | file_path = os.path.join(self.output_path, filename) 70 | cmd = self.toplev_cmd 71 | cmd[4] = file_path 72 | print("[COMMAND]", cmd) 73 | self.p_toplev = subprocess.Popen(cmd, text=True, preexec_fn=os.setsid) 74 | 75 | 76 | def stop_recording(self) -> None: 77 | # self.fp.close() 78 | os.killpg(os.getpgid(self.p_toplev.pid), signal.SIGINT) 79 | print("[INFO] PMU monitoring stopped.") 80 | 81 | 82 | def run_realtime(self, interval=1000, print_info=True) -> None: 83 | 84 | # define the SIGINT handler 85 | def signal_handler(sig, frame): 86 | 
os.killpg(os.getpgid(self.p_toplev.pid), signal.SIGINT) 87 | print('[INFO] You pressed Ctrl+C!') 88 | sys.exit(0) 89 | 90 | #signal.signal(signal.SIGINT, signal_handler) 91 | print('[INFO] Press Ctrl+C to exit') 92 | 93 | cmd = self.toplev_realtime_cmd 94 | cmd[4] = str(interval) 95 | print("[COMMAND]", cmd) 96 | self.p_toplev = subprocess.Popen(cmd, text=True, stderr=subprocess.PIPE, preexec_fn=os.setsid) 97 | 98 | # define and start the recording threads 99 | def enqueue_output(stdout, queue): 100 | for line in stdout: 101 | queue.put(line) 102 | stdout.close() 103 | 104 | q_toplev = Queue() 105 | t_toplev = Thread(target=enqueue_output, args=(self.p_toplev.stderr, q_toplev)) 106 | t_toplev.daemon = True 107 | t_toplev.start() 108 | 109 | # define and start the parsing threads 110 | def catch_output(q:Queue): 111 | while(True): 112 | try: 113 | line = q.get_nowait() # or q.get(timeout=.1) 114 | except Empty: 115 | time.sleep(interval / 1000) # tune to 0.5 just in case 116 | else: # got line 117 | if line.isspace(): continue 118 | line = line.split() 119 | try: 120 | float(line[0]) 121 | except ValueError: 122 | continue 123 | 124 | node = line[2] 125 | val = line[5] 126 | 127 | if print_info: 128 | print("[RESULT] {node_name:<60} {value:<4} %".format(node_name=node, value=val)) 129 | 130 | self.stats[node].append(val) 131 | 132 | t_catch_latency = Thread(target=catch_output, args=[q_toplev]) 133 | t_catch_latency.daemon = True 134 | t_catch_latency.start() 135 | 136 | if print_info: 137 | while(True): 138 | time.sleep(1) 139 | 140 | 141 | def get_stat(self, window_size:int=5) -> dict: 142 | 143 | res = {} 144 | 145 | # length = len(self.stats["L1_Bound"]) 146 | # if length >= 1: 147 | # series = np.array(self.stats["L1_Bound"][-min(length,window_size):]).astype('float') 148 | # res["L1_Bound"] = series.mean() 149 | 150 | # length = len(self.stats["BW_Bound"]) 151 | # if length >= 1: 152 | # series = np.array(self.stats["BW_Bound"][-min(length,window_size):]).astype('float') 153 | # res["BW_Bound"] = series.mean() 154 | 155 | # length = len(self.stats["Lat_Bound"]) 156 | # if length >= 1: 157 | # series = np.array(self.stats["Lat_Bound"][-min(length,window_size):]).astype('float') 158 | # res["Lat_Bound"] = series.mean() 159 | 160 | for node in self.node_list: 161 | length = len(self.stats[node]) 162 | if length >= 1: 163 | series = np.array(self.stats[node][-min(length,window_size):]).astype('float') 164 | res[node] = series.mean() 165 | 166 | return res 167 | 168 | 169 | if __name__ == "__main__": 170 | 171 | node_names = ["Backend_Bound.Memory_Bound.DRAM_Bound.MEM_Latency", "Backend_Bound.Memory_Bound.L1_Bound", 172 | "Backend_Bound.Memory_Bound.DRAM_Bound.MEM_Bandwidth", "Backend_Bound.Memory_Bound", 173 | "Backend_Bound.Memory_Bound.L2_Bound", "Backend_Bound.Memory_Bound.L3_Bound"] 174 | 175 | pmu = pmu_metric(node_names) 176 | # pmu.set_ratio(10,20) 177 | # pmu.start_recording("dlrm.csv") 178 | pmu.run_realtime(print_info=False) 179 | while(True): 180 | print(pmu.get_stat()) 181 | # print(pmu.stats) 182 | time.sleep(1) 183 | -------------------------------------------------------------------------------- /memo_ae/README.md: -------------------------------------------------------------------------------- 1 | # memo benchmark 2 | 3 | ## Setup 4 | ### Prerequisite 5 | - Must 6 | - Installing cpupower,turbostat 7 | - `sudo apt-get install -y linux-tools-$(uname -r)` 8 | - libnuma installation 9 | - `sudo apt-get install libnuma-dev` 10 | 11 | ### Clone & Build 12 | ```bash 13 | git 
clone https://github.com/ece-fast-lab/cxl_type3_tests.git 14 | cd memo_ae/src 15 | make 16 | ``` 17 | 18 | ### Get Turbo stat 19 | ```bash 20 | # under memo_ae 21 | cd ./test_cxl/ 22 | bash get_turbostat.sh 23 | ``` 24 | 25 | ### Setup `env.sh` 26 | ```bash 27 | # Use your favorite editor to open env.sh 28 | vim ../util_scripts/env.sh 29 | ``` 30 | 1. Set `CLOSEST_NODE`: The NUMA node that the CXL device is directly attached to. 31 | - Command `sudo lspci -vvv` should also show the NUMA node that a CXL device attaches to. 32 | 2. Set `CLOSEST_CORE`: This is one of the CPU cores within `CLOSEST_NODE`. 33 | - Place the first core number of the `CLOSEST_NODE` in `CLOSEST_CORE`. You may find the core range of a NUMA node using this command: `numactl -H`. 34 | - For example, if CXL is connected to NUMA node 1, please place the first CPU in `node 1 cpus: XX, XX+1` (XX) in `CLOSEST_CORE`. 35 | 3. Set `TSC_FREQ`: 36 | - This should come from the `Get Turbo stat` output in `results/turbostat.txt`; it should look like the sample below. 37 | 4. Set `NODE_MAX` 38 | - For a machine with two sockets and one CXL node, this should be set to "2" 39 | - For a machine with one socket and one CXL node, this should be set to "1" 40 | ``` 41 | Core CPU Avg_MHz Busy% Bzy_MHz TSC_MHz IPC IRQ SMI POLL C1 C1E C3 C6 POLL% C1% C1E% C3% C6% CPU%c1 CPU%c3 CPU%c6 CoreTmp CoreThr PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 PkgWatt RAMWatt PKG_% RAM_% 42 | - - 10 0.57 1753 2000 0.52 985 0 14 15 108 0 945 0.00 0.12 0.40 0.00 98.93 2.23 0.00 97.20 22 0 25 2.14 0.00 76.94 25.52 0.00 0.00 0.00 43 | 0 0 4 0.25 1802 2000 1.02 29 0 0 0 0 0 34 0.00 0.00 0.00 0.00 99.76 0.93 0.00 98.83 22 0 25 2.14 0.00 76.95 25.52 0.00 0.00 0.00 44 | ``` 45 | **Note:** 46 | 1. There should be a constant number for all cores, 2000 MHz in the example above. Please set `TSC_FREQ` (unit = MHz) to 2000 if the number is 2000. 47 | 2. In most systems, this should also be the 6th number in the second row of `results/turbostat.txt` 48 | 49 | ## Notes 50 | * memo is only tested on AVX-enabled machines 51 | * Single-op latency (`-T 0`) has much higher absolute values than block access latency (`-T 3`). 52 | * Block access defaults to issuing 16 accesses with randomly hard-coded offsets within a 64KB region. 53 | - You may play with `generate_random_inst.py` to generate a new set of random offsets. 54 | - You may change the MACROs in `src/workload.h`, `BLOCK_xN` and `*_xN_RAND_AVX512`, to see how the number of parallel issues affects the average latency of each access. 55 | * Setting the `-F` argument is critical for all latency measures. `-F` is not used in bandwidth tests. 56 | * Latency tests should always use `-t 1` for the thread count argument. 57 | 58 | ## Known issues 59 | * Random pointer chasing is NOT implemented, i.e. running with `-T 2` with `-r` 60 | - Testing with pointer chasing should always pin to a core with `-p `, which defaults to running a sequential linked-list chase with prefetching OFF on `` 61 | * The `-R` and `-o 5` options for read-write ratio are experimental. 62 | 63 | ## Arguments 64 | 65 | | Argument | Brief description | Default | Valid inputs | Note | 66 | | -------- | ----------------- | ------- | ------------ | ---- | 67 | | t | Number of testing threads | 32 | 1 - X | This should be set to 1 for all latency tests | 68 | | f | Prefetching enabled | disabled | -f | When `-p` is not specified, prefetch is NOT toggled.
When `-p` is specified, prefetching defaults to disabled | 69 | | m | Total buffer size in bytes | 2^30 | 32-bit integer | Anything larger than 2^30 should use the `-S` argument | 70 | | S | Total buffer size in GiB | 1 GiB | 1 - total memory size on a NUMA node | / | 71 | | n | Buffer NUMA node | 0 | 0 - (Number of NUMA nodes - 1) | When `-o 4` is specified, this argument is used as the source buffer of the move operation | 72 | | d | Buffer NUMA node (destination) | 0 | 0 - (Number of NUMA nodes - 1) | When `-o 4` is specified, this argument is used as the destination buffer of the move operation. Otherwise, this argument is ignored. | 73 | | s | Stall Ratio | 0 | 0 - X | This argument is used in the bandwidth test, where each block of accesses is accompanied by `-s` stall blocks. A stall block consists of 6 x 16 x 16 x 4 = 6144 `nop` instructions. | 74 | | i | Iteration | 1 | 1 - X | For bandwidth tests, each iteration monitors the number of bytes accessed across all threads in 0.5 seconds. For latency tests, each iteration is a single op / single block of accesses. For the pointer chasing test, each iteration chases through all cachelines in the specified buffer size. | 75 | | T | Type of operation | 0 | 0 - 3 |
  • 0 = single-op latency
  • 1 = bandwidth
  • 2 = pointer chasing
  • 3 = block-access latency| 76 | | p | Pin to core | -1 (do not pin to core) | 0 - number of cores | Pinning to a core affects argument `-f` | 77 | | a | Two-core bandwidth test (core a) | -1 | 0 - number of cores | When running the two-core bandwidth test, `-a` specifies the first core a thread should pin to. The prefetching policy is aligned with `-p`.| 78 | | b | Two-core bandwidth test (core b) | -1 | 0 - number of cores | (Same as `-a`; this pins the second thread)| 79 | | g | Bandwidth test access block size | 512 | 16 - (per-thread buffer size / 64) | The MACROs for bandwidth tests all cover 1024 bytes, so the smallest stepping is 16 cachelines | 80 | | r | Random bandwidth test | disabled | -r | This argument makes the start of each block of accesses advance in a somewhat random fashion; however, accesses within a block remain sequential | 81 | | o | Operation | 0 (Load) | 0 - 4 |
    • 0 = Load
    • 1 = NT-load
    • 2 = Store
    • 3 = NT-store
    • 4 = movdir64B (only in bandwidth tests)| 82 | | B | Flush before block access latency test | NOT flushed | -B | Used only in `-T 3`, this argument decides whether the 64KB region to be accessed will be flushed. | 83 | | C | Number of `nop` blocks before block access latency test | 0 | 0 - X | Used only in `-T 3`, this argument decides how many blocks of `nop` should be issued after the cacheline flushes (if there are any) and before the test begins. | 84 | | F | TSC frequency | 2GHz | X | This value should **always** be present for any latency tests. Please refer to the turbostat section for determining this value. | 85 | 86 | 87 | ## Other profiling 88 | Under `./test_cxl/` 89 | #### Block access (fast) < 5 min 90 | ``` 91 | bash test_block_access_latency.sh 92 | ``` 93 | 94 | #### Ptr chasing (fast) < 5 min 95 | ``` 96 | bash test_ptr_chase.sh 97 | ``` 98 | 99 | #### Single operation latency (fast) < 5 min 100 | ``` 101 | bash test_single_op_latency.sh 102 | ``` 103 | 104 | #### `movdir64B` bandwidth (long long) > 15 min 105 | ``` 106 | bash test_movdir_bw.sh 107 | ``` 108 | 109 | #### Sequential access bandwidth (long long) > 15 min 110 | ``` 111 | bash test_seq_bw.sh 112 | ``` 113 | 114 | #### Random access bandwidth (long long) > 30 min 115 | ``` 116 | bash test_rand_bw.sh 117 | ``` 118 | 119 | ## Results 120 | All results are under the `results` folder. 121 | 122 | 123 | ## Acknowledgement 124 | Some parts of this source code and the methodology are inspired by the marvelous work in this [publication (FAST20-Yang)](https://www.usenix.org/conference/fast20/presentation/yang) and this [repository (OptaneStudy)](https://github.com/NVSL/OptaneStudy/tree/master). 125 | 126 | -------------------------------------------------------------------------------- /caption_ae/metrics/pcm_mon.py: -------------------------------------------------------------------------------- 1 | #!/home/zeduoyu2/anaconda3/bin/python3 2 | 3 | import os 4 | import subprocess 5 | import time 6 | import sys 7 | import signal 8 | from queue import Queue, Empty 9 | from threading import Thread 10 | import re 11 | 12 | PCM_PATH = "/home/yans3/AE_root/pcm/build/bin/" 13 | 14 | class pcm_metric: 15 | def __init__(self) -> None: 16 | 17 | self.pcmLat_cmd = ["sudo", PCM_PATH + "pcm-latency"] 18 | self.pcmBw_cmd = ["sudo", PCM_PATH + "pcm-memory"] 19 | self.pcmAll_cmd = ["sudo", PCM_PATH + "pcm", "-nc", "-ns"] 20 | 21 | self.stats = { 22 | "pcm_l1miss":[], 23 | "pcm_ddrReadLat":[], 24 | "pcm_norm_ipc":[], 25 | "pcm_ipc":[]} 26 | 27 | def signal_handler(sig, frame): 28 | os.killpg(os.getpgid(self.p_latency.pid), signal.SIGINT) 29 | os.killpg(os.getpgid(self.p_all.pid), signal.SIGINT) 30 | print('[INFO] You pressed Ctrl+C!') 31 | sys.exit(0) 32 | 33 | #signal.signal(signal.SIGINT, signal_handler) 34 | print('[INFO] Press Ctrl+C to exit') 35 | 36 | def get_stat(self, window_size:int=5): 37 | res = {} 38 | # Take mean of last N samples 39 | for k, v in self.stats.items(): 40 | length = len(v) 41 | if length >= 1: 42 | last_n = v[-min(length,window_size):] 43 | res[k] = sum(last_n) / len(last_n) 44 | return res 45 | 46 | def run_realtime(self, print_info=True) -> None: 47 | 48 | def signal_handler(sig, frame): 49 | os.killpg(os.getpgid(self.p_latency.pid), signal.SIGINT) 50 | os.killpg(os.getpgid(self.p_all.pid), signal.SIGINT) 51 | print('[INFO] You pressed Ctrl+C!') 52 | sys.exit(0) 53 | 54 | #signal.signal(signal.SIGINT, signal_handler) 55 | print('[INFO] Press Ctrl+C to exit') 56 | 57 | # Start the PCM processes 58 |
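        # pcm-latency and pcm (-nc -ns) are launched below as long-running child processes in
        # their own process groups (preexec_fn=os.setsid), so they can be stopped with killpg.
        # Reader threads drain their stdout into queues, and the parser threads defined further
        # down regex-match each periodic report block to append L1 miss latency, DDR read
        # latency, and (normalized) IPC samples into self.stats.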
print("[COMMAND]", self.pcmLat_cmd) 59 | self.p_latency = subprocess.Popen(self.pcmLat_cmd, stdout=subprocess.PIPE, text=True, preexec_fn=os.setsid) 60 | time.sleep(5) 61 | 62 | print("[COMMAND]", self.pcmAll_cmd) 63 | self.p_all = subprocess.Popen(self.pcmAll_cmd, stdout=subprocess.PIPE, text=True, preexec_fn=os.setsid) 64 | 65 | 66 | # define and start the recording threads 67 | def enqueue_output(stdout, queue): 68 | for line in stdout: 69 | queue.put(line) 70 | stdout.close() 71 | 72 | q_latency = Queue() 73 | q_all = Queue() 74 | 75 | t_latency = Thread(target=enqueue_output, args=(self.p_latency.stdout, q_latency)) 76 | t_all = Thread(target=enqueue_output, args=(self.p_all.stdout, q_all)) 77 | 78 | t_latency.daemon = True 79 | t_all.daemon = True 80 | 81 | t_latency.start() 82 | t_all.start() 83 | 84 | 85 | def catch_output_all_cal(q:Queue): 86 | lines = "" 87 | result_arr = [] 88 | while(True): 89 | try: 90 | line = q.get_nowait() 91 | except Empty: 92 | time.sleep(1) # tune to 0.5 just in case 93 | else: 94 | lines += line 95 | if line == "---------------------------------------------------------------------------------------------------------------\n": 96 | norm_ipc_cal = 0 97 | ipc_cal = 0 98 | valid_cnt = 0 99 | 100 | for norm_ipc, ipc in result_arr: 101 | # XXX, hack -- norm_ipc > 0.1 kind of indicates the core is running something 102 | if True or norm_ipc > 0.1: 103 | norm_ipc_cal += norm_ipc 104 | ipc_cal += ipc 105 | valid_cnt += 1 106 | 107 | if valid_cnt > 0: 108 | norm_ipc_cal = norm_ipc_cal / valid_cnt 109 | ipc_cal = ipc_cal / valid_cnt 110 | print('valid_cnt:', valid_cnt) 111 | 112 | result_arr = [] 113 | self.stats["pcm_norm_ipc"].append(norm_ipc_cal) 114 | self.stats["pcm_ipc"].append(ipc_cal) 115 | 116 | else: 117 | line_arr = line.split() 118 | if len(line_arr) < 5: 119 | continue 120 | elif line_arr[0].isdigit(): 121 | norm_ipc = float(line_arr[2]) 122 | ipc = float(line_arr[3]) 123 | result_arr.append((norm_ipc, ipc)) 124 | ''' 125 | if int(line_arr[0]) > 7: 126 | result_arr.append((norm_ipc, ipc)) 127 | ''' 128 | 129 | # define and start the parsing threads 130 | def catch_output_all(q:Queue): 131 | lines = "" 132 | pattern_norm_ipc = re.compile(r"Instructions per nominal CPU cycle: ([0-9]+\.[0-9]+)") 133 | pattern_ipc = re.compile(r" PHYSICAL CORE IPC : ([0-9]+\.[0-9]+)") 134 | while(True): 135 | try: 136 | line = q.get_nowait() 137 | except Empty: 138 | time.sleep(1) # tune to 0.5 just in case 139 | else: 140 | lines += line 141 | if line == "---------------------------------------------------------------------------------------------------------------\n": 142 | matches = pattern_norm_ipc.findall(lines) 143 | if matches: 144 | if print_info: print("norm IPC: " + matches[0]) 145 | self.stats["pcm_norm_ipc"].append(float(matches[0])) 146 | 147 | matches = pattern_ipc.findall(lines) 148 | if matches: 149 | if print_info: print("IPC: " + matches[0]) 150 | self.stats["pcm_ipc"].append(float(matches[0])) 151 | lines = "" 152 | 153 | def catch_output_latency(q:Queue): 154 | lines = "" 155 | while(True): 156 | try: 157 | line = q.get_nowait() 158 | except Empty: 159 | time.sleep(1) # tune to 0.5 just in case 160 | else: # got line 161 | lines += line 162 | if line == "-----------------------------------------------------------------------------\n": 163 | # q_out.put(lines) 164 | pattern = re.compile(r"L1 Cache Miss Latency\(ns\) \[Adding 5 clocks for L1 Miss\]\n+Socket0: ([0-9]+\.[0-9]+)") 165 | matches = pattern.findall(lines) 166 | if matches: 167 | if print_info: 
print("[RESULT] L1 Miss Latency:" + matches[0]) 168 | self.stats["pcm_l1miss"].append(float(matches[0])) 169 | 170 | pattern = re.compile(r"DDR read Latency\(ns\)\nSocket0: ([0-9]+\.[0-9]+)\s*") 171 | matches = pattern.findall(lines) 172 | if matches: 173 | if print_info: print("[RESULT] DDR Read Latency:" + matches[0]) 174 | self.stats["pcm_ddrReadLat"].append(float(matches[0])) 175 | 176 | lines = "" 177 | 178 | t_catch_latency = Thread(target=catch_output_latency, args=[q_latency]) 179 | t_catch_all = Thread(target=catch_output_all, args=[q_all]) 180 | 181 | t_catch_latency.daemon = True 182 | t_catch_all.daemon = True 183 | 184 | t_catch_latency.start() 185 | t_catch_all.start() 186 | 187 | if print_info: 188 | while(True): 189 | time.sleep(1) # fixed BUG: used to be 'pass', which cause the utilization to be 100% 190 | 191 | 192 | if __name__ == "__main__": 193 | pcm = pcm_metric() 194 | pcm.run_realtime(print_info=False) 195 | while(True): 196 | print(pcm.get_stat()) 197 | time.sleep(1) 198 | -------------------------------------------------------------------------------- /memo_ae/src/util.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Developed by FAST Lab @ ECE-UIUC -- 2022-2023 3 | */ 4 | #include "util.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #define MAX_NUM_THREAD 128 14 | #define MAX_BUF_GB 16 15 | #define MAX_NUMA_NODE 10 16 | #define MAX_SKIP_BYTE 1024 17 | #define PREFETCH_REG_ADDR 0x1A4 18 | #define MAX_CORE_NUM 63 19 | #define FLUSH_SIZE (512 * (1 << 20)) // MB 20 | #define TSC_FREQ_GHZ 2.0 21 | 22 | char* help_str = " Usage: \n" \ 23 | "-t Number of threads.\n" \ 24 | "-f enable prefetching (when -p is not specified, prefetch is NOT toggled.) (when -p is specified, default to prefetching disabled)\n" \ 25 | "-m buffer size, in byte (!!! only with 32-bit int).\n" \ 26 | "-S buffer size, in GB.\n" \ 27 | "-n NUMA node, if op {0,1,2,3}; SRC node if op {4}.\n" \ 28 | "-d NUMA node, DST node, only used if op {4}.\n" \ 29 | "-s stall ratio -- 5 for 1:5 ratio in op:stall in bandwidth tests.\n" \ 30 | "-i number of iteration. For BW it means how many times we probe all threads.\n" \ 31 | "-T 0 stands for Latency, 1 stands for Bandwidth, 2 stands for pointer tracing, 3 stands for block access latency.\n" \ 32 | "-p Pin to cores starting at core X, default -- do not pin to core (let the Linux scheduler decides).\n" \ 33 | "-a/b pin to core a and core b. -a and -b must be used at the same time\n" \ 34 | "-g Bandwidth granularity -- batch size per each workload call, in the unit of 64B.\n" \ 35 | "-r random access in bandwidth tests (sequential by default) \n" \ 36 | "-o 0 - Read, 1 - Read Non-temporal, 2 - Write, 3 - Write Non-temporal, 4 - movdir64B, 5 - Read/Write Mixed.\n" \ 37 | "-R If chose operation as Read/Write Mixed, this argument is used to specify the read ratio. Example: 20:80.\n" \ 38 | "-B flush 64KB of data block during block latency test (default to 0)\n" \ 39 | "-C number of clear pipeline block in block latency tests (default to 0)\n" \ 40 | "-F TSC_Freq, used for calculating cycle --> ns. Unit = MHz; default to (2000MHz). 
Please check with turbostat"; 41 | 42 | void set_default_cfg(test_cfg_t* cfg) { 43 | cfg->op = READ; 44 | cfg->type = BW; 45 | cfg->num_thread = 32; 46 | cfg->total_buf_size = (1 << 30); 47 | cfg->buf_a_numa_node = 0; // src 48 | cfg->buf_b_numa_node = 0; // dst 49 | cfg->op_iter = 1; 50 | cfg->per_thread_size = cfg->total_buf_size / cfg->num_thread; 51 | cfg->starting_core = -1; 52 | cfg->random = false; 53 | cfg->prefetch_en = false; 54 | cfg->stall_ratio = 0; 55 | cfg->bw_granu = 512; 56 | cfg->core_a = -1; 57 | cfg->core_b = -1; 58 | cfg->read_ratio = 1; 59 | cfg->flush_block = 0; 60 | cfg->num_clear_pipe = 0; 61 | cfg->tsc_freq = TSC_FREQ_GHZ; 62 | } 63 | 64 | void print_cfg(test_cfg_t* cfg) { 65 | 66 | fprintf (stdout, "==========================\n"); 67 | fprintf (stdout, "num_thread: %lu\n", cfg->num_thread); 68 | fprintf (stdout, "total_buf_size: %lu\n", cfg->total_buf_size); 69 | fprintf (stdout, "buf_a_numa_node:%d\n", cfg->buf_a_numa_node); 70 | fprintf (stdout, "buf_b_numa_node:%d\n", cfg->buf_b_numa_node); 71 | fprintf (stdout, "per_thread_size:%ld\n", cfg->per_thread_size); 72 | fprintf (stdout, "op_iter: %d\n", cfg->op_iter); 73 | fprintf (stdout, "type: %d\n", cfg->type); 74 | fprintf (stdout, "op: %d\n", cfg->op); 75 | fprintf (stdout, "starting_core: %d\n", cfg->starting_core); 76 | fprintf (stdout, "random: %d\n", cfg->random); 77 | fprintf (stdout, "stall_ratio: %d\n", cfg->stall_ratio); 78 | fprintf (stdout, "bw_granu: %d\n", cfg->bw_granu); 79 | fprintf (stdout, "core_a: %d\n", cfg->core_a); 80 | fprintf (stdout, "core_b: %d\n", cfg->core_b); 81 | fprintf (stdout, "flush_block: %d\n", cfg->flush_block); 82 | fprintf (stdout, "num_clear_pipe: %d\n", cfg->num_clear_pipe); 83 | fprintf (stdout, "tsc_freq (GHz): %f\n", cfg->tsc_freq); 84 | fprintf (stdout, "==========================\n"); 85 | } 86 | 87 | 88 | 89 | int parse_arg(int argc, char*argv[], test_cfg_t* cfg) { 90 | int opt; 91 | int num; 92 | int read; 93 | int write; 94 | set_default_cfg(cfg); 95 | 96 | // TODO, parse arg for operation / type 97 | while ((opt = getopt(argc, argv, "F:C:p:a:b:t:m:S:n:d:s:i:g:T:o:R:rhfB")) != -1) { 98 | switch (opt) { 99 | case 'F': 100 | num = atoi(optarg); 101 | cfg->tsc_freq = (double)num / (double)(1000.0); 102 | break; 103 | case 'C': 104 | num = atoi(optarg); 105 | cfg->num_clear_pipe = num; 106 | break; 107 | case 'B': 108 | cfg->flush_block = 1; 109 | break; 110 | case 'a': 111 | num = atoi(optarg); 112 | if (num < 0 || num > MAX_CORE_NUM) { 113 | fprintf (stderr, "Can't start a from core: %d\n", num); 114 | return -1; 115 | } 116 | cfg->core_a = num; 117 | break; 118 | case 'b': 119 | num = atoi(optarg); 120 | if (num < 0 || num > MAX_CORE_NUM) { 121 | fprintf (stderr, "Can't start b from core: %d\n", num); 122 | return -1; 123 | } 124 | cfg->core_b = num; 125 | break; 126 | case 'p': 127 | num = atoi(optarg); 128 | if (num > MAX_CORE_NUM || num < 0) { 129 | fprintf (stderr, "Can't start from core: %d\n", num); 130 | return -1; 131 | } 132 | cfg->starting_core = num; 133 | break; 134 | case 't': 135 | num = atoi(optarg); 136 | if (num > MAX_NUM_THREAD) { 137 | fprintf (stderr, "Can't have more than %d threads, %d\n", MAX_NUM_THREAD, num); 138 | return -1; 139 | } else { 140 | cfg->num_thread = num; 141 | } 142 | break; 143 | case 'm': 144 | num = atoi(optarg); 145 | cfg->total_buf_size = num; 146 | break; 147 | 148 | case 'S': 149 | num = atoi(optarg); 150 | if (num > MAX_BUF_GB) { 151 | fprintf (stderr, "Can't have more than %d GB buf, %d\n", MAX_BUF_GB, num); 152 | 
return -1; 153 | } else { 154 | cfg->total_buf_size = ((uint64_t)num << 30); 155 | } 156 | break; 157 | 158 | case 'n': 159 | num = atoi(optarg); 160 | if (num < 0 || num > MAX_NUMA_NODE) { 161 | fprintf (stderr, "NUMA node out of range (0, %d): %d\n", MAX_NUMA_NODE, num); 162 | return -1; 163 | } else { 164 | cfg->buf_a_numa_node = num; 165 | } 166 | break; 167 | 168 | case 'd': 169 | num = atoi(optarg); 170 | if (num < 0 || num > MAX_NUMA_NODE) { 171 | fprintf (stderr, "NUMA node out of range (0, %d): %d\n", MAX_NUMA_NODE, num); 172 | return -1; 173 | } else { 174 | cfg->buf_b_numa_node = num; 175 | } 176 | break; 177 | 178 | case 's': 179 | num = atoi(optarg); 180 | if (num < 0) { 181 | fprintf (stderr, "stall ratio must be greater than 0, found: %d\n", num); 182 | return -1; 183 | } else { 184 | cfg->stall_ratio = num; 185 | } 186 | break; 187 | 188 | case 'i': 189 | num = atoi(optarg); 190 | if (num < 0) { 191 | fprintf (stderr, "iteration count must be positive: %d\n", num); 192 | return -1; 193 | } else { 194 | cfg->op_iter = num; 195 | } 196 | break; 197 | 198 | case 'T': 199 | num = atoi(optarg); 200 | if(num < 0 || num > 3){ 201 | fprintf(stderr, "type must be 0(latency clflush), 1(bandwidth), 2(pointer chasing), 3(block latency).\n"); 202 | return -1; 203 | } else { 204 | cfg->type = num; 205 | } 206 | break; 207 | 208 | case 'o': 209 | num = atoi(optarg); 210 | if(num < 0 || num > 5){ 211 | fprintf(stderr, "operation must be 0(read), 1(read non-temporal), 2(write), 3(write non-temporal), 4(movdir64B) or 5(mix RW).\n"); 212 | return -1; 213 | } else { 214 | cfg->op = num; 215 | } 216 | break; 217 | 218 | case 'R': 219 | sscanf(optarg, "%d:%d", &read, &write); 220 | if (read <= 0 || write <= 0) { 221 | fprintf(stderr, "Read/Write ratio cannot be negative numbers!\n"); 222 | } else { 223 | cfg->read_ratio = read / write; 224 | } 225 | break; 226 | 227 | case 'g': 228 | num = atoi(optarg); 229 | cfg->bw_granu = num; 230 | break; 231 | 232 | case 'r': 233 | cfg->random = true; 234 | break; 235 | 236 | case 'f': 237 | cfg->prefetch_en = true; 238 | break; 239 | 240 | case 'h': 241 | fprintf (stdout, "%s\n", help_str); 242 | return -2; 243 | break; 244 | 245 | case '?': 246 | fprintf (stderr, "Option -%c requires an argument.\n", optopt); 247 | return -1; 248 | break; 249 | 250 | default: 251 | fprintf (stderr, "default, %c, abort\n", optopt); 252 | return -1; 253 | abort(); 254 | } 255 | } 256 | 257 | if (cfg->core_a * cfg->core_b < 0) { 258 | fprintf (stderr, "found core_a: %d, core_b: %d, please set them accordingly\n", cfg->core_a, cfg->core_b); 259 | return -1; 260 | } 261 | 262 | cfg->per_thread_size = cfg->total_buf_size / cfg->num_thread; 263 | uint64_t calculated_buf_size = cfg->per_thread_size * cfg->num_thread; 264 | printf("cal: %lu vs total: %lu\n", calculated_buf_size, cfg->total_buf_size); 265 | 266 | if (calculated_buf_size != cfg->total_buf_size) { 267 | // reset per thread size to 2^12 byte aligned (avoid AVX run out of addresss) 268 | cfg->per_thread_size &= 0xFFFFFFFFFFFFF000; 269 | } 270 | 271 | // optind is for the extra arguments 272 | // which are not parsed 273 | for(; optind < argc; optind++){ 274 | printf("extra arguments: %s\n", argv[optind]); 275 | } 276 | 277 | print_cfg(cfg); 278 | return 0; 279 | } 280 | 281 | // This function returns the NUMA node that a pointer address resides on. 
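// Implementation note: move_pages() is invoked with a NULL target-node array, so the kernel
// does not migrate anything; it only reports, via the status array, the NUMA node on which
// each page of the buffer currently resides. The loop below then checks that every page is
// on the same node and returns that node id.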
282 | int get_node(void *p, uint64_t size) 283 | { 284 | int* status; 285 | void** page_arr; 286 | unsigned long page_size; 287 | unsigned long page_cnt; 288 | int ret; 289 | char* start_addr; 290 | 291 | page_size = (unsigned long)getpagesize(); 292 | page_cnt = (size / page_size); 293 | status = malloc(page_cnt * sizeof(int)); 294 | page_arr = malloc(page_cnt * sizeof(char*)); 295 | start_addr = (char*)p; 296 | 297 | fprintf(stdout, "[get_node] buf: %lx, page_size: %ld, page_cnt: %ld\n", (uint64_t)(p), page_size, page_cnt); 298 | 299 | for (unsigned long i = 0; i < page_cnt; i++) { 300 | page_arr[i] = start_addr; 301 | if (i < page_cnt) { 302 | start_addr = &(start_addr[page_size]); 303 | } 304 | } 305 | 306 | 307 | ret = move_pages(0, page_cnt, page_arr, NULL, status, 0); 308 | if (ret != 0) { 309 | fprintf(stderr, "Problem in %s line %d calling move_pages(), ret = %d\n", __FILE__,__LINE__, ret); 310 | printf("%s\n", strerror(errno)); 311 | } 312 | 313 | ret = status[0]; 314 | for (uint64_t i = 0; i < page_cnt; i++) { 315 | if (ret != status[i]) { 316 | fprintf(stderr, "found page: %lu on node: %d, different from node: %d\n", i, status[i], ret); 317 | ret = status[i]; 318 | break; 319 | } 320 | } 321 | 322 | if (ret == status[0]) { 323 | fprintf(stdout, "all pages: %lx, %lx ... are on node: %d\n", (uint64_t)(page_arr[0]), (uint64_t)(page_arr[1]), ret); 324 | } 325 | 326 | free(page_arr); 327 | free(status); 328 | return ret; 329 | } 330 | 331 | int init_buf(uint64_t size, int node, char** alloc_ptr) { 332 | char *ptr; 333 | int ret; 334 | unsigned long page_size; 335 | uint64_t page_cnt; 336 | uint64_t idx; 337 | 338 | if ((ptr = (char *)numa_alloc_onnode(size, node)) == NULL) { 339 | fprintf(stderr,"Problem in %s line %d allocating memory\n",__FILE__,__LINE__); 340 | return -1; 341 | } 342 | printf("[INFO] done alloc. Next, touch all pages\n"); 343 | // alloc is only ready when accessed 344 | page_size = (unsigned long)getpagesize(); 345 | page_cnt = (size / page_size); 346 | idx = 0; 347 | for (uint64_t i = 0; i < page_cnt; i++) { 348 | ptr[idx] = 0; 349 | idx += page_size; 350 | } 351 | printf("[INFO] done touching pages. 
Next, validate on node X\n"); 352 | ret = get_node(ptr, size); 353 | if (ret != node) { 354 | printf("ptr is on node %d, but expect node %d\n", ret, node); 355 | return -2; 356 | } 357 | printf("ptr is on node %d\n", ret); 358 | printf("allocated: %luMB\n", (size >> 20)); 359 | 360 | *alloc_ptr = ptr; 361 | 362 | return 0; 363 | } 364 | 365 | uint64_t read_MSR(int cpu){ 366 | int fd; 367 | uint64_t data; 368 | char msr_file_name[64]; 369 | 370 | sprintf(msr_file_name, "/dev/cpu/%d/msr", cpu); 371 | fd = open(msr_file_name, O_RDONLY); 372 | 373 | if (fd < 0) { 374 | if (errno == ENXIO) { 375 | fprintf(stderr, "rdmsr: No CPU %d\n", cpu); 376 | exit(2); 377 | } else if (errno == EIO) { 378 | fprintf(stderr, "rdmsr: CPU %d doesn't support MSRs\n", 379 | cpu); 380 | exit(3); 381 | } else { 382 | perror("rdmsr: open"); 383 | exit(127); 384 | } 385 | } 386 | 387 | if (pread(fd, &data, sizeof data, PREFETCH_REG_ADDR) != sizeof data) { 388 | if (errno == EIO) { 389 | fprintf(stderr, "rdmsr: CPU %d cannot read ", cpu); 390 | exit(4); 391 | } else { 392 | perror("rdmsr: pread"); 393 | exit(127); 394 | } 395 | } 396 | 397 | close(fd); 398 | 399 | return data; 400 | } 401 | 402 | void write_MSR(int cpu, uint64_t val){ 403 | int fd; 404 | char msr_file_name[64]; 405 | 406 | sprintf(msr_file_name, "/dev/cpu/%d/msr", cpu); 407 | fd = open(msr_file_name, O_WRONLY); 408 | 409 | if (fd < 0) { 410 | if (errno == ENXIO) { 411 | fprintf(stderr, "rdmsr: No CPU %d\n", cpu); 412 | exit(2); 413 | } else if (errno == EIO) { 414 | fprintf(stderr, "rdmsr: CPU %d doesn't support MSRs\n", 415 | cpu); 416 | exit(3); 417 | } else { 418 | perror("rdmsr: open"); 419 | exit(127); 420 | } 421 | } 422 | 423 | if (pwrite(fd, &val, sizeof(val), PREFETCH_REG_ADDR) != sizeof(val)){ 424 | if (errno == EIO) { 425 | fprintf(stderr, 426 | "wrmsr: CPU %d cannot set MSR ", cpu); 427 | exit(4); 428 | } else { 429 | perror("wrmsr: pwrite"); 430 | exit(127); 431 | } 432 | } 433 | 434 | close(fd); 435 | 436 | return; 437 | } 438 | 439 | void disable_prefetch(int cpu){ 440 | uint64_t val; 441 | val = read_MSR(cpu); 442 | write_MSR(cpu, val | 0xF); 443 | val = read_MSR(cpu); 444 | printf(YEL "[INFO]" RESET " CPU %d prefetch disabled. Now at 0x1A4: %lx\n", cpu, val); 445 | } 446 | 447 | void enable_prefetch(int cpu){ 448 | uint64_t val; 449 | val = read_MSR(cpu); 450 | write_MSR(cpu, val & 0xFFFFFFFFFFFFFFF0); 451 | printf(YEL "[INFO]" RESET " CPU %d prefetch enabled.\n", cpu); 452 | } 453 | 454 | // taken from https://stackoverflow.com/questions/1046714/what-is-a-good-random-number-generator-for-a-game 455 | static uint64_t y=362436069, z=521288629; 456 | uint64_t xorshf96(uint64_t* xx) { //period 2^96-1 457 | uint64_t t; 458 | uint64_t x = *xx; 459 | x ^= x << 16; 460 | x ^= x >> 5; 461 | x ^= x << 1; 462 | 463 | t = x; 464 | x = y; 465 | y = z; 466 | z = t ^ x ^ y; 467 | *xx = x; 468 | 469 | return z; 470 | } 471 | 472 | // can't use WBINVD 473 | // https://stackoverflow.com/questions/1756825/how-can-i-do-a-cpu-cache-flush-in-x86-windows 474 | // alloc large bue and read/write 475 | void flush_all_cache() { 476 | char* buf; 477 | printf(YEL "[INFO]" RESET " Flushing cache, with %d MB access ... \n", FLUSH_SIZE >> 20); 478 | 479 | buf = malloc(FLUSH_SIZE); 480 | for (int j = 0; j < 2; j++) { 481 | for (int i = 0; i < FLUSH_SIZE; i++) { 482 | buf[i] = i + 1; // make sure this is not optimized 483 | } 484 | } 485 | free(buf); 486 | printf(YEL "[INFO]" RESET " Cache flush done ... 
\n"); 487 | } 488 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. 
This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 
176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 
340 | -------------------------------------------------------------------------------- /memo_ae/src/workload.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Developed by FAST Lab @ ECE-UIUC -- 2022-2023 3 | * Some part of this file follows the methodology of FAST-20 Yang's resporitory 4 | * @ https://github.com/NVSL/OptaneStudy/tree/master 5 | */ 6 | #include "workload.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #define MIN_GRANULARITY 512 15 | 16 | // change me to use different size of AVX 17 | //#define SIZENTLD_MACRO SIZENTLD_512_AVX512 18 | #define SIZENTLD_MACRO SIZENTLD_1024_AVX512 19 | #define SIZELD_MACRO SIZELD_1024_AVX512 20 | #define SIZEST_MACRO SIZEST_1024_AVX512 21 | #define SIZEMOV_MACRO SIZEMOV_1024 22 | //#define SIZEST_MACRO SIZEST_WB_1024_AVX512 23 | 24 | 25 | /** 26 | * op_ntld 27 | * @brief Load the given size data from the memory with non-temporal hint. 28 | * @param addr the load start address 29 | * @param size the size of the memory we want to access (in byte) 30 | * @return none 31 | */ 32 | void op_ntld(char* addr, long size){ 33 | /* by default we perform load in 512 byte granularity */ 34 | /* sanity check */ 35 | if(size < MIN_GRANULARITY){ 36 | fprintf(stderr, RED "[ERROR]" RESET "op_ntld(): buffer size is smaller than %d byte.", MIN_GRANULARITY); 37 | exit(1); 38 | } 39 | /* round down to MIN_GRANULARITY */ 40 | size = size - (size % MIN_GRANULARITY); 41 | 42 | asm volatile( 43 | "mov %[addr], %%r9 \n" 44 | "xor %%r10, %%r10 \n" 45 | "LOOP_NTLD: \n" 46 | SIZENTLD_MACRO 47 | "cmp %[size], %%r10 \n" 48 | "jl LOOP_NTLD \n" 49 | : /* output */ 50 | :[size]"r"(size), [addr]"r"(addr) /* input */ 51 | :"%r9", "%r10" /* clobbered register */ 52 | ); 53 | } 54 | 55 | /** 56 | * op_ld 57 | * @brief Load the given size data from the memory with non-temporal hint. 58 | * @param addr the load start address 59 | * @param size the size of the memory we want to access (in byte) 60 | * @return none 61 | */ 62 | void op_ld(char* addr, long size){ 63 | /* by default we perform load in 512 byte granularity */ 64 | /* sanity check */ 65 | if(size < MIN_GRANULARITY){ 66 | fprintf(stderr, RED "[ERROR]" RESET "op_ld(): buffer size is smaller than %d byte.", MIN_GRANULARITY); 67 | exit(1); 68 | } 69 | /* round down to MIN_GRANULARITY*/ 70 | size = size - (size % MIN_GRANULARITY); 71 | 72 | asm volatile( 73 | "mov %[addr], %%r9 \n" 74 | "xor %%r10, %%r10 \n" 75 | "LOOP_LD: \n" 76 | SIZELD_MACRO 77 | "cmp %[size], %%r10 \n" 78 | "jl LOOP_LD \n" 79 | : /* output */ 80 | :[size]"r"(size), [addr]"r"(addr) /* input */ 81 | :"%r9", "%r10", ZMM_0_15 /* clobbered register */ 82 | ); 83 | } 84 | 85 | /** 86 | * op_ntst 87 | * @brief Store the given size data to the memory with non-temporal hint. 
88 | * @param addr the store start address 89 | * @param size the size of the memory we want to store (in byte) 90 | * @return none 91 | */ 92 | void op_ntst(char* addr, long size){ 93 | /* by default we perform load in 512 byte granularity */ 94 | /* sanity check */ 95 | if(size < MIN_GRANULARITY){ 96 | fprintf(stderr, RED "[ERROR]" RESET "op_ntst(): buffer size is smaller than %d byte.", MIN_GRANULARITY); 97 | exit(1); 98 | } 99 | /* round down to MIN_GRANULARITY*/ 100 | size = size - (size % MIN_GRANULARITY); 101 | 102 | asm volatile( 103 | "mov %[addr], %%r9 \n" 104 | "xor %%r10, %%r10 \n" 105 | "LOOP_NTST: \n" 106 | SIZENTST_1024_AVX512 107 | "cmp %[size], %%r10 \n" 108 | "jl LOOP_NTST \n" 109 | "sfence \n" 110 | : /* output */ 111 | :[size]"r"(size), [addr]"r"(addr) /* input */ 112 | :"%r9", "%r10", ZMM_0_15 /* clobbered register */ 113 | ); 114 | } 115 | 116 | /** 117 | * op_st 118 | * @brief Store the given size data to the memory with non-temporal hint. 119 | * @param addr the store start address 120 | * @param size the size of the memory we want to store (in byte) 121 | * @return none 122 | */ 123 | void op_st(char* addr, long size){ 124 | /* by default we perform load in 512 byte granularity */ 125 | /* sanity check */ 126 | if(size < MIN_GRANULARITY){ 127 | fprintf(stderr, RED "[ERROR]" RESET "op_st(): buffer size is smaller than %d byte.", MIN_GRANULARITY); 128 | exit(1); 129 | } 130 | /* round down to MIN_GRANULARITY*/ 131 | size = size - (size % MIN_GRANULARITY); 132 | 133 | asm volatile( 134 | "mov %[addr], %%r9 \n" 135 | "xor %%r10, %%r10 \n" 136 | "LOOP_ST: \n" 137 | SIZEST_MACRO 138 | "cmp %[size], %%r10 \n" 139 | "jl LOOP_ST \n" 140 | : /* output */ 141 | :[size]"r"(size), [addr]"r"(addr) /* input */ 142 | :REGISTERS, "%r10" /* clobbered register */ 143 | ); 144 | } 145 | 146 | /** 147 | * op_mixed 148 | * @brief Store the given size data to the memory with non-temporal hint. 
149 | * @param addr the store start address 150 | * @param size the total size of the memory we want to operate (in byte) 151 | * @param ratio the read to write ratio of memory accesses 152 | * @return none 153 | */ 154 | void op_mixed(char* addr, long size, int ratio){ 155 | /* by default we perform load in 512 byte granularity */ 156 | /* sanity check */ 157 | if(size < 384){ 158 | fprintf(stderr, RED "[ERROR]" RESET "op_mix(): buffer size is smaller than 384 byte."); 159 | exit(1); 160 | } 161 | /* round down to 512 */ 162 | // size = size - (size % 512); 163 | 164 | switch (ratio) 165 | { 166 | case 1: // 1R:1W 167 | asm volatile( 168 | "mov %[addr], %%r9 \n" 169 | "xor %%r10, %%r10 \n" 170 | "LOOP_MIXED1: \n" 171 | SIZE_R1W1_512 172 | "cmp %[size], %%r10 \n" 173 | "jl LOOP_MIXED1 \n" 174 | : /* output */ 175 | :[size]"r"(size), [addr]"r"(addr) /* input */ 176 | :REGISTERS, "%r10" /* clobbered register */ 177 | ); 178 | break; 179 | 180 | case 2: // 2R:1W 181 | asm volatile( 182 | "mov %[addr], %%r9 \n" 183 | "xor %%r10, %%r10 \n" 184 | "LOOP_MIXED2: \n" 185 | // SIZE_R2W1_384 186 | SIZE_R2W1_576 187 | "cmp %[size], %%r10 \n" 188 | "jl LOOP_MIXED2 \n" 189 | : /* output */ 190 | :[size]"r"(size), [addr]"r"(addr) /* input */ 191 | :REGISTERS, "%r10" /* clobbered register */ 192 | ); 193 | break; 194 | case 3: // 3R:1W 195 | asm volatile( 196 | "mov %[addr], %%r9 \n" 197 | "xor %%r10, %%r10 \n" 198 | "LOOP_MIXED3: \n" 199 | SIZE_R3W1_512 200 | "cmp %[size], %%r10 \n" 201 | "jl LOOP_MIXED3 \n" 202 | : /* output */ 203 | :[size]"r"(size), [addr]"r"(addr) /* input */ 204 | :REGISTERS, "%r10" /* clobbered register */ 205 | ); 206 | break; 207 | 208 | default: 209 | fprintf(stderr, RED "[ERROR]" RESET "op_mix(): Invalid RW ratio."); 210 | exit(1); 211 | break; 212 | } 213 | 214 | 215 | } 216 | 217 | /** 218 | * op_stall 219 | * @brief stall the core by issuing nop 220 | */ 221 | 222 | void op_stall() { 223 | asm volatile( 224 | CLEAR_PIPELINE 225 | CLEAR_PIPELINE 226 | CLEAR_PIPELINE 227 | CLEAR_PIPELINE 228 | : 229 | : 230 | : 231 | ); 232 | } 233 | 234 | /** 235 | * op_movdir64B 236 | * @brief Store the given size data to the memory with non-temporal hint. 237 | 238 | * @param addr the store start address 239 | * @param size the size of the memory we want to store (in byte) 240 | * @return none 241 | */ 242 | void op_movdir64B(char* src_addr, char* dst_addr, long size) { 243 | /* by default we perform load in 512 byte granularity */ 244 | /* sanity check */ 245 | if(size < MIN_GRANULARITY){ 246 | fprintf(stderr, RED "[ERROR]" RESET "op_st(): buffer size is smaller than %d byte.", MIN_GRANULARITY); 247 | exit(1); 248 | } 249 | /* round down to MIN_GRANULARITY */ 250 | size = size - (size % MIN_GRANULARITY); 251 | asm volatile( 252 | "mov %[src_addr], %%r9 \n" 253 | "mov %[dst_addr], %%r12 \n" 254 | "xor %%r10, %%r10 \n" 255 | "LOOP_MOV: \n" 256 | SIZEMOV_MACRO 257 | "cmp %[size], %%r10 \n" 258 | "jl LOOP_MOV \n" 259 | "sfence \n" 260 | : /* output */ 261 | :[size]"r"(size), [src_addr]"r"(src_addr), [dst_addr]"r"(dst_addr)/* input */ 262 | :REGISTERS, "%r10", "%r11", "%r12" /* clobbered register */ 263 | ); 264 | } 265 | 266 | /** 267 | * op_ntld_32B_lat 268 | * @brief measure the latency of loading 32 bytes with non-temporal hint 269 | * @param addr the memory address from where we load the 32 bytes 270 | * @return the time elapsed during load process. In unit of CPU cycles. 
271 | */ 272 | uint64_t op_ntld_32B_lat(char* addr){ 273 | uint64_t t_start = 0, t_end = 0; 274 | 275 | /* make sure the addr is 32 byte aligned */ 276 | addr = (char*)((uint64_t)addr & (~0x1F)); 277 | 278 | asm volatile( 279 | "mov %[addr], %%rsi\n" 280 | "mfence\n" 281 | FLUSH_CACHE_LINE 282 | TIMING_BEGIN 283 | "vmovntdqa 0*32(%%rsi), %%ymm0 \n" 284 | TIMING_END 285 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 286 | :[addr] "r" (addr) 287 | :REGISTERS 288 | ); 289 | 290 | return (t_end - t_start); 291 | } 292 | 293 | /** 294 | * op_ntld_64B_lat 295 | * @brief measure the latency of loading 64 bytes with non-temporal hint 296 | * @param addr the memory address from where we load the 64 bytes 297 | * @return the time elapsed during load process. In unit of CPU cycles. 298 | */ 299 | uint64_t op_ntld_64B_lat(char* addr){ 300 | uint64_t t_start = 0, t_end = 0; 301 | 302 | /* make sure address is 64byte aligned (what will happen if not?) */ 303 | addr = (char*)((uint64_t)addr & (~0x3F)); 304 | 305 | asm volatile( 306 | "mov %[addr], %%rsi\n" 307 | "mfence\n" 308 | FLUSH_CACHE_LINE 309 | TIMING_BEGIN 310 | "vmovntdqa 0*32(%%rsi), %%ymm0 \n" 311 | "vmovntdqa 1*32(%%rsi), %%ymm1 \n" 312 | TIMING_END 313 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 314 | :[addr] "r" (addr) 315 | :REGISTERS 316 | ); 317 | 318 | return (t_end - t_start); 319 | } 320 | 321 | 322 | /** 323 | * op_ntst_64B_lat 324 | * @brief measure the latency of storing 64 bytes with non-temporal hint 325 | * @param addr the memory address from where we store the 64 bytes 326 | * @return the time elapsed during store process. In unit of CPU cycles. 327 | */ 328 | uint64_t op_ntst_64B_lat(char* addr){ 329 | uint64_t t_start = 0, t_end = 0; 330 | 331 | /* make sure address is 64byte aligned (what will happen if not?) */ 332 | addr = (char*)((uint64_t)addr & (~0x3F)); 333 | 334 | asm volatile( 335 | "mov %[addr], %%rsi\n" 336 | "mfence\n" 337 | FLUSH_CACHE_LINE 338 | CLEAR_PIPELINE 339 | TIMING_BEGIN 340 | "vmovntpd %%ymm0, 0*32(%%rsi) \n" 341 | "vmovntpd %%ymm1, 1*32(%%rsi) \n" 342 | 343 | TIMING_END 344 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 345 | :[addr] "r" (addr) 346 | :REGISTERS 347 | ); 348 | 349 | return (t_end - t_start); 350 | } 351 | 352 | /** 353 | * op_ld_64B_lat 354 | * @brief measure the latency of loading 64 bytes without non-temporal hint 355 | * @param addr the memory address from where we load the 64 bytes 356 | * @return the time elapsed during load process. In unit of CPU cycles. 357 | */ 358 | uint64_t op_ld_64B_lat(char* addr){ 359 | uint64_t t_start = 0, t_end = 0; 360 | 361 | /* make sure address is 64byte aligned (what will happen if not?) */ 362 | addr = (char*)((uint64_t)addr & (~0x3F)); 363 | 364 | asm volatile( 365 | "mov %[addr], %%rsi\n" 366 | "mfence\n" 367 | FLUSH_CACHE_LINE 368 | CLEAR_PIPELINE 369 | TIMING_BEGIN 370 | "vmovdqa 0*32(%%rsi), %%ymm0 \n" 371 | "vmovdqa 1*32(%%rsi), %%ymm1 \n" 372 | TIMING_END 373 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 374 | :[addr] "r" (addr) 375 | :REGISTERS 376 | ); 377 | 378 | return (t_end - t_start); 379 | } 380 | 381 | /** 382 | * op_st_64B_lat 383 | * @brief measure the latency of storing 64 bytes without non-temporal hint 384 | * @param addr the memory address from where we store the 64 bytes 385 | * @return the time elapsed during store process. In unit of CPU cycles. 
386 | */ 387 | uint64_t op_st_64B_lat(char* addr){ 388 | uint64_t t_start = 0, t_end = 0; 389 | 390 | /* make sure address is 64byte aligned (what will happen if not?) */ 391 | addr = (char*)((uint64_t)addr & (~0x3F)); 392 | 393 | asm volatile( 394 | "mov %[addr], %%rsi\n" 395 | "mfence\n" 396 | FLUSH_CACHE_LINE 397 | CLEAR_PIPELINE 398 | TIMING_BEGIN 399 | "vmovdqa %%ymm0, 0*32(%%rsi) \n" 400 | "vmovdqa %%ymm0, 1*32(%%rsi) \n" 401 | //"sfence \n" 402 | TIMING_END 403 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 404 | :[addr] "r" (addr) 405 | :REGISTERS 406 | ); 407 | 408 | return (t_end - t_start); 409 | } 410 | 411 | /** 412 | * op_st_cl_flush_64B_lat 413 | * @brief measure the latency of storing 64 bytes & flushing the cacheline, without non-temporal hint 414 | * @param addr the memory address from where we store the 64 bytes 415 | * @return the time elapsed during store process. In unit of CPU cycles. 416 | */ 417 | uint64_t op_st_cl_flush_64B_lat(char* addr){ 418 | uint64_t t_start = 0, t_end = 0; 419 | 420 | /* make sure address is 64byte aligned (what will happen if not?) */ 421 | addr = (char*)((uint64_t)addr & (~0x3F)); 422 | 423 | asm volatile( 424 | "mov %[addr], %%rsi\n" 425 | "mfence\n" 426 | CLEAR_PIPELINE 427 | TIMING_BEGIN 428 | "vmovdqa %%ymm0, 0*32(%%rsi) \n" 429 | "vmovdqa %%ymm0, 1*32(%%rsi) \n" 430 | "clwb 0*32(%%rsi) \n" 431 | TIMING_END 432 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 433 | :[addr] "r" (addr) 434 | :REGISTERS 435 | ); 436 | return (t_end - t_start); 437 | } 438 | 439 | /** 440 | * op_st_32B_lat 441 | * @brief measure the latency of storing 32 bytes without non-temporal hint 442 | * @param addr the memory address from where we store the 32 bytes 443 | * @return the time elapsed during store process. In unit of CPU cycles. 444 | */ 445 | uint64_t op_st_32B_lat(char* addr){ 446 | uint64_t t_start = 0, t_end = 0; 447 | 448 | /* make sure address is 32 byte aligned (what will happen if not?) 
*/ 449 | addr = (char*)((uint64_t)addr & (~0x1F)); 450 | 451 | asm volatile( 452 | "mov %[addr], %%rsi\n" 453 | "mfence\n" 454 | FLUSH_CACHE_LINE 455 | TIMING_BEGIN 456 | "vmovdqa %%ymm0, 0*32(%%rsi) \n" 457 | TIMING_END 458 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 459 | :[addr] "r" (addr) 460 | :REGISTERS 461 | ); 462 | 463 | return (t_end - t_start); 464 | } 465 | 466 | uint64_t op_ptr_chase(char* addr, uint64_t num_chase_block) { 467 | uint64_t t_start = 0, t_end = 0; 468 | asm volatile( 469 | "mov %[addr], %%r11 \n" 470 | "xor %%r10, %%r10 \n" 471 | TIMING_BEGIN 472 | 473 | "LOOP_CHASE: \n" 474 | "mov (%%r11), %%r11 \n" 475 | "inc %%r10 \n" 476 | "cmp %[num_chase_block], %%r10 \n" 477 | "jl LOOP_CHASE \n" 478 | 479 | TIMING_END 480 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 481 | :[addr] "r" (addr), [num_chase_block] "r" (num_chase_block) 482 | :REGISTERS, "%r10", "%r11" 483 | ); 484 | return (t_end - t_start); 485 | } 486 | 487 | uint64_t op_stwb_block_lat(char* addr, bool flush_block, long num_clear_pipe) { 488 | uint64_t t_start = 0, t_end = 0; 489 | //assume 64KB buff 490 | asm volatile( 491 | "mov %[addr], %%r11 \n" 492 | "xor %%r10, %%r10 \n" 493 | 494 | "cmp $0x0, %[flush_block] \n" 495 | "je LOOP_BLOCK_STWB_FLUSH_DONE \n" 496 | "LOOP_BLOCK_STWB_FLUSH: \n" 497 | "clflush (%%r11, %%r10) \n" 498 | "add $0x40, %%r10 \n" 499 | "cmp $0x10000, %%r10 \n" 500 | "jl LOOP_BLOCK_STWB_FLUSH \n" 501 | "xor %%r10, %%r10 \n" 502 | "mfence \n" 503 | 504 | "LOOP_BLOCK_STWB_FLUSH_DONE: \n" 505 | 506 | "cmp %[num_clear_pipe], %%r10 \n" 507 | "je LOOP_BLOCK_STWB_START \n" 508 | CLEAR_PIPELINE_x16 509 | "add $0x1, %%r10 \n" 510 | "jmp LOOP_BLOCK_STWB_FLUSH_DONE \n" 511 | 512 | "LOOP_BLOCK_STWB_START: \n" 513 | "xor %%r10, %%r10 \n" 514 | 515 | // Test 516 | TIMING_BEGIN 517 | STWB_xN_RAND_AVX512 518 | TIMING_END 519 | 520 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 521 | :[addr] "r" (addr), [flush_block] "r" (flush_block), [num_clear_pipe] "r" (num_clear_pipe) 522 | :REGISTERS, "%r10", "%r11", ZMM_0_15 523 | ); 524 | 525 | return (t_end - t_start); 526 | } 527 | 528 | uint64_t op_ld_block_lat(char* addr, bool flush_block, long num_clear_pipe) { 529 | uint64_t t_start = 0, t_end = 0; 530 | asm volatile( 531 | "mov %[addr], %%r11 \n" 532 | "xor %%r10, %%r10 \n" 533 | 534 | // flush data 535 | "cmp $0x0, %[flush_block] \n" 536 | "je LOOP_BLOCK_LD_FLUSH_DONE \n" 537 | "LOOP_BLOCK_LD_FLUSH: \n" 538 | "clflush (%%r11, %%r10) \n" 539 | "add $0x40, %%r10 \n" 540 | "cmp $0x10000, %%r10 \n" 541 | "jl LOOP_BLOCK_LD_FLUSH \n" 542 | "xor %%r10, %%r10 \n" 543 | "mfence \n" 544 | 545 | "LOOP_BLOCK_LD_FLUSH_DONE: \n" 546 | 547 | "cmp %[num_clear_pipe], %%r10 \n" 548 | "je LOOP_BLOCK_LD_START \n" 549 | CLEAR_PIPELINE_x16 550 | "add $0x1, %%r10 \n" 551 | "jmp LOOP_BLOCK_LD_FLUSH_DONE \n" 552 | 553 | "LOOP_BLOCK_LD_START: \n" 554 | "xor %%r10, %%r10 \n" 555 | 556 | // Test 557 | TIMING_BEGIN 558 | LD_xN_RAND_AVX512 559 | TIMING_END 560 | 561 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 562 | :[addr] "r" (addr), [flush_block] "r" (flush_block), [num_clear_pipe] "r" (num_clear_pipe) 563 | :REGISTERS, "%r10", "%r11", ZMM_0_15 564 | ); 565 | return (t_end - t_start); 566 | } 567 | 568 | uint64_t op_ntld_block_lat(char* addr, bool flush_block, long num_clear_pipe) { 569 | uint64_t t_start = 0, t_end = 0; 570 | asm volatile( 571 | "mov %[addr], %%r11 \n" 572 | "xor %%r10, %%r10 \n" 573 | 574 | // flush data 575 | "cmp $0x0, %[flush_block] \n" 576 | "je LOOP_BLOCK_NTLD_FLUSH_DONE \n" 577 | 
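        // flush loop: clflush the 64KB (0x10000-byte) block at a 64-byte
        // stride, then reset %r10 and mfence so the timed non-temporal
        // loads below start from memory rather than from the cache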
"LOOP_BLOCK_NTLD_FLUSH: \n" 578 | "clflush (%%r11, %%r10) \n" 579 | "add $0x40, %%r10 \n" 580 | "cmp $0x10000, %%r10 \n" 581 | "jl LOOP_BLOCK_NTLD_FLUSH \n" 582 | "xor %%r10, %%r10 \n" 583 | "mfence \n" 584 | 585 | "LOOP_BLOCK_NTLD_FLUSH_DONE: \n" 586 | 587 | "cmp %[num_clear_pipe], %%r10 \n" 588 | "je LOOP_BLOCK_NTLD_START \n" 589 | CLEAR_PIPELINE_x16 590 | "add $0x1, %%r10 \n" 591 | "jmp LOOP_BLOCK_NTLD_FLUSH_DONE \n" 592 | 593 | "LOOP_BLOCK_NTLD_START: \n" 594 | "xor %%r10, %%r10 \n" 595 | 596 | // Test 597 | TIMING_BEGIN 598 | NTLD_xN_RAND_AVX512 599 | TIMING_END 600 | 601 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 602 | :[addr] "r" (addr), [flush_block] "r" (flush_block), [num_clear_pipe] "r" (num_clear_pipe) 603 | :REGISTERS, "%r10", "%r11", ZMM_0_15 604 | ); 605 | return (t_end - t_start); 606 | } 607 | 608 | uint64_t op_ntst_block_lat(char* addr, bool flush_block, long num_clear_pipe) { 609 | uint64_t t_start = 0, t_end = 0; 610 | asm volatile( 611 | "mov %[addr], %%r11 \n" 612 | "xor %%r10, %%r10 \n" 613 | 614 | // flush data 615 | "cmp $0x0, %[flush_block] \n" 616 | "je LOOP_BLOCK_NTST_FLUSH_DONE \n" 617 | "LOOP_BLOCK_NTST_FLUSH: \n" 618 | "clflush (%%r11, %%r10) \n" 619 | "add $0x40, %%r10 \n" 620 | "cmp $0x10000, %%r10 \n" 621 | "jl LOOP_BLOCK_NTST_FLUSH \n" 622 | "xor %%r10, %%r10 \n" 623 | "mfence \n" 624 | 625 | "LOOP_BLOCK_NTST_FLUSH_DONE: \n" 626 | 627 | "cmp %[num_clear_pipe], %%r10 \n" 628 | "je LOOP_BLOCK_NTST_START \n" 629 | CLEAR_PIPELINE_x16 630 | "add $0x1, %%r10 \n" 631 | "jmp LOOP_BLOCK_NTST_FLUSH_DONE \n" 632 | 633 | "LOOP_BLOCK_NTST_START: \n" 634 | "xor %%r10, %%r10 \n" 635 | 636 | // Test 637 | TIMING_BEGIN 638 | NTST_xN_RAND_AVX512 639 | "sfence \n" 640 | TIMING_END 641 | 642 | :[t_start] "=r" (t_start), [t_end] "=r" (t_end) 643 | :[addr] "r" (addr), [flush_block] "r" (flush_block), [num_clear_pipe] "r" (num_clear_pipe) 644 | :REGISTERS, "%r10", "%r11", ZMM_0_15 645 | ); 646 | return (t_end - t_start); 647 | } 648 | 649 | void set_all_zmm(char* addr) { 650 | asm volatile( 651 | "mov %[addr], %%r9 \n" 652 | "xor %%r10, %%r10 \n" 653 | SIZELD_MACRO 654 | "mfence\n" 655 | : /* output */ 656 | :[addr]"r"(addr) /* input */ 657 | :"%r9", "%r10", REGISTERS, ZMM_0_15 /* clobbered register */ 658 | ); 659 | } 660 | 661 | void dump_zmm(char* dst, uint64_t size) { 662 | char* data_buf; 663 | posix_memalign((void**)(&data_buf), 4096, 4096); 664 | for (int i = 0; i < 4096; i++) { 665 | data_buf[i] = 0; 666 | } 667 | asm volatile( 668 | "mov %[addr], %%r9 \n" 669 | "xor %%r10, %%r10 \n" 670 | SIZEST_MACRO 671 | "mfence\n" 672 | : /* output */ 673 | :[addr]"r"(data_buf) /* input */ 674 | :"%r9", "%r10", REGISTERS, ZMM_0_15 /* clobbered register */ 675 | ); 676 | for (int i = 0; i < 1024; i++) { 677 | if (i % 64 == 0) { 678 | printf("zmm%d ", i / 64); 679 | } 680 | printf("%x", (unsigned char)data_buf[i]); 681 | if (i % 64 == 63) { 682 | printf("\n"); 683 | } 684 | } 685 | if (dst != NULL) { 686 | uint64_t copy_size = size > 1024 ? 
1024 : size; 687 | memcpy(data_buf, dst, copy_size); 688 | } 689 | free(data_buf); 690 | } 691 | -------------------------------------------------------------------------------- /memo_ae/src/test.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Developed by FAST Lab @ ECE-UIUC -- 2022-2023 3 | */ 4 | #define _GNU_SOURCE 5 | 6 | #include "test.h" 7 | #include "util.h" 8 | #include "workload.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define WAIT_SEC_US 5000000 20 | 21 | #define US_TO_S 1000000 22 | 23 | #define SET_VAL 15 24 | 25 | #define PAGE_SIZE 4096 26 | 27 | //#define CHECK_NT_ST 28 | 29 | //#define DUMP_ZMM 30 | 31 | // ================================================= 32 | // zmm test functions 33 | // ================================================= 34 | /* 35 | * These functions were used for dumping the data in 36 | * the avx zmm registers. In zmm0, zmm1, and zmm2, there 37 | * are a few bytes stayed constant despite storing to it. 38 | * Hence the weird conditions in `check_buff` 39 | * Maybe because of this: https://stackoverflow.com/questions/41819514/why-do-sse-instructions-preserve-the-upper-128-bit-of-the-ymm-registers 40 | */ 41 | void set_data_buf(char* data_buf, uint64_t size) { 42 | fprintf(stdout, "[set_data_buf] \n"); 43 | for (uint64_t i = 0; i < size; i++) { 44 | data_buf[i] = i; 45 | } 46 | } 47 | 48 | void clear_buff(char* buff, uint64_t size) { 49 | fprintf(stdout, "[clear_buff]\n"); 50 | for (uint64_t i = 0; i < size; i++) { 51 | buff[i] = 0; 52 | } 53 | } 54 | 55 | void check_buff(char* buff, uint64_t size) { 56 | fprintf(stdout, "[check_buff]\n"); 57 | uint64_t error_cnt = 0; 58 | uint64_t correct_cnt = 0; 59 | 60 | char* truth_buf; 61 | posix_memalign((void**)(&truth_buf), PAGE_SIZE, PAGE_SIZE); 62 | dump_zmm(truth_buf, 1024); 63 | int mod; 64 | for (uint64_t i = 0; i < size; i++) { 65 | mod = i % 1024; 66 | if (mod <= 0xF) continue; 67 | if (mod >= 0x40 && mod <= 0x4F) continue; 68 | if (mod >= 0x80 && mod <= 0x8F) continue; 69 | if (mod >= 0x140 && mod <= 0x14F) continue; 70 | if (buff[i] != truth_buf[mod]) { 71 | fprintf(stdout, "[check_buff] buff[%lx] != truth, found %x, expect %x\n", i, (unsigned char)buff[i], (unsigned char)truth_buf[mod]); 72 | error_cnt++; 73 | } else { 74 | correct_cnt++; 75 | } 76 | if (error_cnt > 100) { 77 | fprintf(stdout, "[check_buff] before exit with 100 error, correct_cnt %ld\n", correct_cnt); 78 | return; 79 | } 80 | } 81 | free(truth_buf); 82 | if (error_cnt == 0) { 83 | fprintf(stdout, "[check_buff] all correct! correct_cnt: %ld\n", correct_cnt); 84 | } 85 | } 86 | 87 | 88 | // ================================================= 89 | // benchmark wrapping functions 90 | // ================================================= 91 | 92 | static volatile int keepRunning = 1; 93 | void stop_threads(test_cfg_t* cfg_arr) { 94 | int num_thread; 95 | num_thread = cfg_arr[0].num_thread; 96 | fprintf(stdout, "[stop_threads]\n"); 97 | 98 | for (int i = 0; i < num_thread; i++) { 99 | cfg_arr[i].halt = 1; 100 | } 101 | } 102 | 103 | void intHandler(int dummy) { 104 | fprintf(stdout, "Ctrl-c detected, %d\n", dummy); 105 | keepRunning = 0; 106 | } 107 | 108 | /** 109 | * get_bw 110 | * @brief Read the curr_op_cnt from each thread and calculate the sum every *delay* microsecond. 111 | * @param cfg_arr array of config. 112 | * @param iter how many times we probe the bandwidth. 
113 | * @param delay interval between probes in unit of us. 114 | * @return none 115 | */ 116 | void get_bw(test_cfg_t* cfg_arr, int iter, int delay) { 117 | int num_thread; 118 | num_thread = cfg_arr[0].num_thread; 119 | uint64_t prev_cnt, curr_cnt; 120 | prev_cnt = 0; 121 | 122 | for (int j = 0; j < iter; j++) { 123 | curr_cnt = 0; 124 | for (int i = 0; i < num_thread; i++) { 125 | curr_cnt += cfg_arr[i].curr_op_cnt; 126 | } 127 | // fprintf(stdout, GRN "[get_bw] " RESET "%.1f MB/sec\n", ((double)(curr_cnt - prev_cnt) / (double)((1 << 20) * (delay / 1000000)))); 128 | /* for easier processing */ 129 | fprintf(stdout, "[get_bw] %.1f MB/sec\n", ((double)(curr_cnt - prev_cnt) / (double)((1 << 20) * (delay / US_TO_S)))); 130 | prev_cnt = curr_cnt; 131 | if (keepRunning == 0) break; 132 | 133 | usleep(delay); 134 | } 135 | } 136 | 137 | /* 138 | * wrapping: 139 | * main 140 | * run_test 141 | * thread_wrapper 142 | * lats/bw_wrapper 143 | * operation 144 | */ 145 | 146 | // spawn thread 147 | int run_test(test_cfg_t* cfg) { 148 | pthread_t* thread_arr; 149 | test_cfg_t* cfg_arr; 150 | test_cfg_t* curr_cfg; 151 | int ret, num_thread; 152 | 153 | // just in case 154 | signal(SIGINT, intHandler); 155 | 156 | // alloc 157 | num_thread = cfg->num_thread; 158 | thread_arr = malloc(num_thread * sizeof(pthread_t)); 159 | cfg_arr = malloc(num_thread * sizeof(test_cfg_t)); 160 | memset(cfg_arr, 0, num_thread * sizeof(test_cfg_t)); 161 | 162 | 163 | // clear buff 164 | #ifdef CHECK_NT_ST 165 | clear_buff(cfg->buf_a, cfg->total_buf_size); 166 | if (cfg->op == MOV) { 167 | clear_buff(cfg->buf_b, cfg->total_buf_size); 168 | } 169 | #endif 170 | 171 | // launch thread 172 | for (int i = 0; i < num_thread; i++) { 173 | curr_cfg = &(cfg_arr[i]); 174 | memcpy(curr_cfg, cfg, sizeof(test_cfg_t)); 175 | 176 | curr_cfg->thread_idx = i; 177 | curr_cfg->halt = 0; 178 | curr_cfg->curr_op_cnt = 0; 179 | 180 | curr_cfg->start_addr_a = &(curr_cfg->buf_a[i * curr_cfg->per_thread_size]); 181 | if (cfg->op == MOV) { 182 | curr_cfg->start_addr_b = &(curr_cfg->buf_b[i * curr_cfg->per_thread_size]); 183 | } 184 | ret = pthread_create(&thread_arr[i], NULL, thread_wrapper, (void*)curr_cfg); 185 | } 186 | 187 | // monitor threads 188 | switch(cfg->type) { 189 | case LATS_CLFLUSH: 190 | // do nothing, latency is monitored within a single thread 191 | break; 192 | case BW: 193 | get_bw(cfg_arr, cfg->op_iter, WAIT_SEC_US); 194 | break; 195 | case LATS_CHASE: 196 | // do nothing, latency is monitored within a single thread 197 | break; 198 | case BLOCK_LATS: 199 | // do nothing, latency is monitored within a single thread 200 | break; 201 | default: 202 | fprintf(stderr, "unknown type, thread idx: %d\n", cfg->thread_idx); 203 | } 204 | 205 | if (cfg->type == BW) { 206 | stop_threads(cfg_arr); 207 | } 208 | 209 | // join threads 210 | for (int i = 0; i < num_thread; i++) { 211 | ret = pthread_join(thread_arr[i], NULL); 212 | } 213 | 214 | free(cfg_arr); 215 | free(thread_arr); 216 | return ret; 217 | } 218 | 219 | void print_lats(test_cfg_t* cfg, uint64_t min, uint64_t max, uint64_t sum, uint64_t num_chase_block) { 220 | uint64_t avg_cycle = sum / cfg->op_iter; 221 | printf(GRN "[RESULT]" RESET " Max latency: %.1f, Min latency: %.1f\n", 1.0*max/cfg->tsc_freq, 1.0*min/cfg->tsc_freq); 222 | printf(GRN "[RESULT]" RESET " Max cycle : %lu, Min cycle : %lu, Avg cycle: %lu\n", max, min, avg_cycle); 223 | printf(GRN "[RESULT]" RESET " Thread %d average latency among %d iterations: %.1fns (assume %fGHz)\n", cfg->thread_idx, \ 224 | 
cfg->op_iter, 1.0 / cfg->tsc_freq * avg_cycle, cfg->tsc_freq); 225 | if (num_chase_block != 0) { 226 | printf(RED "[RESULT]" RESET "chase/block_lats average cycle among %d iterations: %.1fcycles\n", cfg->op_iter, 1.0 * avg_cycle / num_chase_block); 227 | printf(RED "[RESULT]" RESET " chase/block_lats average latency among %d iterations: %.1fns (assume %fGHz)\n", cfg->op_iter, 1.0 / cfg->tsc_freq * avg_cycle / num_chase_block, cfg->tsc_freq); 228 | } 229 | } 230 | 231 | int comp(const void* elem1, const void* elem2){ 232 | uint64_t f = *((uint64_t*)elem1); 233 | uint64_t s = *((uint64_t*)elem2); 234 | if (f > s) return 1; 235 | if (f < s) return -1; 236 | return 0; 237 | } 238 | 239 | void print_lats_median(test_cfg_t* cfg, uint64_t* result) { 240 | qsort(result, cfg->op_iter, sizeof(*result), comp); 241 | uint64_t median = result[cfg->op_iter / 2]; 242 | printf(RED "[RESULT]" RESET " Median latency among %d iterations: %.1fns (assume %fGHz)\n", cfg->op_iter, 1.0 / cfg->tsc_freq * median, cfg->tsc_freq); 243 | } 244 | 245 | void init_ptr_buf_random(test_cfg_t* cfg) { 246 | // FIXME -- this is NOT implemented 247 | printf(YEL "[INFO]" RESET " Random pointer chasing is NOT implemented. Building the ptr array in order\n"); 248 | chase_t* curr_ptr; 249 | chase_t* next_ptr; 250 | uint64_t num_chase_block; 251 | 252 | curr_ptr = (chase_t*)cfg->start_addr_a; 253 | num_chase_block = cfg->total_buf_size / 64; 254 | 255 | for (uint64_t i = 0; i < num_chase_block - 1; i++) { 256 | next_ptr = &(curr_ptr[1]); 257 | curr_ptr->ptr_arr[0] = next_ptr; 258 | curr_ptr = next_ptr; 259 | } 260 | curr_ptr->ptr_arr[0] = (chase_t*)cfg->start_addr_a; 261 | } 262 | 263 | uint64_t init_ptr_buf(test_cfg_t* cfg) { 264 | 265 | chase_t* curr_ptr; 266 | chase_t* next_ptr; 267 | uint64_t num_chase_block; 268 | 269 | printf(YEL "[INFO]" RESET " building pointer chasing link list, block size: %ld bytes ... 
\n", sizeof(chase_t)); 270 | 271 | if (cfg->random) { 272 | init_ptr_buf_random(cfg); 273 | 274 | } else { 275 | curr_ptr = (chase_t*)cfg->start_addr_a; 276 | num_chase_block = cfg->total_buf_size / sizeof(chase_t); 277 | 278 | for (uint64_t i = 0; i < num_chase_block - 1; i++) { 279 | //for (uint64_t i = 0; i < 5; i++) { 280 | next_ptr = &(curr_ptr[1]); 281 | curr_ptr->ptr_arr[0] = next_ptr; 282 | curr_ptr = next_ptr; 283 | } 284 | curr_ptr->ptr_arr[0] = (chase_t*)cfg->start_addr_a; 285 | } 286 | 287 | printf(YEL "[INFO]" RESET " num blocks: %lu \n", num_chase_block); 288 | printf(YEL "[INFO]" RESET " Chase confirm: start_addr: 0x%lx, first chase addr: 0x%lx\n", 289 | (uint64_t)cfg->start_addr_a, 290 | (uint64_t)(&((chase_t*)(cfg->start_addr_a))->ptr_arr[0])); 291 | 292 | curr_ptr = (chase_t*)cfg->start_addr_a; 293 | printf(YEL "[INFO]" RESET " Chase confirm: next_addr: 0x%lx, second chase addr: 0x%lx\n", 294 | (uint64_t)(&curr_ptr[1]), 295 | (uint64_t)(((chase_t*)(cfg->start_addr_a))->ptr_arr[0])); 296 | 297 | return num_chase_block; 298 | } 299 | 300 | void set_prefetching(int starting_core, bool prefetch_en, int core_num) { 301 | if (starting_core >= 0) { 302 | if (prefetch_en) { 303 | enable_prefetch(core_num); 304 | } else { 305 | disable_prefetch(core_num); 306 | } 307 | } 308 | } 309 | 310 | void restore_prefetching(int starting_core, bool prefetch_en, int core_num) { 311 | if (starting_core >= 0) { 312 | // restore to enable prefetching 313 | if (!prefetch_en) { 314 | enable_prefetch(core_num); 315 | } 316 | } 317 | } 318 | 319 | 320 | void lats_chase_wrapper(test_cfg_t* cfg) { 321 | uint64_t result, latency_sum = 0; 322 | uint64_t min, max; 323 | uint64_t num_chase_block; 324 | int core_num = cfg->thread_idx + cfg->starting_core; 325 | 326 | if (cfg->start_addr_a == NULL) { 327 | printf(RED "[ERROR]" RESET " init_ptr_buf, found null buf addr\n"); 328 | return; 329 | } 330 | 331 | num_chase_block = init_ptr_buf(cfg); 332 | 333 | set_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 334 | 335 | cfg->op_iter += 1; // for warm up 336 | for (int i = 0; i < cfg->op_iter; i++) { 337 | switch (cfg->op) { 338 | default: 339 | result = op_ptr_chase(cfg->start_addr_a, num_chase_block); 340 | break; 341 | } 342 | if (i >= 1) { 343 | latency_sum += result; 344 | printf("result = %lu\n", result); 345 | } else { 346 | printf("warmup = %lu\n", result); 347 | } 348 | if (i == 1) { 349 | min = result; 350 | max = result; 351 | } else { 352 | if (min < result) min = result; 353 | if (max > result) max = result; 354 | } 355 | } 356 | 357 | restore_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 358 | 359 | cfg->op_iter -= 1; // remove warm up for average 360 | print_lats(cfg, min, max, latency_sum, num_chase_block); 361 | } 362 | 363 | /* 364 | * This function tests multi-operation latency. 365 | * The scheme here goes as: 366 | * flush cacheline 367 | * mfence 368 | * issue many nop 369 | * mark time1 370 | * issue X ops 371 | * mark time2 372 | * 373 | * In most cases, the latency goes down as more ops 374 | * are issued in parallel. 
375 | */ 376 | void block_lats_wrapper(test_cfg_t* cfg) { 377 | uint64_t result, latency_sum = 0; 378 | uint64_t min, max; 379 | int offset; /* measure the latency op_iter times and take average */ 380 | int core_num = cfg->thread_idx + cfg->starting_core; 381 | 382 | uint64_t* result_buff; 383 | result_buff = malloc(sizeof(uint64_t) * cfg->op_iter); 384 | 385 | set_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 386 | 387 | flush_all_cache(); 388 | for (int i = 0; i < cfg->op_iter; i++) { 389 | offset = rand() % cfg->total_buf_size & ~(0xFFFF); 390 | switch (cfg->op) { 391 | case READ: 392 | result = op_ld_block_lat(cfg->start_addr_a + offset, cfg->flush_block, cfg->num_clear_pipe); 393 | break; 394 | case READ_NT: 395 | result = op_ntld_block_lat(cfg->start_addr_a + offset, cfg->flush_block, cfg->num_clear_pipe); 396 | break; 397 | case WRITE: 398 | result = op_stwb_block_lat(cfg->start_addr_a + offset, cfg->flush_block, cfg->num_clear_pipe); 399 | break; 400 | case WRITE_NT: 401 | result = op_ntst_block_lat(cfg->start_addr_a + offset, cfg->flush_block, cfg->num_clear_pipe); 402 | break; 403 | default: 404 | printf(RED "[ERROR]" RESET "bad cfg->op\n"); 405 | goto out; 406 | } 407 | if (i == 0) { 408 | min = result; 409 | max = result; 410 | } else { 411 | min = (result < min) ? result : min; 412 | max = (result > max) ? result : max; 413 | } 414 | latency_sum += result; 415 | result_buff[i] = result / BLOCK_xN; 416 | } 417 | print_lats(cfg, min, max, latency_sum, BLOCK_xN); 418 | print_lats_median(cfg, result_buff); 419 | 420 | out: 421 | restore_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 422 | free(result_buff); 423 | } 424 | 425 | /* 426 | * This function tests single operation latency. 427 | * The scheme here goes as: 428 | * flush cacheline 429 | * issue many nop 430 | * mark time1 431 | * issue 1 op 432 | * mark time2 433 | * 434 | * In most cases, the latency here is very high, 435 | * and the actual interpretation of this latency 436 | * may vary. 437 | */ 438 | void lats_clflush_wrapper(test_cfg_t* cfg) { 439 | 440 | uint64_t result, latency_sum = 0; 441 | uint64_t min, max; 442 | int offset; /* measure the latency op_iter times and take average */ 443 | int core_num = cfg->thread_idx + cfg->starting_core; 444 | 445 | uint64_t* result_buff; 446 | result_buff = malloc(sizeof(uint64_t) * cfg->op_iter); 447 | 448 | set_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 449 | 450 | flush_all_cache(); 451 | 452 | switch (cfg->op) 453 | { 454 | case READ: 455 | for (int i = 0; i < cfg->op_iter; i++){ 456 | offset = rand() % cfg->total_buf_size; 457 | result = op_ld_64B_lat(cfg->start_addr_a + offset); 458 | latency_sum += result; 459 | if (i == 0){ 460 | min = result; 461 | max = result; 462 | }else{ 463 | min = (result < min) ? result : min; 464 | max = (result > max) ? result : max; 465 | } 466 | result_buff[i] = result; 467 | } 468 | break; 469 | 470 | case READ_NT: 471 | for (int i = 0; i < cfg->op_iter; i++){ 472 | offset = rand() % cfg->total_buf_size; 473 | result = op_ntld_64B_lat(cfg->start_addr_a + offset); 474 | latency_sum += result; 475 | if (i == 0){ 476 | min = result; 477 | max = result; 478 | }else{ 479 | min = (result < min) ? result : min; 480 | max = (result > max) ? 
result : max; 481 | } 482 | result_buff[i] = result; 483 | } 484 | break; 485 | 486 | case WRITE: 487 | for (int i = 0; i < cfg->op_iter; i++){ 488 | offset = rand() % cfg->total_buf_size; 489 | result = op_st_cl_flush_64B_lat(cfg->start_addr_a + offset); 490 | latency_sum += result; 491 | if (i == 0){ 492 | min = result; 493 | max = result; 494 | }else{ 495 | min = (result < min) ? result : min; 496 | max = (result > max) ? result : max; 497 | } 498 | result_buff[i] = result; 499 | } 500 | break; 501 | 502 | case WRITE_NT: 503 | for (int i = 0; i < cfg->op_iter; i++){ 504 | offset = rand() % cfg->total_buf_size; 505 | result = op_ntst_64B_lat(cfg->start_addr_a + offset); 506 | latency_sum += result; 507 | if (i == 0){ 508 | min = result; 509 | max = result; 510 | }else{ 511 | min = (result < min) ? result : min; 512 | max = (result > max) ? result : max; 513 | } 514 | result_buff[i] = result; 515 | } 516 | break; 517 | 518 | default: 519 | break; 520 | } 521 | print_lats(cfg, min, max, latency_sum, 0); 522 | print_lats_median(cfg, result_buff); 523 | 524 | restore_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 525 | 526 | free(result_buff); 527 | return; 528 | } 529 | 530 | void bw_wrapper(test_cfg_t* cfg) { 531 | const uint64_t fixed_step = cfg->bw_granu << 6; 532 | // random steps will be aligned by fixed steps 533 | const uint64_t align_mask = (~(fixed_step - 1)); 534 | // random steps will be multiple of fix steps 535 | const uint64_t step_bound_mask = ~(align_mask << 6); 536 | 537 | cfg->curr_op_cnt = 0; 538 | char* src = cfg->start_addr_a; 539 | char* dst = cfg->start_addr_b; 540 | uint64_t rand_offset = rand(); 541 | uint64_t curr_step = fixed_step; 542 | uint64_t counter = 0; 543 | int stall_cnt; 544 | int core_num = cfg->thread_idx + cfg->starting_core; 545 | int rw_ratio = cfg->read_ratio; // rw_ratio can be 1, 2, 3. 546 | int mixed_switch = 0; 547 | 548 | set_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 549 | 550 | if (cfg->op == MOV) { 551 | printf("src: 0x%lx, dst: 0x%lx\n", (uint64_t)src, (uint64_t)dst); 552 | } 553 | 554 | /* sanity check */ 555 | if (cfg->op == MIXED && rw_ratio == 2 && fixed_step != 384) { 556 | printf("[WARNING] You are using a BW_granu other than 384 for R2W1 mixed bandwidth test. 
Data will be skewed.\n"); 557 | } 558 | 559 | #ifdef DUMP_ZMM 560 | char* data_buf; 561 | 562 | // set all zmm 563 | posix_memalign((void**)(&data_buf), PAGE_SIZE, PAGE_SIZE); 564 | set_data_buf(data_buf, PAGE_SIZE); 565 | set_all_zmm(data_buf); 566 | dump_zmm(NULL, 0); 567 | free(data_buf); 568 | #endif 569 | 570 | 571 | while (1) { 572 | if(counter + fixed_step > cfg->per_thread_size){ 573 | //fprintf(stdout, "reach end, reset\n"); 574 | counter = 0; 575 | src = cfg->start_addr_a; 576 | dst = cfg->start_addr_b; 577 | } 578 | switch(cfg->op) { 579 | case WRITE: 580 | op_st(src, fixed_step); 581 | break; 582 | 583 | case WRITE_NT: 584 | op_ntst(src, fixed_step); 585 | break; 586 | 587 | case READ: 588 | op_ld(src, fixed_step); 589 | break; 590 | 591 | case READ_NT: 592 | op_ntld(src, fixed_step); 593 | break; 594 | 595 | case MOV: 596 | op_movdir64B(src, dst, fixed_step); 597 | break; 598 | 599 | case MIXED: 600 | // op_mixed(src, fixed_step, rw_ratio); 601 | if (mixed_switch == rw_ratio){ 602 | op_st(src, fixed_step); 603 | mixed_switch = 0; 604 | } else { 605 | op_ld(src, fixed_step); 606 | mixed_switch += 1; 607 | } 608 | break; 609 | 610 | default: 611 | fprintf(stderr, "unknown op, thread idx: %d\n", cfg->thread_idx); 612 | goto out; 613 | } 614 | // ==================================== 615 | // Stepping, rand/seq 616 | // ==================================== 617 | // increment number of byte operated on 618 | cfg->curr_op_cnt += fixed_step; 619 | 620 | /* update the address of interest */ 621 | if (cfg->random) { 622 | curr_step = (curr_step ^ xorshf96(&rand_offset)) & align_mask; 623 | curr_step &= step_bound_mask; 624 | //fprintf(stdout, "curr_step: 0x%lx\n", curr_step); 625 | } else { 626 | curr_step = fixed_step; 627 | } 628 | counter += curr_step; 629 | src += curr_step; 630 | dst += curr_step; 631 | 632 | // ==================================== 633 | // Stalling 634 | // ==================================== 635 | // create artificial stalling if desired 636 | stall_cnt = 0; 637 | while (stall_cnt < cfg->stall_ratio) { 638 | op_stall(); 639 | stall_cnt++; 640 | } 641 | 642 | if (cfg->halt) { 643 | fprintf(stdout, "thread idx: %d end\n", cfg->thread_idx); 644 | #ifdef CHECK_NT_ST 645 | if (cfg->op == WRITE_NT) { 646 | check_buff(cfg->buf_a, cfg->total_buf_size); 647 | } 648 | #endif 649 | break; 650 | } 651 | } 652 | out: 653 | restore_prefetching(cfg->starting_core, cfg->prefetch_en, core_num); 654 | } 655 | 656 | // taken from https://stackoverflow.com/questions/1407786/how-to-set-cpu-affinity-of-a-particular-pthread 657 | int stick_this_thread_to_core(int core_id) { 658 | int num_cores = sysconf(_SC_NPROCESSORS_ONLN); 659 | if (core_id < 0 || core_id >= num_cores) 660 | return EINVAL; 661 | 662 | cpu_set_t cpuset; 663 | CPU_ZERO(&cpuset); 664 | CPU_SET(core_id, &cpuset); 665 | 666 | pthread_t current_thread = pthread_self(); 667 | return pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset); 668 | } 669 | 670 | // dispatch to different workload wrappers 671 | void* thread_wrapper(void* arg) { 672 | test_cfg_t* cfg; 673 | cfg = (test_cfg_t*)arg; 674 | fprintf(stdout, "thread %d created.\n", cfg->thread_idx); 675 | 676 | int res; 677 | if (cfg->core_a >= 0 && cfg->core_b >= 0) { 678 | if (cfg->thread_idx == 0) { 679 | cfg->starting_core = cfg->core_a; // to make prefetch pin to correct core 680 | 681 | } else if (cfg->thread_idx == 1) { 682 | cfg->starting_core = cfg->core_b; // to make prefetch pin to correct core 683 | 684 | } else { 685 | printf(RED "[ERROR]" 
RESET "more than 2 thread in testing pinning to core a, b\n"); 686 | return NULL; 687 | } 688 | } 689 | 690 | if (cfg->starting_core >= 0) { 691 | res = stick_this_thread_to_core(cfg->thread_idx + cfg->starting_core); 692 | } else { 693 | printf(YEL "[INFO]" RESET " core pinning is not specified. Prefetching options will be ignored\n"); 694 | } 695 | 696 | if(res != 0){ 697 | printf(RED "[ERROR]" RESET " Thread affinity set failure.\n"); 698 | return NULL; 699 | } 700 | 701 | switch(cfg->type) { 702 | case LATS_CLFLUSH: 703 | lats_clflush_wrapper(cfg); 704 | break; 705 | case BW: 706 | bw_wrapper(cfg); 707 | break; 708 | case LATS_CHASE: 709 | lats_chase_wrapper(cfg); 710 | break; 711 | case BLOCK_LATS: 712 | block_lats_wrapper(cfg); 713 | break; 714 | default: 715 | fprintf(stderr, "unkown type, thread idx: %d\n", cfg->thread_idx); 716 | } 717 | return NULL; 718 | } 719 | 720 | -------------------------------------------------------------------------------- /memo_ae/src/workload.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Developed by FAST Lab @ ECE-UIUC -- 2022-2023 3 | * Some part of this file follows the methodology of FAST-20 Yang's resporitory 4 | * @ https://github.com/NVSL/OptaneStudy/tree/master 5 | */ 6 | #ifndef WORKLOAD_H 7 | #define WORKLOAD_H 8 | 9 | #include 10 | #include 11 | 12 | /* 13 | #define BLOCK_xN 1 14 | #define LD_xN_RAND_AVX512 LD_x1_RAND_AVX512 15 | #define STWB_xN_RAND_AVX512 STWB_x1_RAND_AVX512 16 | #define NTLD_xN_RAND_AVX512 NTLD_x1_RAND_AVX512 17 | #define NTST_xN_RAND_AVX512 NTST_x1_RAND_AVX512 18 | */ 19 | 20 | /* 21 | #define BLOCK_xN 8 22 | #define LD_xN_RAND_AVX512 LD_x8_RAND_AVX512 23 | #define STWB_xN_RAND_AVX512 STWB_x8_RAND_AVX512 24 | #define NTLD_xN_RAND_AVX512 NTLD_x8_RAND_AVX512 25 | #define NTST_xN_RAND_AVX512 NTST_x8_RAND_AVX512 26 | */ 27 | 28 | #define BLOCK_xN 16 29 | #define LD_xN_RAND_AVX512 LD_x16_RAND_AVX512 30 | #define STWB_xN_RAND_AVX512 STWB_x16_RAND_AVX512 31 | #define NTLD_xN_RAND_AVX512 NTLD_x16_RAND_AVX512 32 | #define NTST_xN_RAND_AVX512 NTST_x16_RAND_AVX512 33 | //#define LD_xN_RAND_AVX512 LD_LFENCE_x16_RAND_AVX512 34 | //#define STWB_xN_RAND_AVX512 STWB_SFENCE_x16_RAND_AVX512 35 | //#define NTLD_xN_RAND_AVX512 NTLD_LFENCE_x16_RAND_AVX512 36 | //#define NTST_xN_RAND_AVX512 NTST_SFENCE_x16_RAND_AVX512 37 | 38 | /* 39 | #define BLOCK_xN 32 40 | #define LD_xN_RAND_AVX512 LD_x32_RAND_AVX512 41 | #define STWB_xN_RAND_AVX512 STWB_x32_RAND_AVX512 42 | #define NTLD_xN_RAND_AVX512 NTLD_x32_RAND_AVX512 43 | #define NTST_xN_RAND_AVX512 NTST_x32_RAND_AVX512 44 | */ 45 | 46 | void op_ntld(char* addr, long size); 47 | 48 | void op_ld(char* addr, long size); 49 | 50 | void op_ntst(char* addr, long size); 51 | 52 | void op_st(char* addr, long size); 53 | 54 | void op_stall(); 55 | 56 | void op_movdir64B(char* src_addr, char* dst_addr, long size); 57 | 58 | void op_mixed(char* addr, long size, int ratio); 59 | 60 | uint64_t op_ntld_32B_lat(char* addr); 61 | 62 | uint64_t op_ntld_64B_lat(char* addr); 63 | 64 | uint64_t op_ntst_64B_lat(char* addr); 65 | 66 | uint64_t op_ld_64B_lat(char* addr); 67 | 68 | uint64_t op_st_64B_lat(char* addr); 69 | 70 | uint64_t op_st_cl_flush_64B_lat(char* addr); 71 | 72 | uint64_t op_st_32B_lat(char* addr); 73 | 74 | uint64_t op_ptr_chase(char* addr, uint64_t num_chase_block); 75 | 76 | uint64_t op_ld_block_lat(char* addr, bool flush_block, long num_clear_pipe); 77 | 78 | uint64_t op_ntld_block_lat(char* addr, bool flush_block, long num_clear_pipe); 79 | 80 | 
uint64_t op_stwb_block_lat(char* addr, bool flush_block, long num_clear_pipe); 81 | 82 | uint64_t op_ntst_block_lat(char* addr, bool flush_block, long num_clear_pipe); 83 | 84 | void set_all_zmm(char* addr); 85 | 86 | void dump_zmm(char* dst, uint64_t size); 87 | 88 | /* Assembly to perform non-temporal load */ 89 | #define SIZENTLD_64_AVX512 \ 90 | "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n" \ 91 | "add $0x40, %%r10 \n" 92 | 93 | #define SIZENTLD_128_AVX512 \ 94 | "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n" \ 95 | "vmovntdqa 0x40(%%r9, %%r10), %%zmm1 \n" \ 96 | "add $0x80, %%r10 \n" 97 | 98 | #define SIZENTLD_256_AVX512 \ 99 | "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n" \ 100 | "vmovntdqa 0x40(%%r9, %%r10), %%zmm1 \n" \ 101 | "vmovntdqa 0x80(%%r9, %%r10), %%zmm2 \n" \ 102 | "vmovntdqa 0xc0(%%r9, %%r10), %%zmm3 \n" \ 103 | "add $0x100, %%r10 \n" 104 | 105 | #define SIZENTLD_512_AVX512 \ 106 | "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n" \ 107 | "vmovntdqa 0x40(%%r9, %%r10), %%zmm1 \n" \ 108 | "vmovntdqa 0x80(%%r9, %%r10), %%zmm2 \n" \ 109 | "vmovntdqa 0xc0(%%r9, %%r10), %%zmm3 \n" \ 110 | "vmovntdqa 0x100(%%r9, %%r10), %%zmm4 \n" \ 111 | "vmovntdqa 0x140(%%r9, %%r10), %%zmm5 \n" \ 112 | "vmovntdqa 0x180(%%r9, %%r10), %%zmm6 \n" \ 113 | "vmovntdqa 0x1c0(%%r9, %%r10), %%zmm7 \n" \ 114 | "add $0x200, %%r10 \n" 115 | 116 | #define SIZENTLD_1024_AVX512 \ 117 | "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n" \ 118 | "vmovntdqa 0x40(%%r9, %%r10), %%zmm1 \n" \ 119 | "vmovntdqa 0x80(%%r9, %%r10), %%zmm2 \n" \ 120 | "vmovntdqa 0xc0(%%r9, %%r10), %%zmm3 \n" \ 121 | "vmovntdqa 0x100(%%r9, %%r10), %%zmm4 \n" \ 122 | "vmovntdqa 0x140(%%r9, %%r10), %%zmm5 \n" \ 123 | "vmovntdqa 0x180(%%r9, %%r10), %%zmm6 \n" \ 124 | "vmovntdqa 0x1c0(%%r9, %%r10), %%zmm7 \n" \ 125 | "vmovntdqa 0x200(%%r9, %%r10), %%zmm8 \n" \ 126 | "vmovntdqa 0x240(%%r9, %%r10), %%zmm9 \n" \ 127 | "vmovntdqa 0x280(%%r9, %%r10), %%zmm10 \n" \ 128 | "vmovntdqa 0x2c0(%%r9, %%r10), %%zmm11 \n" \ 129 | "vmovntdqa 0x300(%%r9, %%r10), %%zmm12 \n" \ 130 | "vmovntdqa 0x340(%%r9, %%r10), %%zmm13 \n" \ 131 | "vmovntdqa 0x380(%%r9, %%r10), %%zmm14 \n" \ 132 | "vmovntdqa 0x3c0(%%r9, %%r10), %%zmm15 \n" \ 133 | "add $0x400, %%r10 \n" 134 | 135 | #define NTLD_x1_RAND_AVX512 \ 136 | "vmovntdqa 0xd6c0(%%r11, %%r10), %%zmm0 \n" 137 | 138 | #define NTLD_x8_RAND_AVX512 \ 139 | "vmovntdqa 0xd6c0(%%r11, %%r10), %%zmm0 \n" \ 140 | "vmovntdqa 0xb680(%%r11, %%r10), %%zmm1 \n" \ 141 | "vmovntdqa 0x7040(%%r11, %%r10), %%zmm2 \n" \ 142 | "vmovntdqa 0x36c0(%%r11, %%r10), %%zmm3 \n" \ 143 | "vmovntdqa 0x3b80(%%r11, %%r10), %%zmm4 \n" \ 144 | "vmovntdqa 0x9340(%%r11, %%r10), %%zmm5 \n" \ 145 | "vmovntdqa 0x9ec0(%%r11, %%r10), %%zmm6 \n" \ 146 | "vmovntdqa 0x6e80(%%r11, %%r10), %%zmm7 \n" 147 | 148 | #define NTLD_x16_RAND_AVX512 \ 149 | "vmovntdqa 0xc840(%%r11, %%r10), %%zmm0 \n" \ 150 | "vmovntdqa 0xf180(%%r11, %%r10), %%zmm1 \n" \ 151 | "vmovntdqa 0xce40(%%r11, %%r10), %%zmm2 \n" \ 152 | "vmovntdqa 0x300(%%r11, %%r10), %%zmm3 \n" \ 153 | "vmovntdqa 0x6d40(%%r11, %%r10), %%zmm4 \n" \ 154 | "vmovntdqa 0xa440(%%r11, %%r10), %%zmm5 \n" \ 155 | "vmovntdqa 0xa9c0(%%r11, %%r10), %%zmm6 \n" \ 156 | "vmovntdqa 0xe980(%%r11, %%r10), %%zmm7 \n" \ 157 | "vmovntdqa 0xc940(%%r11, %%r10), %%zmm8 \n" \ 158 | "vmovntdqa 0x8200(%%r11, %%r10), %%zmm9 \n" \ 159 | "vmovntdqa 0xbac0(%%r11, %%r10), %%zmm10 \n" \ 160 | "vmovntdqa 0x8940(%%r11, %%r10), %%zmm11 \n" \ 161 | "vmovntdqa 0xe700(%%r11, %%r10), %%zmm12 \n" \ 162 | "vmovntdqa 0xe100(%%r11, %%r10), %%zmm13 \n" \ 163 | "vmovntdqa 0x8f40(%%r11, %%r10), %%zmm14 
\n" \ 164 | "vmovntdqa 0xf2c0(%%r11, %%r10), %%zmm15 \n" 165 | 166 | #define NTLD_x32_RAND_AVX512 \ 167 | "vmovntdqa 0x3d80(%%r11, %%r10), %%zmm0 \n" \ 168 | "vmovntdqa 0x1780(%%r11, %%r10), %%zmm1 \n" \ 169 | "vmovntdqa 0x4700(%%r11, %%r10), %%zmm2 \n" \ 170 | "vmovntdqa 0xb980(%%r11, %%r10), %%zmm3 \n" \ 171 | "vmovntdqa 0xaa00(%%r11, %%r10), %%zmm4 \n" \ 172 | "vmovntdqa 0xad00(%%r11, %%r10), %%zmm5 \n" \ 173 | "vmovntdqa 0x9a40(%%r11, %%r10), %%zmm6 \n" \ 174 | "vmovntdqa 0x5300(%%r11, %%r10), %%zmm7 \n" \ 175 | "vmovntdqa 0x7d40(%%r11, %%r10), %%zmm8 \n" \ 176 | "vmovntdqa 0xf480(%%r11, %%r10), %%zmm9 \n" \ 177 | "vmovntdqa 0x9480(%%r11, %%r10), %%zmm10 \n" \ 178 | "vmovntdqa 0xbd80(%%r11, %%r10), %%zmm11 \n" \ 179 | "vmovntdqa 0x3fc0(%%r11, %%r10), %%zmm12 \n" \ 180 | "vmovntdqa 0xcdc0(%%r11, %%r10), %%zmm13 \n" \ 181 | "vmovntdqa 0x480(%%r11, %%r10), %%zmm14 \n" \ 182 | "vmovntdqa 0xb400(%%r11, %%r10), %%zmm15 \n" \ 183 | "vmovntdqa 0xb500(%%r11, %%r10), %%zmm16 \n" \ 184 | "vmovntdqa 0x49c0(%%r11, %%r10), %%zmm17 \n" \ 185 | "vmovntdqa 0x3380(%%r11, %%r10), %%zmm18 \n" \ 186 | "vmovntdqa 0x36c0(%%r11, %%r10), %%zmm19 \n" \ 187 | "vmovntdqa 0x14c0(%%r11, %%r10), %%zmm20 \n" \ 188 | "vmovntdqa 0xcc80(%%r11, %%r10), %%zmm21 \n" \ 189 | "vmovntdqa 0xb600(%%r11, %%r10), %%zmm22 \n" \ 190 | "vmovntdqa 0x6840(%%r11, %%r10), %%zmm23 \n" \ 191 | "vmovntdqa 0x6c80(%%r11, %%r10), %%zmm24 \n" \ 192 | "vmovntdqa 0x2c0(%%r11, %%r10), %%zmm25 \n" \ 193 | "vmovntdqa 0x62c0(%%r11, %%r10), %%zmm26 \n" \ 194 | "vmovntdqa 0x79c0(%%r11, %%r10), %%zmm27 \n" \ 195 | "vmovntdqa 0xfe40(%%r11, %%r10), %%zmm28 \n" \ 196 | "vmovntdqa 0xc200(%%r11, %%r10), %%zmm29 \n" \ 197 | "vmovntdqa 0x58c0(%%r11, %%r10), %%zmm30 \n" \ 198 | "vmovntdqa 0x9b40(%%r11, %%r10), %%zmm31 \n" 199 | 200 | /* Assembly to perform non-temporal store */ 201 | #define SIZENTST_64_AVX512 \ 202 | "vmovntdq %%zmm0, 0x0(%%r9, %%r10) \n" \ 203 | "add $0x40, %%r10 \n" 204 | 205 | #define SIZENTST_128_AVX512 \ 206 | "vmovntdq %%zmm0, 0x0(%%r9, %%r10) \n" \ 207 | "vmovntdq %%zmm0, 0x40(%%r9, %%r10) \n" \ 208 | "add $0x80, %%r10 \n" 209 | 210 | #define SIZENTST_256_AVX512 \ 211 | "vmovntdq %%zmm0, 0x0(%%r9, %%r10) \n" \ 212 | "vmovntdq %%zmm0, 0x40(%%r9, %%r10) \n" \ 213 | "vmovntdq %%zmm0, 0x80(%%r9, %%r10) \n" \ 214 | "vmovntdq %%zmm0, 0xc0(%%r9, %%r10) \n" \ 215 | "add $0x100, %%r10 \n" 216 | 217 | #define SIZENTST_512_AVX512 \ 218 | "vmovntdq %%zmm0, 0x0(%%r9, %%r10) \n" \ 219 | "vmovntdq %%zmm0, 0x40(%%r9, %%r10) \n" \ 220 | "vmovntdq %%zmm0, 0x80(%%r9, %%r10) \n" \ 221 | "vmovntdq %%zmm0, 0xc0(%%r9, %%r10) \n" \ 222 | "vmovntdq %%zmm0, 0x100(%%r9, %%r10) \n" \ 223 | "vmovntdq %%zmm0, 0x140(%%r9, %%r10) \n" \ 224 | "vmovntdq %%zmm0, 0x180(%%r9, %%r10) \n" \ 225 | "vmovntdq %%zmm0, 0x1c0(%%r9, %%r10) \n" \ 226 | "add $0x200, %%r10 \n" 227 | 228 | #define SIZENTST_1024_AVX512 \ 229 | "vmovntdq %%zmm0, 0x0(%%r9, %%r10) \n" \ 230 | "vmovntdq %%zmm1, 0x40(%%r9, %%r10) \n" \ 231 | "vmovntdq %%zmm2, 0x80(%%r9, %%r10) \n" \ 232 | "vmovntdq %%zmm3, 0xc0(%%r9, %%r10) \n" \ 233 | "vmovntdq %%zmm4, 0x100(%%r9, %%r10) \n" \ 234 | "vmovntdq %%zmm5, 0x140(%%r9, %%r10) \n" \ 235 | "vmovntdq %%zmm6, 0x180(%%r9, %%r10) \n" \ 236 | "vmovntdq %%zmm7, 0x1c0(%%r9, %%r10) \n" \ 237 | "vmovntdq %%zmm8, 0x200(%%r9, %%r10) \n" \ 238 | "vmovntdq %%zmm9, 0x240(%%r9, %%r10) \n" \ 239 | "vmovntdq %%zmm10, 0x280(%%r9, %%r10) \n" \ 240 | "vmovntdq %%zmm11, 0x2c0(%%r9, %%r10) \n" \ 241 | "vmovntdq %%zmm12, 0x300(%%r9, %%r10) \n" \ 242 | "vmovntdq %%zmm13, 0x340(%%r9, 
%%r10) \n" \ 243 | "vmovntdq %%zmm14, 0x380(%%r9, %%r10) \n" \ 244 | "vmovntdq %%zmm15, 0x3c0(%%r9, %%r10) \n" \ 245 | "add $0x400, %%r10 \n" 246 | 247 | #define NTST_x1_RAND_AVX512 \ 248 | "vmovntdq %%zmm0, 0x9680(%%r11, %%r10) \n" 249 | 250 | #define NTST_x8_RAND_AVX512 \ 251 | "vmovntdq %%zmm0, 0x9680(%%r11, %%r10) \n" \ 252 | "vmovntdq %%zmm1, 0x15c0(%%r11, %%r10) \n" \ 253 | "vmovntdq %%zmm2, 0x4a80(%%r11, %%r10) \n" \ 254 | "vmovntdq %%zmm3, 0xb800(%%r11, %%r10) \n" \ 255 | "vmovntdq %%zmm4, 0x9700(%%r11, %%r10) \n" \ 256 | "vmovntdq %%zmm5, 0x2000(%%r11, %%r10) \n" \ 257 | "vmovntdq %%zmm6, 0x8d40(%%r11, %%r10) \n" \ 258 | "vmovntdq %%zmm7, 0xb640(%%r11, %%r10) \n" 259 | 260 | #define NTST_x16_RAND_AVX512 \ 261 | "vmovntdq %%zmm0, 0x3680(%%r11, %%r10) \n" \ 262 | "vmovntdq %%zmm1, 0x4140(%%r11, %%r10) \n" \ 263 | "vmovntdq %%zmm2, 0x2cc0(%%r11, %%r10) \n" \ 264 | "vmovntdq %%zmm3, 0x28c0(%%r11, %%r10) \n" \ 265 | "vmovntdq %%zmm4, 0x8440(%%r11, %%r10) \n" \ 266 | "vmovntdq %%zmm5, 0xec40(%%r11, %%r10) \n" \ 267 | "vmovntdq %%zmm6, 0x1080(%%r11, %%r10) \n" \ 268 | "vmovntdq %%zmm7, 0x6e00(%%r11, %%r10) \n" \ 269 | "vmovntdq %%zmm8, 0x3300(%%r11, %%r10) \n" \ 270 | "vmovntdq %%zmm9, 0xef80(%%r11, %%r10) \n" \ 271 | "vmovntdq %%zmm10, 0xb900(%%r11, %%r10) \n" \ 272 | "vmovntdq %%zmm11, 0x2280(%%r11, %%r10) \n" \ 273 | "vmovntdq %%zmm12, 0x85c0(%%r11, %%r10) \n" \ 274 | "vmovntdq %%zmm13, 0x240(%%r11, %%r10) \n" \ 275 | "vmovntdq %%zmm14, 0x40c0(%%r11, %%r10) \n" \ 276 | "vmovntdq %%zmm15, 0x3100(%%r11, %%r10) \n" 277 | 278 | #define NTST_x32_RAND_AVX512 \ 279 | "vmovntdq %%zmm0, 0x4240(%%r11, %%r10) \n" \ 280 | "vmovntdq %%zmm1, 0x6400(%%r11, %%r10) \n" \ 281 | "vmovntdq %%zmm2, 0xe4c0(%%r11, %%r10) \n" \ 282 | "vmovntdq %%zmm3, 0xf200(%%r11, %%r10) \n" \ 283 | "vmovntdq %%zmm4, 0xc400(%%r11, %%r10) \n" \ 284 | "vmovntdq %%zmm5, 0x9e80(%%r11, %%r10) \n" \ 285 | "vmovntdq %%zmm6, 0xaf80(%%r11, %%r10) \n" \ 286 | "vmovntdq %%zmm7, 0xb380(%%r11, %%r10) \n" \ 287 | "vmovntdq %%zmm8, 0xc7c0(%%r11, %%r10) \n" \ 288 | "vmovntdq %%zmm9, 0x65c0(%%r11, %%r10) \n" \ 289 | "vmovntdq %%zmm10, 0x5b40(%%r11, %%r10) \n" \ 290 | "vmovntdq %%zmm11, 0x8640(%%r11, %%r10) \n" \ 291 | "vmovntdq %%zmm12, 0x67c0(%%r11, %%r10) \n" \ 292 | "vmovntdq %%zmm13, 0xaa80(%%r11, %%r10) \n" \ 293 | "vmovntdq %%zmm14, 0x7640(%%r11, %%r10) \n" \ 294 | "vmovntdq %%zmm15, 0x6d40(%%r11, %%r10) \n" \ 295 | "vmovntdq %%zmm16, 0x1400(%%r11, %%r10) \n" \ 296 | "vmovntdq %%zmm17, 0x3fc0(%%r11, %%r10) \n" \ 297 | "vmovntdq %%zmm18, 0x6640(%%r11, %%r10) \n" \ 298 | "vmovntdq %%zmm19, 0x1f40(%%r11, %%r10) \n" \ 299 | "vmovntdq %%zmm20, 0x3a00(%%r11, %%r10) \n" \ 300 | "vmovntdq %%zmm21, 0x1080(%%r11, %%r10) \n" \ 301 | "vmovntdq %%zmm22, 0x9c0(%%r11, %%r10) \n" \ 302 | "vmovntdq %%zmm23, 0xf80(%%r11, %%r10) \n" \ 303 | "vmovntdq %%zmm24, 0xcb00(%%r11, %%r10) \n" \ 304 | "vmovntdq %%zmm25, 0x7e80(%%r11, %%r10) \n" \ 305 | "vmovntdq %%zmm26, 0x99c0(%%r11, %%r10) \n" \ 306 | "vmovntdq %%zmm27, 0x680(%%r11, %%r10) \n" \ 307 | "vmovntdq %%zmm28, 0x12c0(%%r11, %%r10) \n" \ 308 | "vmovntdq %%zmm29, 0x2880(%%r11, %%r10) \n" \ 309 | "vmovntdq %%zmm30, 0xd140(%%r11, %%r10) \n" \ 310 | "vmovntdq %%zmm31, 0xf400(%%r11, %%r10) \n" 311 | 312 | /* temporal load */ 313 | #define SIZELD_1024_AVX512 \ 314 | "vmovdqa64 0x0(%%r9, %%r10), %%zmm0 \n" \ 315 | "vmovdqa64 0x40(%%r9, %%r10), %%zmm1 \n" \ 316 | "vmovdqa64 0x80(%%r9, %%r10), %%zmm2 \n" \ 317 | "vmovdqa64 0xc0(%%r9, %%r10), %%zmm3 \n" \ 318 | "vmovdqa64 0x100(%%r9, %%r10), %%zmm4 \n" \ 319 | 
"vmovdqa64 0x140(%%r9, %%r10), %%zmm5 \n" \ 320 | "vmovdqa64 0x180(%%r9, %%r10), %%zmm6 \n" \ 321 | "vmovdqa64 0x1c0(%%r9, %%r10), %%zmm7 \n" \ 322 | "vmovdqa64 0x200(%%r9, %%r10), %%zmm8 \n" \ 323 | "vmovdqa64 0x240(%%r9, %%r10), %%zmm9 \n" \ 324 | "vmovdqa64 0x280(%%r9, %%r10), %%zmm10 \n" \ 325 | "vmovdqa64 0x2c0(%%r9, %%r10), %%zmm11 \n" \ 326 | "vmovdqa64 0x300(%%r9, %%r10), %%zmm12 \n" \ 327 | "vmovdqa64 0x340(%%r9, %%r10), %%zmm13 \n" \ 328 | "vmovdqa64 0x380(%%r9, %%r10), %%zmm14 \n" \ 329 | "vmovdqa64 0x3c0(%%r9, %%r10), %%zmm15 \n" \ 330 | "add $0x400, %%r10 \n" 331 | 332 | #define LD_x1_RAND_AVX512 \ 333 | "vmovdqa64 0x4140(%%r11, %%r10), %%zmm0 \n" 334 | 335 | #define LD_x8_RAND_AVX512 \ 336 | "vmovdqa64 0x4140(%%r11, %%r10), %%zmm0 \n" \ 337 | "vmovdqa64 0xf340(%%r11, %%r10), %%zmm1 \n" \ 338 | "vmovdqa64 0x2640(%%r11, %%r10), %%zmm2 \n" \ 339 | "vmovdqa64 0x1000(%%r11, %%r10), %%zmm3 \n" \ 340 | "vmovdqa64 0xda40(%%r11, %%r10), %%zmm4 \n" \ 341 | "vmovdqa64 0x5200(%%r11, %%r10), %%zmm5 \n" \ 342 | "vmovdqa64 0x180(%%r11, %%r10), %%zmm6 \n" \ 343 | "vmovdqa64 0xf3c0(%%r11, %%r10), %%zmm7 \n" 344 | 345 | #define LD_LFENCE_x16_RAND_AVX512 \ 346 | "vmovdqa64 0x2a80(%%r11, %%r10), %%zmm0 \n lfence \n" \ 347 | "vmovdqa64 0x680(%%r11, %%r10), %%zmm1 \n lfence \n" \ 348 | "vmovdqa64 0x8500(%%r11, %%r10), %%zmm2 \n lfence \n" \ 349 | "vmovdqa64 0x8980(%%r11, %%r10), %%zmm3 \n lfence \n" \ 350 | "vmovdqa64 0x6d40(%%r11, %%r10), %%zmm4 \n lfence \n" \ 351 | "vmovdqa64 0xf7c0(%%r11, %%r10), %%zmm5 \n lfence \n" \ 352 | "vmovdqa64 0x4640(%%r11, %%r10), %%zmm6 \n lfence \n" \ 353 | "vmovdqa64 0x1480(%%r11, %%r10), %%zmm7 \n lfence \n" \ 354 | "vmovdqa64 0x2f00(%%r11, %%r10), %%zmm8 \n lfence \n" \ 355 | "vmovdqa64 0x15c0(%%r11, %%r10), %%zmm9 \n lfence \n" \ 356 | "vmovdqa64 0xf100(%%r11, %%r10), %%zmm10 \n lfence \n" \ 357 | "vmovdqa64 0x66c0(%%r11, %%r10), %%zmm11 \n lfence \n" \ 358 | "vmovdqa64 0xe240(%%r11, %%r10), %%zmm12 \n lfence \n" \ 359 | "vmovdqa64 0xf480(%%r11, %%r10), %%zmm13 \n lfence \n" \ 360 | "vmovdqa64 0x84c0(%%r11, %%r10), %%zmm14 \n lfence \n" \ 361 | "vmovdqa64 0xe480(%%r11, %%r10), %%zmm15 \n lfence \n" 362 | 363 | #define LD_x16_RAND_AVX512 \ 364 | "vmovdqa64 0xc300(%%r11, %%r10), %%zmm0 \n" \ 365 | "vmovdqa64 0xda00(%%r11, %%r10), %%zmm1 \n" \ 366 | "vmovdqa64 0x1980(%%r11, %%r10), %%zmm2 \n" \ 367 | "vmovdqa64 0xddc0(%%r11, %%r10), %%zmm3 \n" \ 368 | "vmovdqa64 0xaa00(%%r11, %%r10), %%zmm4 \n" \ 369 | "vmovdqa64 0x5540(%%r11, %%r10), %%zmm5 \n" \ 370 | "vmovdqa64 0x6740(%%r11, %%r10), %%zmm6 \n" \ 371 | "vmovdqa64 0x5a80(%%r11, %%r10), %%zmm7 \n" \ 372 | "vmovdqa64 0xa680(%%r11, %%r10), %%zmm8 \n" \ 373 | "vmovdqa64 0xdb00(%%r11, %%r10), %%zmm9 \n" \ 374 | "vmovdqa64 0x3340(%%r11, %%r10), %%zmm10 \n" \ 375 | "vmovdqa64 0x7e40(%%r11, %%r10), %%zmm11 \n" \ 376 | "vmovdqa64 0x3600(%%r11, %%r10), %%zmm12 \n" \ 377 | "vmovdqa64 0x5080(%%r11, %%r10), %%zmm13 \n" \ 378 | "vmovdqa64 0x6e00(%%r11, %%r10), %%zmm14 \n" \ 379 | "vmovdqa64 0x1540(%%r11, %%r10), %%zmm15 \n" 380 | 381 | #define LD_x32_RAND_AVX512 \ 382 | "vmovdqa64 0x7b40(%%r11, %%r10), %%zmm0 \n" \ 383 | "vmovdqa64 0x7640(%%r11, %%r10), %%zmm1 \n" \ 384 | "vmovdqa64 0xdf00(%%r11, %%r10), %%zmm2 \n" \ 385 | "vmovdqa64 0xdb40(%%r11, %%r10), %%zmm3 \n" \ 386 | "vmovdqa64 0xb6c0(%%r11, %%r10), %%zmm4 \n" \ 387 | "vmovdqa64 0x6980(%%r11, %%r10), %%zmm5 \n" \ 388 | "vmovdqa64 0xf280(%%r11, %%r10), %%zmm6 \n" \ 389 | "vmovdqa64 0x3dc0(%%r11, %%r10), %%zmm7 \n" \ 390 | "vmovdqa64 0x6d80(%%r11, %%r10), %%zmm8 \n" \ 391 
| "vmovdqa64 0xf580(%%r11, %%r10), %%zmm9 \n" \ 392 | "vmovdqa64 0xf300(%%r11, %%r10), %%zmm10 \n" \ 393 | "vmovdqa64 0x3140(%%r11, %%r10), %%zmm11 \n" \ 394 | "vmovdqa64 0x8980(%%r11, %%r10), %%zmm12 \n" \ 395 | "vmovdqa64 0xecc0(%%r11, %%r10), %%zmm13 \n" \ 396 | "vmovdqa64 0xc5c0(%%r11, %%r10), %%zmm14 \n" \ 397 | "vmovdqa64 0x1e40(%%r11, %%r10), %%zmm15 \n" \ 398 | "vmovdqa64 0xf3c0(%%r11, %%r10), %%zmm16 \n" \ 399 | "vmovdqa64 0xe800(%%r11, %%r10), %%zmm17 \n" \ 400 | "vmovdqa64 0x2200(%%r11, %%r10), %%zmm18 \n" \ 401 | "vmovdqa64 0x66c0(%%r11, %%r10), %%zmm19 \n" \ 402 | "vmovdqa64 0xc00(%%r11, %%r10), %%zmm20 \n" \ 403 | "vmovdqa64 0x2bc0(%%r11, %%r10), %%zmm21 \n" \ 404 | "vmovdqa64 0x6a80(%%r11, %%r10), %%zmm22 \n" \ 405 | "vmovdqa64 0x94c0(%%r11, %%r10), %%zmm23 \n" \ 406 | "vmovdqa64 0xbec0(%%r11, %%r10), %%zmm24 \n" \ 407 | "vmovdqa64 0xcdc0(%%r11, %%r10), %%zmm25 \n" \ 408 | "vmovdqa64 0xf80(%%r11, %%r10), %%zmm26 \n" \ 409 | "vmovdqa64 0xc000(%%r11, %%r10), %%zmm27 \n" \ 410 | "vmovdqa64 0x4340(%%r11, %%r10), %%zmm28 \n" \ 411 | "vmovdqa64 0x4640(%%r11, %%r10), %%zmm29 \n" \ 412 | "vmovdqa64 0xcc0(%%r11, %%r10), %%zmm30 \n" \ 413 | "vmovdqa64 0x6b40(%%r11, %%r10), %%zmm31 \n" 414 | 415 | #define STWB_x1_RAND_AVX512 \ 416 | "vmovdqa64 %%zmm0, 0xe80(%%r11, %%r10) \n clwb 0xe80(%%r11, %%r10) \n" 417 | 418 | #define STWB_x8_RAND_AVX512 \ 419 | "vmovdqa64 %%zmm0, 0xe80(%%r11, %%r10) \n clwb 0xe80(%%r11, %%r10) \n" \ 420 | "vmovdqa64 %%zmm1, 0xe4c0(%%r11, %%r10) \n clwb 0xe4c0(%%r11, %%r10) \n" \ 421 | "vmovdqa64 %%zmm2, 0x4780(%%r11, %%r10) \n clwb 0x4780(%%r11, %%r10) \n" \ 422 | "vmovdqa64 %%zmm3, 0xc240(%%r11, %%r10) \n clwb 0xc240(%%r11, %%r10) \n" \ 423 | "vmovdqa64 %%zmm4, 0x2e00(%%r11, %%r10) \n clwb 0x2e00(%%r11, %%r10) \n" \ 424 | "vmovdqa64 %%zmm5, 0xf4c0(%%r11, %%r10) \n clwb 0xf4c0(%%r11, %%r10) \n" \ 425 | "vmovdqa64 %%zmm6, 0xe5c0(%%r11, %%r10) \n clwb 0xe5c0(%%r11, %%r10) \n" \ 426 | "vmovdqa64 %%zmm7, 0x7040(%%r11, %%r10) \n clwb 0x7040(%%r11, %%r10) \n" 427 | 428 | //#define STWB_SFENCE_x16_RAND_AVX512 429 | 430 | #define STWB_x16_RAND_AVX512 \ 431 | "vmovdqa64 %%zmm0, 0x28c0(%%r11, %%r10) \n clwb 0x28c0(%%r11, %%r10) \n" \ 432 | "vmovdqa64 %%zmm1, 0xc880(%%r11, %%r10) \n clwb 0xc880(%%r11, %%r10) \n" \ 433 | "vmovdqa64 %%zmm2, 0x3cc0(%%r11, %%r10) \n clwb 0x3cc0(%%r11, %%r10) \n" \ 434 | "vmovdqa64 %%zmm3, 0xdd40(%%r11, %%r10) \n clwb 0xdd40(%%r11, %%r10) \n" \ 435 | "vmovdqa64 %%zmm4, 0x6bc0(%%r11, %%r10) \n clwb 0x6bc0(%%r11, %%r10) \n" \ 436 | "vmovdqa64 %%zmm5, 0xe600(%%r11, %%r10) \n clwb 0xe600(%%r11, %%r10) \n" \ 437 | "vmovdqa64 %%zmm6, 0x1c0(%%r11, %%r10) \n clwb 0x1c0(%%r11, %%r10) \n" \ 438 | "vmovdqa64 %%zmm7, 0xf540(%%r11, %%r10) \n clwb 0xf540(%%r11, %%r10) \n" \ 439 | "vmovdqa64 %%zmm8, 0x11c0(%%r11, %%r10) \n clwb 0x11c0(%%r11, %%r10) \n" \ 440 | "vmovdqa64 %%zmm9, 0xb000(%%r11, %%r10) \n clwb 0xb000(%%r11, %%r10) \n" \ 441 | "vmovdqa64 %%zmm10, 0x3f80(%%r11, %%r10) \n clwb 0x3f80(%%r11, %%r10) \n" \ 442 | "vmovdqa64 %%zmm11, 0x5c40(%%r11, %%r10) \n clwb 0x5c40(%%r11, %%r10) \n" \ 443 | "vmovdqa64 %%zmm12, 0xed00(%%r11, %%r10) \n clwb 0xed00(%%r11, %%r10) \n" \ 444 | "vmovdqa64 %%zmm13, 0xd600(%%r11, %%r10) \n clwb 0xd600(%%r11, %%r10) \n" \ 445 | "vmovdqa64 %%zmm14, 0x4c80(%%r11, %%r10) \n clwb 0x4c80(%%r11, %%r10) \n" \ 446 | "vmovdqa64 %%zmm15, 0xb280(%%r11, %%r10) \n clwb 0xb280(%%r11, %%r10) \n" 447 | 448 | /* temporal store */ 449 | #define STWB_x32_RAND_AVX512 \ 450 | "vmovdqa64 %%zmm0, 0x9c0(%%r11, %%r10) \n clwb 0x9c0(%%r11, %%r10) \n" \ 
451 | "vmovdqa64 %%zmm1, 0x3b40(%%r11, %%r10) \n clwb 0x3b40(%%r11, %%r10) \n" \ 452 | "vmovdqa64 %%zmm2, 0xe540(%%r11, %%r10) \n clwb 0xe540(%%r11, %%r10) \n" \ 453 | "vmovdqa64 %%zmm3, 0xe180(%%r11, %%r10) \n clwb 0xe180(%%r11, %%r10) \n" \ 454 | "vmovdqa64 %%zmm4, 0x2b80(%%r11, %%r10) \n clwb 0x2b80(%%r11, %%r10) \n" \ 455 | "vmovdqa64 %%zmm5, 0xa380(%%r11, %%r10) \n clwb 0xa380(%%r11, %%r10) \n" \ 456 | "vmovdqa64 %%zmm6, 0x9ac0(%%r11, %%r10) \n clwb 0x9ac0(%%r11, %%r10) \n" \ 457 | "vmovdqa64 %%zmm7, 0xd500(%%r11, %%r10) \n clwb 0xd500(%%r11, %%r10) \n" \ 458 | "vmovdqa64 %%zmm8, 0x51c0(%%r11, %%r10) \n clwb 0x51c0(%%r11, %%r10) \n" \ 459 | "vmovdqa64 %%zmm9, 0x99c0(%%r11, %%r10) \n clwb 0x99c0(%%r11, %%r10) \n" \ 460 | "vmovdqa64 %%zmm10, 0xacc0(%%r11, %%r10) \n clwb 0xacc0(%%r11, %%r10) \n" \ 461 | "vmovdqa64 %%zmm11, 0x4900(%%r11, %%r10) \n clwb 0x4900(%%r11, %%r10) \n" \ 462 | "vmovdqa64 %%zmm12, 0x3540(%%r11, %%r10) \n clwb 0x3540(%%r11, %%r10) \n" \ 463 | "vmovdqa64 %%zmm13, 0x8ac0(%%r11, %%r10) \n clwb 0x8ac0(%%r11, %%r10) \n" \ 464 | "vmovdqa64 %%zmm14, 0x2580(%%r11, %%r10) \n clwb 0x2580(%%r11, %%r10) \n" \ 465 | "vmovdqa64 %%zmm15, 0xc5c0(%%r11, %%r10) \n clwb 0xc5c0(%%r11, %%r10) \n" \ 466 | "vmovdqa64 %%zmm16, 0xfd40(%%r11, %%r10) \n clwb 0xfd40(%%r11, %%r10) \n" \ 467 | "vmovdqa64 %%zmm17, 0xac40(%%r11, %%r10) \n clwb 0xac40(%%r11, %%r10) \n" \ 468 | "vmovdqa64 %%zmm18, 0x1240(%%r11, %%r10) \n clwb 0x1240(%%r11, %%r10) \n" \ 469 | "vmovdqa64 %%zmm19, 0xa00(%%r11, %%r10) \n clwb 0xa00(%%r11, %%r10) \n" \ 470 | "vmovdqa64 %%zmm20, 0x53c0(%%r11, %%r10) \n clwb 0x53c0(%%r11, %%r10) \n" \ 471 | "vmovdqa64 %%zmm21, 0xcd00(%%r11, %%r10) \n clwb 0xcd00(%%r11, %%r10) \n" \ 472 | "vmovdqa64 %%zmm22, 0xbac0(%%r11, %%r10) \n clwb 0xbac0(%%r11, %%r10) \n" \ 473 | "vmovdqa64 %%zmm23, 0x2500(%%r11, %%r10) \n clwb 0x2500(%%r11, %%r10) \n" \ 474 | "vmovdqa64 %%zmm24, 0xd300(%%r11, %%r10) \n clwb 0xd300(%%r11, %%r10) \n" \ 475 | "vmovdqa64 %%zmm25, 0xba40(%%r11, %%r10) \n clwb 0xba40(%%r11, %%r10) \n" \ 476 | "vmovdqa64 %%zmm26, 0xf500(%%r11, %%r10) \n clwb 0xf500(%%r11, %%r10) \n" \ 477 | "vmovdqa64 %%zmm27, 0x2080(%%r11, %%r10) \n clwb 0x2080(%%r11, %%r10) \n" \ 478 | "vmovdqa64 %%zmm28, 0xf2c0(%%r11, %%r10) \n clwb 0xf2c0(%%r11, %%r10) \n" \ 479 | "vmovdqa64 %%zmm29, 0xa980(%%r11, %%r10) \n clwb 0xa980(%%r11, %%r10) \n" \ 480 | "vmovdqa64 %%zmm30, 0x8880(%%r11, %%r10) \n clwb 0x8880(%%r11, %%r10) \n" \ 481 | "vmovdqa64 %%zmm31, 0x54c0(%%r11, %%r10) \n clwb 0x54c0(%%r11, %%r10) \n" 482 | 483 | #define SIZESTWB_1024_AVX512 \ 484 | "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ 485 | "clwb 0x0(%%r9, %%r10) \n" \ 486 | "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ 487 | "clwb 0x40(%%r9, %%r10) \n" \ 488 | "vmovdqa64 %%zmm0, 0x80(%%r9, %%r10) \n" \ 489 | "clwb 0x80(%%r9, %%r10) \n" \ 490 | "vmovdqa64 %%zmm0, 0xc0(%%r9, %%r10) \n" \ 491 | "clwb 0xc0(%%r9, %%r10) \n" \ 492 | "vmovdqa64 %%zmm0, 0x100(%%r9, %%r10) \n" \ 493 | "clwb 0x100(%%r9, %%r10) \n" \ 494 | "vmovdqa64 %%zmm0, 0x140(%%r9, %%r10) \n" \ 495 | "clwb 0x140(%%r9, %%r10) \n" \ 496 | "vmovdqa64 %%zmm0, 0x180(%%r9, %%r10) \n" \ 497 | "clwb 0x180(%%r9, %%r10) \n" \ 498 | "vmovdqa64 %%zmm0, 0x1c0(%%r9, %%r10) \n" \ 499 | "clwb 0x1c0(%%r9, %%r10) \n" \ 500 | "vmovdqa64 %%zmm0, 0x200(%%r9, %%r10) \n" \ 501 | "clwb 0x200(%%r9, %%r10) \n" \ 502 | "vmovdqa64 %%zmm0, 0x240(%%r9, %%r10) \n" \ 503 | "clwb 0x240(%%r9, %%r10) \n" \ 504 | "vmovdqa64 %%zmm0, 0x280(%%r9, %%r10) \n" \ 505 | "clwb 0x280(%%r9, %%r10) \n" \ 506 | "vmovdqa64 %%zmm0, 
0x2c0(%%r9, %%r10) \n" \ 507 | "clwb 0x2c0(%%r9, %%r10) \n" \ 508 | "vmovdqa64 %%zmm0, 0x300(%%r9, %%r10) \n" \ 509 | "clwb 0x300(%%r9, %%r10) \n" \ 510 | "vmovdqa64 %%zmm0, 0x340(%%r9, %%r10) \n" \ 511 | "clwb 0x340(%%r9, %%r10) \n" \ 512 | "vmovdqa64 %%zmm0, 0x380(%%r9, %%r10) \n" \ 513 | "clwb 0x380(%%r9, %%r10) \n" \ 514 | "vmovdqa64 %%zmm0, 0x3c0(%%r9, %%r10) \n" \ 515 | "clwb 0x3c0(%%r9, %%r10) \n" \ 516 | "add $0x400, %%r10 \n" 517 | 518 | #define SIZEST_1024_AVX512 \ 519 | "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ 520 | "vmovdqa64 %%zmm1, 0x40(%%r9, %%r10) \n" \ 521 | "vmovdqa64 %%zmm2, 0x80(%%r9, %%r10) \n" \ 522 | "vmovdqa64 %%zmm3, 0xc0(%%r9, %%r10) \n" \ 523 | "vmovdqa64 %%zmm4, 0x100(%%r9, %%r10) \n" \ 524 | "vmovdqa64 %%zmm5, 0x140(%%r9, %%r10) \n" \ 525 | "vmovdqa64 %%zmm6, 0x180(%%r9, %%r10) \n" \ 526 | "vmovdqa64 %%zmm7, 0x1c0(%%r9, %%r10) \n" \ 527 | "vmovdqa64 %%zmm8, 0x200(%%r9, %%r10) \n" \ 528 | "vmovdqa64 %%zmm9, 0x240(%%r9, %%r10) \n" \ 529 | "vmovdqa64 %%zmm10, 0x280(%%r9, %%r10) \n" \ 530 | "vmovdqa64 %%zmm11, 0x2c0(%%r9, %%r10) \n" \ 531 | "vmovdqa64 %%zmm12, 0x300(%%r9, %%r10) \n" \ 532 | "vmovdqa64 %%zmm13, 0x340(%%r9, %%r10) \n" \ 533 | "vmovdqa64 %%zmm14, 0x380(%%r9, %%r10) \n" \ 534 | "vmovdqa64 %%zmm15, 0x3c0(%%r9, %%r10) \n" \ 535 | "add $0x400, %%r10 \n" 536 | 537 | /* perform movdir64B */ 538 | #define SIZEMOV_1024 \ 539 | "movdir64b 0x0(%%r9, %%r10), %%r12 \n" \ 540 | "add $0x40, %%r12 \n" \ 541 | "movdir64b 0x40(%%r9, %%r10), %%r12\n" \ 542 | "add $0x40, %%r12 \n" \ 543 | "movdir64b 0x80(%%r9, %%r10), %%r12\n" \ 544 | "add $0x40, %%r12 \n" \ 545 | "movdir64b 0xc0(%%r9, %%r10), %%r12\n" \ 546 | "add $0x40, %%r12 \n" \ 547 | "movdir64b 0x100(%%r9, %%r10), %%r12\n" \ 548 | "add $0x40, %%r12 \n" \ 549 | "movdir64b 0x140(%%r9, %%r10), %%r12\n" \ 550 | "add $0x40, %%r12 \n" \ 551 | "movdir64b 0x180(%%r9, %%r10), %%r12\n" \ 552 | "add $0x40, %%r12 \n" \ 553 | "movdir64b 0x1c0(%%r9, %%r10), %%r12\n" \ 554 | "add $0x40, %%r12 \n" \ 555 | "movdir64b 0x200(%%r9, %%r10), %%r12\n" \ 556 | "add $0x40, %%r12 \n" \ 557 | "movdir64b 0x240(%%r9, %%r10), %%r12\n" \ 558 | "add $0x40, %%r12 \n" \ 559 | "movdir64b 0x280(%%r9, %%r10), %%r12\n" \ 560 | "add $0x40, %%r12 \n" \ 561 | "movdir64b 0x2c0(%%r9, %%r10), %%r12\n" \ 562 | "add $0x40, %%r12 \n" \ 563 | "movdir64b 0x300(%%r9, %%r10), %%r12\n" \ 564 | "add $0x40, %%r12 \n" \ 565 | "movdir64b 0x340(%%r9, %%r10), %%r12\n" \ 566 | "add $0x40, %%r12 \n" \ 567 | "movdir64b 0x380(%%r9, %%r10), %%r12\n" \ 568 | "add $0x40, %%r12 \n" \ 569 | "movdir64b 0x3c0(%%r9, %%r10), %%r12\n" \ 570 | "add $0x40, %%r12 \n" \ 571 | "add $0x400, %%r10 \n" \ 572 | 573 | /* Mixed read and write */ 574 | /* try using the same dest reg. Assign some value to zmm0 for storing. 
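   (e.g. SIZE_R2W1_384 below loads 4 x 64B into zmm0-zmm3 and stores zmm0
    back to two of those lines with clwb, advancing %r10 by 0x180 = 384
    bytes per step; hence the 384-byte BW_granu warning in bw_wrapper for
    the R2W1 mixed test)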
*/ 575 | #define SIZE_R1W1_512 \ 576 | "vmovdqa64 0x0(%%r9, %%r10), %%zmm0 \n" \ 577 | "vmovdqa64 0x40(%%r9, %%r10), %%zmm1 \n" \ 578 | "vmovdqa64 0x80(%%r9, %%r10), %%zmm2 \n" \ 579 | "vmovdqa64 0xc0(%%r9, %%r10), %%zmm3 \n" \ 580 | "" \ 581 | "vmovq %1, %%xmm0 \n" \ 582 | "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ 583 | "clwb 0x0(%%r9, %%r10) \n" \ 584 | "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ 585 | "clwb 0x40(%%r9, %%r10) \n" \ 586 | "vmovdqa64 %%zmm0, 0x80(%%r9, %%r10) \n" \ 587 | "clwb 0x80(%%r9, %%r10) \n" \ 588 | "vmovdqa64 %%zmm0, 0xc0(%%r9, %%r10) \n" \ 589 | "clwb 0xc0(%%r9, %%r10) \n" \ 590 | "add $0x200, %%r10 \n" \ 591 | 592 | 593 | #define SIZE_R2W1_576 \ 594 | "vmovdqa64 0x0(%%r9, %%r10), %%zmm0 \n" \ 595 | "vmovdqa64 0x40(%%r9, %%r10), %%zmm0 \n" \ 596 | "vmovdqa64 0x80(%%r9, %%r10), %%zmm0 \n" \ 597 | "vmovdqa64 0xc0(%%r9, %%r10), %%zmm0 \n" \ 598 | "vmovdqa64 0x100(%%r9, %%r10), %%zmm0 \n" \ 599 | "vmovdqa64 0x140(%%r9, %%r10), %%zmm0 \n" \ 600 | "vmovq %1, %%xmm1 \n" \ 601 | "vmovdqa64 %%zmm1, 0x0(%%r9, %%r10) \n" \ 602 | "clwb 0x0(%%r9, %%r10) \n" \ 603 | "vmovdqa64 %%zmm1, 0x40(%%r9, %%r10) \n" \ 604 | "clwb 0x40(%%r9, %%r10) \n" \ 605 | "vmovdqa64 %%zmm1, 0x80(%%r9, %%r10) \n" \ 606 | "clwb 0x80(%%r9, %%r10) \n" \ 607 | "add $0x180, %%r10 \n" \ 608 | 609 | 610 | #define SIZE_R2W1_384 \ 611 | "vmovdqa64 0x0(%%r9, %%r10), %%zmm0 \n" \ 612 | "vmovdqa64 0x40(%%r9, %%r10), %%zmm1 \n" \ 613 | "vmovdqa64 0x80(%%r9, %%r10), %%zmm2 \n" \ 614 | "vmovdqa64 0xc0(%%r9, %%r10), %%zmm3 \n" \ 615 | "vmovq %1, %%xmm0 \n" \ 616 | "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ 617 | "clwb 0x0(%%r9, %%r10) \n" \ 618 | "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ 619 | "clwb 0x40(%%r9, %%r10) \n" \ 620 | "add $0x180, %%r10 \n" \ 621 | 622 | 623 | #define SIZE_R3W1_512 \ 624 | "vmovdqa64 0x0(%%r9, %%r10), %%zmm0 \n" \ 625 | "vmovdqa64 0x40(%%r9, %%r10), %%zmm1 \n" \ 626 | "vmovdqa64 0x80(%%r9, %%r10), %%zmm2 \n" \ 627 | "vmovdqa64 0xc0(%%r9, %%r10), %%zmm3 \n" \ 628 | "vmovdqa64 0x100(%%r9, %%r10), %%zmm4 \n" \ 629 | "vmovdqa64 0x140(%%r9, %%r10), %%zmm5 \n" \ 630 | "" \ 631 | "vmovq %1, %%xmm0 \n" \ 632 | "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ 633 | "clwb 0x0(%%r9, %%r10) \n" \ 634 | "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ 635 | "clwb 0x40(%%r9, %%r10) \n" \ 636 | "add $0x200, %%r10 \n" \ 637 | 638 | /* snippets for latency measuring */ 639 | 640 | /* Assembly instructions utilize the following registers: 641 | * rsi: memory address 642 | * rax, rdx, rcx, r8d and r9d: timing 643 | * rdx: populating cache-lines 644 | * ymm0: streaming instructions 645 | */ 646 | #define REGISTERS "rsi", "rax", "rdx", "rcx", "r8", "r9", "ymm0" 647 | 648 | #define REGISTERS_AND_ZMM "rsi", "rax", "rdx", "rcx", "r8", "r9", \ 649 | 650 | #define ZMM_0_15 "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15" 651 | 652 | 653 | /* rdtscp: reads current timestamp to EDX:EAX and also sets ECX 654 | * higher 32-bits of RAX, RDX and RCX are cleared 655 | * 656 | * r9d = old EDX 657 | * r8d = old EAX 658 | * Here is what we do to compute t_start and t_end: 659 | * - RDX holds t_end 660 | * - RAX holds t_start 661 | */ 662 | 663 | /** Douglas: read this blog for more info about timing 664 | * http://sites.utexas.edu/jdm4372/2018/07/23/comments-on-timing-short-code-sections-on-intel-processors/ 665 | */ 666 | #define TIMING_BEGIN "rdtscp \n" \ 667 | "lfence \n" \ 668 | "mov %%edx, %%r9d \n" \ 669 | "mov %%eax, %%r8d \n" 670 | 671 | #define 
TIMING_END "mfence \n" \ 672 | "rdtscp \n" \ 673 | "lfence \n" \ 674 | "shl $32, %%rdx \n" \ 675 | "or %%rax, %%rdx \n" \ 676 | "mov %%r9d, %%eax \n" \ 677 | "shl $32, %%rax \n" \ 678 | "or %%r8, %%rax \n" \ 679 | "mov %%rax, %[t_start] \n" \ 680 | "mov %%rdx, %[t_end] \n" 681 | 682 | #define FLUSH_64K_BLOCK \ 683 | "LOOP_64K_BLOCK_FLUSH: \n" \ 684 | "clflush (%%r11, %%r10) \n" \ 685 | "add $0x40, %%r10 \n" \ 686 | "cmp $0x10000, %%r10 \n" \ 687 | "jl LOOP_64K_BLOCK_FLUSH\n" \ 688 | "xor %%r10, %%r10 \n" \ 689 | "mfence \n" 690 | 691 | 692 | #define FLUSH_CACHE_LINE "clflush 0*32(%%rsi) \n" \ 693 | "clflush 2*32(%%rsi) \n" \ 694 | "clflush 4*32(%%rsi) \n" \ 695 | "clflush 6*32(%%rsi) \n" \ 696 | "mfence \n" 697 | 698 | 699 | #define CLEAR_PIPELINE "nop \nnop \nnop \nnop \nnop \nnop \n" \ 700 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 701 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 702 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 703 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 704 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 705 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 706 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 707 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 708 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 709 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 710 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 711 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 712 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 713 | "nop \nnop \nnop \nnop \nnop \nnop \n" \ 714 | "nop \nnop \nnop \nnop \nnop \nnop \n" 715 | 716 | #define CLEAR_PIPELINE_x16 CLEAR_PIPELINE \ 717 | CLEAR_PIPELINE \ 718 | CLEAR_PIPELINE \ 719 | CLEAR_PIPELINE \ 720 | CLEAR_PIPELINE \ 721 | CLEAR_PIPELINE \ 722 | CLEAR_PIPELINE \ 723 | CLEAR_PIPELINE \ 724 | CLEAR_PIPELINE \ 725 | CLEAR_PIPELINE \ 726 | CLEAR_PIPELINE \ 727 | CLEAR_PIPELINE \ 728 | CLEAR_PIPELINE \ 729 | CLEAR_PIPELINE \ 730 | CLEAR_PIPELINE \ 731 | CLEAR_PIPELINE 732 | 733 | #endif // WORKLOAD_H 734 | --------------------------------------------------------------------------------