├── README.md ├── cache.bandwidth.cpp ├── cache.conflicts.cpp ├── cache.size.cpp ├── cacheline.cpp ├── cacheline.race.cpp ├── false.sharing.cpp ├── matrix.travel.cpp ├── object.member.cpp └── test.sh /README.md: -------------------------------------------------------------------------------- 1 | # CPU Cache 2 | 3 | This repo contains test cases that demonstrate CPU cache effects. 4 | 5 | The original article is "[The CPU Cache Knowledge All Programmers Need to Know](https://coolshell.cn/articles/20793.html)" (Chinese edition). 6 | 7 | All of the test cases are written in C++11. 8 | 9 | `test.sh` is the script that runs all of the test cases. 10 | -------------------------------------------------------------------------------- /cache.bandwidth.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | using namespace std; 6 | using namespace chrono; 7 | 8 | const int repeat_times = 1000; 9 | 10 | 11 | void test_memory(vector& memory) 12 | { 13 | using Clock = std::chrono::steady_clock; 14 | 15 | size_t size = memory.size(); 16 | 17 | int value = rand(); 18 | auto start = Clock::now(); 19 | 20 | for (int i = 0; i < repeat_times; i++) { 21 | for (int j = 0; j < size; j++) { 22 | memory[j] = value; 23 | } 24 | } 25 | 26 | auto time = duration_cast(Clock::now() - start).count(); 27 | auto timePerIter = time / (double) repeat_times; 28 | auto bytes = size * sizeof(int); 29 | 30 | cout << bytes / timePerIter << endl; // MiB/s 31 | } 32 | 33 | int main(int argc, char** argv) 34 | { 35 | long long size = 0; 36 | if (argc > 1){ 37 | size = atoi(argv[1]); 38 | } 39 | size_t count = size / sizeof(int); 40 | vector data(count, 0); 41 | 42 | test_memory(data); 43 | 44 | return 0; 45 | } -------------------------------------------------------------------------------- /cache.conflicts.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 
#include 5 | using namespace std; 6 | using namespace chrono; 7 | 8 | const int repeat_times = 10000000; 9 | 10 | void test_memory(vector& memory, int increment) 11 | { 12 | using Clock = std::chrono::steady_clock; 13 | 14 | auto size = memory.size(); 15 | auto start = Clock::now(); 16 | 17 | for (int i = 0; i < repeat_times; i++) { 18 | for (int j = 0; j < size; j += increment) { 19 | memory[j] += j; 20 | } 21 | } 22 | 23 | cout << duration_cast(Clock::now() - start).count() << endl; 24 | } 25 | 26 | int main(int argc, char** argv) 27 | { 28 | 29 | auto count = static_cast(std::stoi(argv[1])); 30 | auto increment = static_cast(std::stoi(argv[2])) / sizeof(int); 31 | 32 | vector memory(count * increment); 33 | test_memory(memory, increment); 34 | 35 | return 0; 36 | } -------------------------------------------------------------------------------- /cache.size.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | using namespace std; 7 | using namespace std::chrono; 8 | 9 | 10 | #define LEN ( 64 * 1024 * 1024) 11 | 12 | // http://igoro.com/archive/gallery-of-processor-cache-effects/ 13 | 14 | void test(int* arr, int length, int steps) 15 | { 16 | // Get starting timepoint 17 | length--; 18 | auto start = high_resolution_clock::now(); 19 | 20 | // Loop the memory 21 | for (int i = 0; i < steps; i++) { 22 | arr[(i * 8) & length]++; 23 | } 24 | 25 | // Get ending timepoint 26 | auto stop = high_resolution_clock::now(); 27 | 28 | // Get duration. Subtract timepoints to 29 | // get the duration. 
To cast it to proper unit 30 | // use duration cast method 31 | auto duration = duration_cast(stop - start); 32 | 33 | cout << std::setw(12) << length + 1 << " : " << duration.count() << endl; 34 | } 35 | 36 | int main(int argc, char** argv) 37 | { 38 | int length = atoi(argv[1]); 39 | const int steps = 64 * 1024 * 1024; 40 | int* arr = new int[length]; 41 | test(arr, length, steps); 42 | delete [] arr; 43 | 44 | return 0; 45 | } 46 | -------------------------------------------------------------------------------- /cacheline.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | using namespace std; 7 | using namespace std::chrono; 8 | 9 | 10 | #define LEN ( 64 * 1024 * 1024) 11 | 12 | //http://igoro.com/archive/gallery-of-processor-cache-effects/ 13 | 14 | void test(int* arr, int step) 15 | { 16 | srand((unsigned int)time(NULL)); 17 | 18 | // Get starting timepoint 19 | auto start = high_resolution_clock::now(); 20 | 21 | // Loop the memory 22 | for (int i = 0; i < LEN; i += step) arr[i] *= i; 23 | 24 | // Get ending timepoint 25 | auto stop = high_resolution_clock::now(); 26 | 27 | // Get duration. Subtract timepoints to 28 | // get the duration. 
To cast it to proper unit 29 | // use duration cast method 30 | auto duration = duration_cast(stop - start); 31 | 32 | cout << std::setw(6) << step << " : " << duration.count() << endl; 33 | } 34 | 35 | int main(int argc, char** argv) 36 | { 37 | 38 | int step = atoi(argv[1]); 39 | 40 | int* arr = new int[LEN]; 41 | test(arr, step); 42 | delete [] arr; 43 | 44 | return 0; 45 | } 46 | -------------------------------------------------------------------------------- /cacheline.race.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace chrono; 8 | 9 | const int CACHE_LINE_SIZE = 64; 10 | const int SIZE = CACHE_LINE_SIZE / sizeof(int); 11 | const int COUNT = (10*1024*1024); 12 | 13 | int main(int argc, char** argv) 14 | { 15 | bool race = true; 16 | if (argc>1 ) race = false; 17 | 18 | srand((unsigned int)time(NULL)); 19 | 20 | int* p = new int [2*SIZE]; 21 | 22 | int *p1 = &p[0]; 23 | int *p2 = race ? 
&p[1] : &p[SIZE]; 24 | 25 | auto proc = [](int* data) { 26 | for(int i = 0; i < COUNT; ++i) 27 | *data += rand(); 28 | }; 29 | 30 | auto start_time = high_resolution_clock::now(); 31 | 32 | std::thread t1(proc, p1); 33 | std::thread t2(proc, p2); 34 | 35 | t1.join(); 36 | t2.join(); 37 | auto end_time = high_resolution_clock::now(); 38 | 39 | cout << "Duration: " << duration_cast(end_time - start_time).count() / 1000.f << " ms" << endl; 40 | 41 | return 0; 42 | } 43 | -------------------------------------------------------------------------------- /false.sharing.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | using namespace std; 9 | using namespace std::chrono; 10 | 11 | const int total_size = 16*1024*1024; 12 | int* test_data = NULL; 13 | const int max_threads = 128; 14 | int result[max_threads]; 15 | int nthread = 4; 16 | int total_count = 0; 17 | int chunk_size = 0; 18 | 19 | int sum(int len) { 20 | int r = 0; 21 | for (int i=0; i func ) { 35 | auto start_time = high_resolution_clock::now(); 36 | vector threads; 37 | for (int i=0; i(end_time - start_time).count(); 54 | } 55 | 56 | int main(int argc, char** argv) { 57 | 58 | if (argc > 1) { 59 | nthread = min( atoi(argv[1]), max_threads); 60 | } 61 | srand((unsigned int)time(NULL)); 62 | test_data = (int *)malloc(total_size * sizeof(int)); 63 | for (int i=0; i 2) { 94 | time = thread_test(false_sharing_proc); 95 | }else{ 96 | time = thread_test(non_false_sharing_proc); 97 | } 98 | 99 | cout << time << endl; 100 | 101 | return 0; 102 | 103 | } -------------------------------------------------------------------------------- /matrix.travel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | using namespace std; 7 | using namespace std::chrono; 8 | 9 | int main() 10 | { 11 | 
srand((unsigned int)time(NULL)); 12 | 13 | const int row = 1024; 14 | const int col = 512; 15 | 16 | int matrix[row][col]; 17 | for(int r = 0; r < row; ++r) { 18 | for(int c=0; c(stop - start); 33 | cout << "row travel : " << sum_row << " : " << duration.count() / 1000.f << "ms" << endl; 34 | 35 | // column traversal 36 | int sum_col = 0; 37 | start = high_resolution_clock::now(); 38 | for(int c=0; c(stop - start); 45 | cout << "col travel : " << sum_col << " : " << duration.count() / 1000.f << "ms" << endl; 46 | 47 | return 0; 48 | } -------------------------------------------------------------------------------- /object.member.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | using namespace std; 9 | using namespace std::chrono; 10 | 11 | const int repeat_num = 4*1024*1024; 12 | const int obj_num = 32*1024; 13 | const int pad_size = 1024; 14 | 15 | class BadObject { 16 | public: 17 | bool isLive; 18 | int padding[pad_size]; // make the object larger than 64 bytes 19 | }; 20 | 21 | class GoodObject { 22 | public: 23 | int padding[pad_size]; 24 | }; 25 | 26 | 27 | int main() 28 | { 29 | BadObject *bad = new BadObject[obj_num]; 30 | GoodObject *good = new GoodObject[obj_num]; 31 | bool *isLive = new bool [obj_num]; 32 | 33 | // initialize the objects 34 | srand((unsigned int)time(NULL)); 35 | for(int i=0; i(stop - start); 59 | cout << "Bad Objects : " << duration.count() << endl; 60 | 61 | 62 | start = high_resolution_clock::now(); 63 | 64 | // Loop the memory 65 | for (int i = 0; i < repeat_num; i++) { 66 | int idx = i % obj_num; 67 | if (isLive[idx]) { 68 | for (int j=0; j(stop - start); 76 | cout << "Good Objects : " << duration.count() << endl; 77 | 78 | delete [] bad; 79 | delete [] good; 80 | delete [] isLive; 81 | 82 | } 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- 
/test.sh: 
#!/bin/bash
#
# Interactive driver for the CPU-cache experiments. Each test function
# compiles one of the .cpp programs that sit next to this script, runs it
# with a series of arguments, prints what the program reports and removes
# the binary again.

set -e

# Resolve the physical directory that holds this script so the sources are
# found no matter where the script is started from.
pushd "$(dirname "$0")" > /dev/null
SCRIPT_PATH=$(pwd -P)
popd > /dev/null
SCRIPT_FILE=$(basename "$0")

COLOR_NONE='\033[0m'
COLOR_INFO='\033[0;36m'
COLOR_ERROR='\033[1;31m'

# compile SOURCE TARGET [LEVEL]
#   Build SOURCE into TARGET with g++ in C++11 mode.
#   LEVEL selects one extra flag: 1 -> -g, 2 -> -O2, 3 -> -march=native.
#   Any other value, or no value, adds nothing.
function compile(){
    CC=g++
    # -pthread: cacheline.race.cpp and false.sharing.cpp start std::thread
    # workers. On toolchains where the thread library is separate from libc
    # a build without this flag fails to link or aborts when the first
    # thread is created.
    OPT="-std=c++11 -pthread"
    case "${3:-}" in
        1) OPT="${OPT} -g";;
        2) OPT="${OPT} -O2";;
        3) OPT="${OPT} -march=native";;
        *) ;;
    esac
    echo "compile program with \"${OPT}\""
    # OPT stays unquoted on purpose: it carries several flags that must be
    # split into separate words. The paths are quoted so that a checkout
    # directory containing spaces still works.
    "${CC}" ${OPT} -o "$2" "$1"
}

# compile_name NAME [LEVEL]
#   Compile ${SCRIPT_PATH}/NAME.cpp into ${SCRIPT_PATH}/NAME.
#   Leaves NAME, SOURCE and TARGET set as globals; the test functions run
#   and delete ${TARGET} afterwards.
function compile_name() {
    NAME="${1}"
    SOURCE="${SCRIPT_PATH}/${NAME}.cpp"
    TARGET="${SCRIPT_PATH}/${NAME}"
    compile "${SOURCE}" "${TARGET}" "${2:-}"
}

# Run the cacheline program 20 times, doubling the step argument each time
# (1, 2, 4, ...).
function cache_line_fetch() {
    NAME="cacheline"
    compile_name "${NAME}" 2

    step=1
    echo " steps : microseconds"
    for i in {1..20};do
        "${TARGET}" "${step}"
        step=$((step * 2))
    done
    rm -f "${TARGET}"
}

# Run cacheline.race twice: once without arguments and once with a single
# argument, which the program uses to switch off the "race" layout.
function cache_line_race() {
    NAME="cacheline.race"
    compile_name "${NAME}" 2

    echo "cache line race "
    "${TARGET}"
    echo "no cache line race "
    "${TARGET}" false
    rm -f "${TARGET}"
}

# Run cache.size 20 times, doubling the length argument from 256 upwards.
# NOTE(review): the header says "steps" although the value passed (and
# echoed back by the program) is the array length -- confirm the label.
function cache_size() {
    NAME="cache.size"
    compile_name "${NAME}" 2

    size=256
    echo " steps : microseconds"
    for i in {1..20};do
        "${TARGET}" "${size}"
        size=$((size * 2))
    done
    rm -f "${TARGET}"
}

# Build and run the matrix traversal comparison once.
function matrix_travel() {
    NAME="matrix.travel"
    compile_name "${NAME}" 2
    "${TARGET}"
    rm -f "${TARGET}"
}

# For 1..32 threads run false.sharing twice -- once with only the thread
# count and once with an extra dummy argument -- and print both timings
# side by side.
# NOTE(review): this is the only test compiled without an optimisation
# level; presumably deliberate, confirm before changing.
function false_sharing() {
    NAME="false.sharing"
    compile_name "${NAME}"

    thread=1
    echo "threads : scalable : non-scalable "
    for i in {1..32};do
        t1=$("${TARGET}" "${thread}")
        t2=$("${TARGET}" "${thread}" xx)
        printf " %3d : %5d : %5d\n" "${thread}" "${t1}" "${t2}"
        thread=$((thread + 1))
    done
    thread=1

    rm -f "${TARGET}"
}

# Build and run the object-layout comparison once.
function object_member() {
    NAME="object.member"
    compile_name "${NAME}" 2
    "${TARGET}"
    rm -f "${TARGET}"
}

# Run cache.bandwidth once per buffer size (in bytes, 1 KiB .. 96 MiB) and
# print the size next to the figure the program reports.
function cache_hierarchy_bandwidth() {
    NAME="cache.bandwidth"
    compile_name "${NAME}" 2

    # An array keeps every size a separate word without relying on word
    # splitting of one long string.
    local sizes=(
        1024 2048 4096 8192 16384 32768 49152 65536 98304
        131072 196608 262144 524288 786432 1048576 1572864
        2097152 3145728 4194304 6291456 8388608 12582912 16777216
        25165824 33554432 50331648 67108864 100663296
    )
    local bytes result
    for bytes in "${sizes[@]}"; do
        result=$("${TARGET}" "${bytes}")
        printf "%10d : %.2f\n" "${bytes}" "${result}"
    done

    rm -f "${TARGET}"
}

# Print a table of cache.conflicts timings: one row per element count
# (1..18), one column per increment (4, 64, 2048 and 4096).
function cache_conflicts() {
    NAME="cache.conflicts"
    compile_name "${NAME}" 2
    echo "| count | 4 | 64 | 2048 | 4096 |"
    echo "------------------------------------------"
    local count inc elapsed
    for count in {1..18}; do
        printf "| %5d | " "${count}"
        for inc in 4 64 2048 4096; do
            elapsed=$("${TARGET}" "${count}" "${inc}")
            printf "%6d |" "${elapsed}"
        done
        printf "\n"
    done

    rm -f "${TARGET}"
}

# Menu labels; select_test matches the chosen entry against these.
TEST1="Cache Line Fetch"
TEST2="Cache Line Race"
TEST3="Cache Size"
TEST4="Cache Hierarchy Bandwidth"
TEST5="Matrix Travel"
TEST6="False Sharing Thread"
TEST7="Object Member"
TEST8="Cache Conflicts"


# Show the menu in a loop and run the chosen test. After a test finishes
# the menu is printed again; "Quit" exits the script with status 0.
function select_test()
{
    while true; do
        echo " "
        PS3='Please enter your choice: '
        options=("${TEST1}" "${TEST2}" "${TEST3}" "${TEST4}" "${TEST5}" "${TEST6}" "${TEST7}" "${TEST8}" "Quit")
        # A narrow COLUMNS value makes `select` list one entry per line.
        COLUMNS=12
        select opt in "${options[@]}"
        do
            case "${opt}" in
                "${TEST1}")
                    cache_line_fetch; break
                    ;;
                "${TEST2}")
                    cache_line_race; break
                    ;;
                "${TEST3}")
                    cache_size; break
                    ;;
                "${TEST4}")
                    cache_hierarchy_bandwidth; break
                    ;;
                "${TEST5}")
                    matrix_travel; break
                    ;;
                "${TEST6}")
                    false_sharing; break
                    ;;
                "${TEST7}")
                    object_member; break
                    ;;
                "${TEST8}")
                    cache_conflicts; break
                    ;;
                "Quit")
                    exit 0
                    ;;
                *) echo "invalid option $REPLY";;
            esac
        done
    done
}

select_test