├── Makefile
├── README.md
├── c2c_example_report.out
├── false_sharing.exe
├── false_sharing_example.c
├── perf-c2c-usage.out
└── tugtest.c

/Makefile:
--------------------------------------------------------------------------------
1 | CC = gcc
2 | LDLIBS = -lnuma -lpthread
3 | binary = false_sharing.exe
4 | source = false_sharing_example.c
5 | .PHONY : clean
6 | 
7 | $(binary) : $(source)
8 | $(CC) -o $@ $< $(LDLIBS)
9 | clean :
10 | -rm $(binary) $(objects)
11 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # c2c_blog
2 | # The false_sharing_example.c file is used with
3 | # my blog to demonstrate the proposed c2c addition
4 | # to the linux perf tool.
5 | 
--------------------------------------------------------------------------------
/c2c_example_report.out:
--------------------------------------------------------------------------------
1 | 1 =================================================
2 | 2 Trace Event Information
3 | 3 =================================================
4 | 4 Total records : 329219
5 | 5 Locked Load/Store Operations : 14654
6 | 6 Load Operations : 69679
7 | 7 Loads - uncacheable : 0
8 | 8 Loads - IO : 0
9 | 9 Loads - Miss : 3972
10 | 10 Loads - no mapping : 0
11 | 11 Load Fill Buffer Hit : 11958
12 | 12 Load L1D hit : 17235
13 | 13 Load L2D hit : 21
14 | 14 Load LLC hit : 14219
15 | 15 Load Local HITM : 3402
16 | 16 Load Remote HITM : 12757
17 | 17 Load Remote HIT : 5295
18 | 18 Load Local DRAM : 976
19 | 19 Load Remote DRAM : 3246
20 | 20 Load MESI State Exclusive : 4222
21 | 21 Load MESI State Shared : 0
22 | 22 Load LLC Misses : 22274
23 | 23 LLC Misses to Local DRAM : 4.4%
24 | 24 LLC Misses to Remote DRAM : 14.6%
25 | 25 LLC Misses to Remote cache (HIT) : 23.8%
26 | 26 LLC Misses to Remote cache (HITM) : 57.3%
27 | 27 Store Operations : 259539
28 | 28 Store - uncacheable : 0
29 | 29 Store - no mapping : 11
30 | 30 Store L1D Hit : 256696
31 | 31 Store L1D Miss : 2832
32 | 32 No Page Map Rejects : 2376
33 | 33 Unable to parse data source : 1
34 | 
35 | 34 =================================================
36 | 35 Global Shared Cache Line Event Information
37 | 36 =================================================
38 | 37 Total Shared Cache Lines : 55
39 | 38 Load HITs on shared lines : 55454
40 | 39 Fill Buffer Hits on shared lines : 10635
41 | 40 L1D hits on shared lines : 16415
42 | 41 L2D hits on shared lines : 0
43 | 42 LLC hits on shared lines : 8501
44 | 43 Locked Access on shared lines : 14351
45 | 44 Store HITs on shared lines : 109953
46 | 45 Store L1D hits on shared lines : 109449
47 | 46 Total Merged records : 126112
48 | 
49 | 47 =================================================
50 | 48 c2c details
51 | 49 =================================================
52 | 50 Events : cpu/mem-loads,ldlat=30/P
53 | 51 : cpu/mem-stores/P
54 | 52 Cachelines sort on : Remote HITMs
55 | 53 Cacheline data groupping : offset,pid,iaddr
56 | 
57 | 54 =================================================
58 | 55 Shared Data Cache Line Table
59 | 56 =================================================
60 | 57 #
61 | 58 # Total Rmt ----- LLC Load Hitm ----- ---- Store Reference ---- --- Load Dram ---- LLC Total ----- Core Load Hit ----- -- LLC Load Hit --
62 | 59 # Index Cacheline records Hitm Total Lcl Rmt Total L1Hit L1Miss Lcl Rmt Ld Miss Loads FB L1 L2 Llc Rmt
63 | 60 # ..... .................. ....... ....... ....... ....... ....... ....... ....... ....... ........ ........ ....... ....... ....... ....... ....... ........ ........
64 | 61 #
65 | 62 0 0x602180 149904 77.09% 12103 2269 9834 109504 109036 468 727 2657 13747 40400 5355 16154 0 2875 529
66 | 63 1 0x602100 12128 22.20% 3951 1119 2832 0 0 0 65 200 3749 12128 5096 108 0 2056 652
67 | 64 2 0xffff883ffb6a7e80 260 0.09% 15 3 12 161 161 0 1 1 15 99 25 50 0 6 1
68 | 65 3 0xffffffff81aec000 157 0.07% 9 0 9 1 0 1 0 7 20 156 50 59 0 27 4
69 | 66 4 0xffffffff81e3f540 179 0.06% 9 1 8 117 97 20 0 10 25 62 11 1 0 24 7
70 | 
71 | 67 =================================================
72 | 68 Shared Cache Line Distribution Pareto
73 | 69 =================================================
74 | 70 #
75 | 71 # ----- HITM ----- -- Store Refs -- Data address ---------- cycles ---------- cpu Shared
76 | 72 # Num Rmt Lcl L1 Hit L1 Miss Offset Pid Code address rmt hitm lcl hitm load cnt Symbol Object Source:Line Node{cpu list}
77 | 73 # ..... ....... ....... ....... ....... .................. ....... .................. ........ ........ ........ ........ ................... .................... ........................... ....
78 | 74 #
79 | 75 -------------------------------------------------------------
80 | 76 0 9834 2269 109036 468 0x602180
81 | 77 -------------------------------------------------------------
82 | 78 65.51% 55.88% 75.20% 0.00% 0x0 14604 0x400b4f 27161 26039 26017 9 [.] read_write_func no_false_sharing.exe false_sharing_example.c:144 0{0-1,4} 1{24-25,120} 2{48,54} 3{169}
83 | 79 0.41% 0.35% 0.00% 0.00% 0x0 14604 0x400b56 18088 12601 26671 9 [.] read_write_func no_false_sharing.exe false_sharing_example.c:145 0{0-1,4} 1{24-25,120} 2{48,54} 3{169}
84 | 80 0.00% 0.00% 24.80% 100.00% 0x0 14604 0x400b61 0 0 0 9 [.] read_write_func no_false_sharing.exe false_sharing_example.c:145 0{0-1,4} 1{24-25,120} 2{48,54} 3{169}
85 | 81 7.50% 9.92% 0.00% 0.00% 0x20 14604 0x400ba7 2470 1729 1897 2 [.] read_write_func no_false_sharing.exe false_sharing_example.c:154 1{122} 2{144}
86 | 82 17.61% 20.89% 0.00% 0.00% 0x28 14604 0x400bc1 2294 1575 1649 2 [.] read_write_func no_false_sharing.exe false_sharing_example.c:158 2{53} 3{170}
87 | 83 8.97% 12.96% 0.00% 0.00% 0x30 14604 0x400bdb 2325 1897 1828 2 [.] read_write_func no_false_sharing.exe false_sharing_example.c:162 0{96} 3{171}
88 | 
89 | 84 -------------------------------------------------------------
90 | 85 1 2832 1119 0 0 0x602100
91 | 86 -------------------------------------------------------------
92 | 87 29.13% 36.19% 0.00% 0.00% 0x20 14604 0x400bb3 1964 1230 1788 2 [.] read_write_func no_false_sharing.exe false_sharing_example.c:155 1{122} 2{144}
93 | 88 43.68% 34.41% 0.00% 0.00% 0x28 14604 0x400bcd 2274 1566 1793 2 [.] read_write_func no_false_sharing.exe false_sharing_example.c:159 2{53} 3{170}
94 | 89 27.19% 29.40% 0.00% 0.00% 0x30 14604 0x400be7 2045 1247 2011 2 [.] read_write_func no_false_sharing.exe false_sharing_example.c:163 0{96} 3{171}
95 | 
96 | 90 -------------------------------------------------------------
97 | 91 2 12 3 161 0 0xffff883ffb6a7e80
98 | 92 -------------------------------------------------------------
99 | 93 58.33% 100.00% 0.00% 0.00% 0x0 14604 0xffffffff810cf16d 1380 941 1229 9 [k] task_tick_fair [kernel.kallsyms] atomic64_64.h:21 0{0,4,96} 1{25,120,122} 2{53} 3{170-171}
100 | 94 16.67% 0.00% 98.76% 0.00% 0x0 14604 0xffffffff810c9379 1794 0 625 13 [k] update_cfs_rq_blocked_load [kernel.kallsyms] atomic64_64.h:45 0{1,4,96} 1{25,120,122} 2{48,53-54,144} 3{169-171}
101 | 95 16.67% 0.00% 0.00% 0.00% 0x0 14604 0xffffffff810ce098 1382 0 867 12 [k] update_cfs_shares [kernel.kallsyms] atomic64_64.h:21 0{1,4,96} 1{25,120,122} 2{53-54,144} 3{169-171}
102 | 96 8.33% 0.00% 0.00% 0.00% 0x8 14604 0xffffffff810cf18c 2560 0 679 8 [k] task_tick_fair [kernel.kallsyms] atomic.h:26 0{4,96} 1{24-25,120,122} 2{54} 3{170}
103 | 97 0.00% 0.00% 1.24% 0.00% 0x8 14604 0xffffffff810cf14f 0 0 0 2 [k] task_tick_fair [kernel.kallsyms] atomic.h:50 2{48,53}
104 | 
105 | 98 -------------------------------------------------------------
106 | 99 3 9 0 0 1 0xffffffff81aec000
107 | 100 -------------------------------------------------------------
108 | 101 77.78% 0.00% 0.00% 0.00% 0x0 14604 0xffffffff810c331f 430 0 234 12 [k] scheduler_tick [kernel.kallsyms] core.c:3055 0{1,4,96} 1{25,120,122} 2{53-54,144} 3{169-171}
109 | 102 22.22% 0.00% 0.00% 0.00% 0x0 14604 0xffffffff810d3126 332 0 103 11 [k] trigger_load_balance [kernel.kallsyms] fair.c:7222 0{1,4,96} 1{25,120,122} 2{53-54,144} 3{170-171}
110 | 103 0.00% 0.00% 0.00% 100.00% 0x0 14604 0xffffffff810eb556 0 0 0 1 [k] do_timer [kernel.kallsyms] timekeeping.c:1875 3{169}
111 | 
112 | 104 -------------------------------------------------------------
113 | 105 4 8 1 97 20 0xffffffff81e3f540
114 | 106 -------------------------------------------------------------
115 | 107 12.50% 0.00% 0.00% 0.00% 0x0 14604 0xffffffff810eb0b0 429 0 0 1 [k] update_wall_time [kernel.kallsyms] timekeeping.c:1719 3{169}
116 | 108 0.00% 0.00% 4.12% 10.00% 0x8 14604 0xffffffff810eb1f3 0 0 0 1 [k] update_wall_time [kernel.kallsyms] timekeeping.c:1605 3{169}
117 | 109 0.00% 0.00% 5.15% 0.00% 0x8 14604 0xffffffff810eb27d 0 0 0 1 [k] update_wall_time [kernel.kallsyms] timekeeping.c:1436 3{169}
118 | 110 0.00% 0.00% 4.12% 0.00% 0x18 14604 0xffffffff810eaf96 0 0 0 1 [k] update_wall_time [kernel.kallsyms] timekeeping.c:1456 3{169}
119 | 111 0.00% 0.00% 1.03% 0.00% 0x20 14604 0xffffffff810eaffa 0 0 0 1 [k] update_wall_time [kernel.kallsyms] timekeeping.c:1489 3{169}
120 | 112 50.00% 100.00% 0.00% 0.00% 0x38 14604 0xffffffff810ea2e1 442 218 526 10 [k] ktime_get [kernel.kallsyms] seqlock.h:77 0{4,96} 1{120,122} 2{53-54,144} 3{169-171}
121 | 113 37.50% 0.00% 0.00% 0.00% 0x38 14604 0xffffffff810eb593 593 0 184 10 [k] ktime_get_update_offsets_n [kernel.kallsyms] seqlock.h:77 0{4,96} 1{25,120,122} 2{53-54,144} 3{170-171}
122 | 114 0.00% 0.00% 11.34% 0.00% 0x38 14604 0xffffffff810eb088 0 0 627 3 [k] update_wall_time [kernel.kallsyms] seqlock.h:180 0{1} 2{54} 3{169}
123 | 115 0.00% 0.00% 26.80% 90.00% 0x38 14604 0xffffffff810eb0c6 0 0 120 1 [k] update_wall_time [kernel.kallsyms] seqlock.h:187 3{169}
124 | 116 0.00% 0.00% 1.03% 0.00% 0x3c 14604 0xffffffff8168ab02 0 0 0 1 [k] _raw_spin_unlock_irqrestor [kernel.kallsyms] spinlock.h:160 3{169}
125 | 117 0.00% 0.00% 46.39% 0.00% 0x3c 14604 0xffffffff8168acff 0 0 1942 3 [k] _raw_spin_lock_irqsave [kernel.kallsyms] spinlock.h:86 0{1} 1{120} 3{169}
126 | 
--------------------------------------------------------------------------------
/false_sharing.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joemario/perf-c2c-usage-files/a74c2c52af840237037247ce7fa6e35c3f6cdb81/false_sharing.exe
--------------------------------------------------------------------------------
/false_sharing_example.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This is an example program to show false sharing between
3 | * numa nodes.
4 | *
5 | * It can be compiled two ways:
6 | * gcc -g false_sharing_example.c -pthread -lnuma -o false_sharing.exe
7 | * gcc -g false_sharing_example.c -pthread -lnuma -DNO_FALSE_SHARING -o no_false_sharing.exe
8 | *
9 | * The -DNO_FALSE_SHARING macro reduces the false sharing by expanding the shared data
10 | * structure into two different cachelines, (and it runs faster).
11 | *
12 | * The usage is:
13 | * ./false_sharing.exe
14 | * ./no_false_sharing.exe
15 | *
16 | * The program will make half the threads writer threads and half reader
17 | * threads. It will pin those threads in round-robin format to the
18 | * different numa nodes in the system.
19 | *
20 | * For example, on a system with 4 numa nodes:
21 | * ./false_sharing.exe 2
22 | * 12165 mticks, reader_thd (thread 6), on node 2 (cpu 144).
23 | * 12403 mticks, reader_thd (thread 5), on node 1 (cpu 31).
24 | * 12514 mticks, reader_thd (thread 4), on node 0 (cpu 96).
25 | * 12703 mticks, reader_thd (thread 7), on node 3 (cpu 170).
26 | * 12982 mticks, lock_th (thread 0), on node 0 (cpu 1).
27 | * 13018 mticks, lock_th (thread 1), on node 1 (cpu 24).
28 | * 13049 mticks, lock_th (thread 3), on node 3 (cpu 169).
29 | * 13050 mticks, lock_th (thread 2), on node 2 (cpu 49).
30 | *
31 | * # ./no_false_sharing.exe 2
32 | * 1918 mticks, reader_thd (thread 4), on node 0 (cpu 96).
33 | * 2432 mticks, reader_thd (thread 7), on node 3 (cpu 170).
34 | * 2468 mticks, reader_thd (thread 6), on node 2 (cpu 146).
35 | * 3903 mticks, reader_thd (thread 5), on node 1 (cpu 40).
36 | * 7560 mticks, lock_th (thread 0), on node 0 (cpu 1).
37 | * 7574 mticks, lock_th (thread 2), on node 2 (cpu 145).
38 | * 7602 mticks, lock_th (thread 3), on node 3 (cpu 169).
39 | * 7625 mticks, lock_th (thread 1), on node 1 (cpu 24).
40 | *
41 | */
42 | 
43 | #define _MULTI_THREADED
44 | #define _GNU_SOURCE
45 | #include <pthread.h>
46 | #include <stdio.h>
47 | #include <stdlib.h>
48 | #include <string.h>
49 | #include <unistd.h>
50 | #include <stdint.h>
51 | #include <errno.h>
52 | #include <sched.h>
53 | #include <numa.h>
54 | 
55 | /*
56 | * A thread on each numa node seems to provoke cache misses
57 | */
58 | #define LOOP_CNT (5 * 1024 * 1024)
59 | 
60 | #if defined(__x86_64__) || defined(__i386__)
61 | static __inline__ uint64_t rdtsc() {
62 | unsigned hi, lo;
63 | __asm__ __volatile__ ( "rdtsc" : "=a"(lo), "=d"(hi));
64 | return ( (uint64_t)lo) | ( ((uint64_t)hi) << 32);
65 | }
66 | 
67 | #elif defined(__aarch64__)
68 | static __inline__ uint64_t rdtsc(void)
69 | {
70 | uint64_t val;
71 | 
72 | /*
73 | * According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
74 | * system counter is at least 56 bits wide; from Armv8.6, the counter
75 | * must be 64 bits wide. So the system counter could be less than 64
76 | * bits wide, and that case is indicated by the flag 'cap_user_time_short'
77 | * being true.
78 | */
79 | asm volatile("mrs %0, cntvct_el0" : "=r" (val));
80 | 
81 | return val;
82 | }
83 | #endif
84 | 
85 | 
86 | /*
87 | * Create a struct where reader fields share a cacheline with the hot lock field.
88 | * Compiling with -DNO_FALSE_SHARING inserts padding to avoid that sharing.
89 | */
90 | typedef struct _buf {
91 | long lock0;
92 | long lock1;
93 | long reserved1;
94 | #if defined(NO_FALSE_SHARING)
95 | long pad[5]; // to keep the 'lock*' fields on their own cacheline.
96 | #else
97 | long pad[1]; // to provoke false sharing.
98 | #endif
99 | long reader1;
100 | long reader2;
101 | long reader3;
102 | long reader4;
103 | } buf __attribute__((aligned (64)));
104 | 
105 | buf buf1;
106 | buf buf2;
107 | 
108 | volatile int wait_to_begin = 1;
109 | struct thread_data *thread;
110 | int max_node_num;
111 | int num_threads;
112 | char * lock_thd_name = "lock_th";
113 | char * reader_thd_name = "reader_thd";
114 | 
115 | #define checkResults(string, val) { \
116 | if (val) { \
117 | printf("Failed with %d at %s", val, string); \
118 | exit(1); \
119 | } \
120 | }
121 | 
122 | struct thread_data {
123 | pthread_t tid;
124 | long tix;
125 | long node;
126 | char *name;
127 | };
128 | 
129 | /*
130 | * Bind a thread to the specified numa node.
131 | */
132 | void setAffinity(void *parm) {
133 | volatile uint64_t rc, j;
134 | int node = ((struct thread_data *)parm)->node;
135 | char *func_name = ((struct thread_data *)parm)->name;
136 | 
137 | numa_run_on_node(node);
138 | pthread_setname_np(pthread_self(),func_name);
139 | }
140 | 
141 | /*
142 | * Thread function to simulate the false sharing.
143 | * The "lock" threads will test-n-set the lock field,
144 | * while the reader threads will just read the other fields
145 | * in the struct.
146 | */
147 | extern void *read_write_func(void *parm) {
148 | 
149 | int tix = ((struct thread_data *)parm)->tix;
150 | uint64_t start, stop, j;
151 | char *thd_name = ((struct thread_data *)parm)->name;
152 | 
153 | // Pin each thread to a numa node.
154 | setAffinity(parm);
155 | 
156 | // Wait for all threads to get created before starting.
157 | while(wait_to_begin) ;
158 | 
159 | start = rdtsc();
160 | for(j=0; j\n", argv[0] );
210 | printf( "where \"n\" is the number of threads per node\n");
211 | exit(1);
212 | }
213 | 
214 | if ( numa_available() < 0 )
215 | {
216 | printf( "NUMA not available\n" );
217 | exit(1);
218 | }
219 | 
220 | int thread_cnt = atoi(argv[1]);
221 | 
222 | max_node_num = numa_max_node();
223 | if ( max_node_num == 0 )
224 | max_node_num = 1;
225 | int node_cnt = max_node_num + 1;
226 | 
227 | // Use "thread_cnt" threads per node.
228 | num_threads = (max_node_num +1) * thread_cnt;
229 | 
230 | thread = malloc( sizeof(struct thread_data) * num_threads);
231 | 
232 | // Create the first half of threads as lock threads.
233 | // Assign each thread a successive round robin node to
234 | // be pinned to (later after it gets created.)
235 | //
236 | for (i=0; i<=(num_threads/2 - 1); i++) {
237 | thread[i].tix = i;
238 | thread[i].node = i%node_cnt;
239 | thread[i].name = lock_thd_name;
240 | rc = pthread_create(&thread[i].tid, NULL, read_write_func, &thread[i]);
241 | checkResults("pthread_create()\n", rc);
242 | usleep(500);
243 | }
244 | 
245 | // Create the second half of threads as reader threads.
246 | // Assign each thread a successive round robin node to
247 | // be pinned to (later after it gets created.)
248 | //
249 | for (i=((num_threads/2)); i<(num_threads); i++) {
250 | thread[i].tix = i;
251 | thread[i].node = i%node_cnt;
252 | thread[i].name = reader_thd_name;
253 | rc = pthread_create(&thread[i].tid, NULL, read_write_func, &thread[i]);
254 | checkResults("pthread_create()\n", rc);
255 | usleep(500);
256 | }
257 | 
258 | // Sync to let threads start together
259 | usleep(500);
260 | wait_to_begin = 0;
261 | 
262 | for (i=0; i
--------------------------------------------------------------------------------
/perf-c2c-usage.out:
--------------------------------------------------------------------------------
] []
4 | or: perf c2c record [] -- []
5 | 
6 | -e, --event event selector. Use 'perf mem record -e list' to list available events
7 | -k, --all-kernel collect only kernel level data
8 | -l, --ldlat setup mem-loads latency
9 | -u, --all-user collect only user level data
10 | -v, --verbose be more verbose (show counter open errors, etc)
11 | 
12 | The flags for "perf c2c report" are:
13 | 
14 | -c, --coalesce
15 | coalesce fields: pid,tid,iaddr,dso
16 | -d, --display ... lcl,rmt
17 | -g, --call-graph
18 | Display call graph (stack chain/backtrace):
19 | 
20 | print_type: call graph printing style (graph|flat|fractal|folded|none)
21 | threshold: minimum call graph inclusion threshold ()
22 | print_limit: maximum number of call graph entry ()
23 | order: call graph order (caller|callee)
24 | sort_key: call graph sort key (function|address)
25 | branch: include last branch info to call graph (branch)
26 | value: call graph value (percent|period|count)
27 | 
28 | Default: graph,0.5,caller,function,percent
29 | -i, --input the input file to process
30 | -k, --vmlinux vmlinux pathname
31 | -N, --node-info show extra node info in report (repeat for more info)
32 | -v, --verbose be more verbose (show counter open errors, etc)
33 | --full-symbols Display full lenght of symbols
34 | --stats Use the stdio interface
35 | --stdio Use the stdio interface
36 | 
37 | 
--------------------------------------------------------------------------------
/tugtest.c:
--------------------------------------------------------------------------------
1 | //compile with
2 | // gcc -Wall -o tugtest tugtest.c
3 | 
4 | /*
5 | ** simple program to test memory read/write cacheline contention.
6 | */
7 | 
8 | #define _GNU_SOURCE
9 | #include <pthread.h>
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | #include <string.h>
13 | #include <unistd.h>
14 | #include <stdint.h>
15 | #include <errno.h>
16 | #include <sched.h>
17 | #include <sys/time.h>
18 | #include <sys/types.h>
19 | #include <time.h>
20 | 
21 | #define cpu_relax() asm volatile("rep; nop")
22 | #define MAX_CPUS 1024
23 | 
24 | enum { FALSE, TRUE };
25 | enum { SUCCESS, FAILURE };
26 | 
27 | struct reader_thread_arg {
28 | int cpunum;
29 | int idx;
30 | } _rdthdarg[MAX_CPUS];
31 | 
32 | struct writer_thread_arg {
33 | int cpunum;
34 | int idx;
35 | } _wrthdarg[MAX_CPUS];
36 | 
37 | volatile uint64_t lockmem[8] __attribute__((aligned(64)));
38 | static double time_diff(struct timeval x , struct timeval y);
39 | 
40 | int getopt(int argc, char * const argv[], const char *optstring);
41 | extern char *optarg;
42 | extern int optind;
43 | 
44 | volatile int wait_to_start = TRUE;
45 | struct timeval tv_start, tv_stop;
46 | 
47 | int sleep_cnt = 5;
48 | long loop_cnt = 2000000;
49 | int debug = FALSE;
50 | int test_thd_cnt = 0;
51 | int readerIdx = 0;
52 | int writerIdx = 0;
53 | 
54 | #define CPU_BIND(cpu) \
55 | do { \
56 | cpu_set_t cs; \
57 | CPU_ZERO (&cs); \
58 | CPU_SET (cpu, &cs); \
59 | \
60 | if (sched_setaffinity(0, sizeof (cs), &cs) < 0) { \
61 | perror("sched_setaffinity"); \
62 | exit(EXIT_FAILURE); \
63 | }\
64 | } while (0)
65 | 
66 | /*
67 | ** spin attempting to get lockmem
68 | */
69 | void acquire_lock(volatile uint64_t *lock) {
70 | uint64_t expected = 0, new = 1;
71 | uint64_t result = 0;
72 | 
73 | while ((result = __atomic_compare_exchange_n(lock, &expected, new, 0, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) == 0) {
74 | expected = 0; // CAS failed; 'expected' now holds the observed value, so reset it and retry
75 | }
76 | }
77 | 
78 | 
79 | void release_lock(volatile uint64_t *lock)
80 | {
81 | uint64_t expected = 1, new = 0;
82 | uint64_t result = 0;
83 | 
84 | while ((result = __atomic_compare_exchange_n(lock, &expected, new, 0, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) == 0) {
85 | expected = 1; // CAS failed; reset 'expected' and retry
86 | }
87 | }
88 | 
89 | void *writer(void *arg)
90 | {
91 | register long i,j;
92 | struct writer_thread_arg *wt = arg;
93 | int cpu = wt->cpunum;
94 | int idx = wt->idx;
95 | volatile uint64_t *p = (volatile uint64_t *)((char *)lockmem );
96 | 
97 | // Bind this thread to the cpu passed in.
98 | CPU_BIND(cpu);
99 | 
100 | if (debug)
101 | printf("Starting writer thread %d on cpu %d, accessing data at 0x%p.\n", idx, cpu, lockmem);
102 | 
103 | if (idx == 0) {
104 | gettimeofday(&tv_start, NULL);
105 | }
106 | 
107 | // Wait for all the threads to have been kicked off in main().
108 | while (wait_to_start)
109 | ;
110 | 
111 | for (i = 0; i < loop_cnt; i++) {
112 | acquire_lock(p);
113 | for (j = 0; j < sleep_cnt; j++) cpu_relax();
114 | release_lock(p);
115 | }
116 | 
117 | if (idx == 0) {
118 | gettimeofday(&tv_stop, NULL);
119 | }
120 | 
121 | if (debug) printf("Writer on cpu %d finished\n", cpu);
122 | return 0;
123 | }
124 | 
125 | 
126 | void *reader(void *arg)
127 | {
128 | register long i,j;
129 | struct reader_thread_arg *rt = arg;
130 | int cpu = rt->cpunum;
131 | int idx = rt->idx;
132 | 
133 | // Start readers at 32 offset in cacheline
134 | volatile uint64_t *varptr = (volatile uint64_t*)(((char *)lockmem) + 32);
135 | 
136 | CPU_BIND(cpu);
137 | if (debug) {
138 | printf("Starting reader thread %d on cpu %d, accessing data at 0x%p.\n", idx, cpu, varptr);
139 | fflush(stdout);
140 | }
141 | 
142 | while (wait_to_start)
143 | ;
144 | 
145 | for (i = 0; i < loop_cnt ; i++) {
146 | // Force a load - printf should never happen.
147 | //
148 | if (*varptr < 0 )
149 | printf("varptr < 0\n");
150 | for (j = 0; j < sleep_cnt; j++)
151 | cpu_relax();
152 | }
153 | 
154 | if (debug) {
155 | printf("Reader on cpu %d finished\n", cpu);
156 | fflush(stdout);
157 | }
158 | 
159 | return 0;
160 | }
161 | 
162 | 
163 | int main(int argc, char *argv[])
164 | {
165 | int opt;
166 | char *usage = " ./tugtest -r -r -w -w \n Example: \n ./tugtest -r2 -w6 -w21 -r10 -r3 -r17 -w8 -w19 -d \n The above will create reader threads on cpus 2,10,3,17 and writers on cpus 6,21,8,19. It will also set debug. \n The -S flag can be used to change sleep cycles between loops. Default is -S5 \n The -L flag is for the number of loops for each reader and writer thread to execute, with a default of -L2000000";
167 | 
168 | //
169 | // process the command line
170 | //
171 | while ((opt = getopt(argc, argv, "hptdm:r:w:L:S:")) != -1) {
172 | 
173 | switch (opt) {
174 | 
175 | // Debug
176 | case 'D':
177 | case 'd':
178 | debug = TRUE;
179 | break;
180 | 
181 | // Reader threads
182 | case 'r':
183 | _rdthdarg[readerIdx].cpunum = atoi(optarg);
184 | _rdthdarg[readerIdx].idx = readerIdx;
185 | if (debug) printf("Reader cpu: %d, idx: %d\n", _rdthdarg[readerIdx].cpunum, _rdthdarg[readerIdx].idx);
186 | readerIdx++;
187 | test_thd_cnt++;
188 | break;
189 | 
190 | // Writer threads
191 | case 'w':
192 | _wrthdarg[writerIdx].cpunum = atoi(optarg);
193 | _wrthdarg[writerIdx].idx = writerIdx;
194 | if (debug) printf("Writer cpu: %d\n", _wrthdarg[writerIdx].cpunum);
195 | writerIdx++;
196 | test_thd_cnt++;
197 | break;
198 | 
199 | // Loop count
200 | case 'L':
201 | loop_cnt = atoi(optarg);
202 | if (debug) printf("loop count is %ld\n",loop_cnt);
203 | if (loop_cnt < 1) {
204 | printf("loop cnt must be >= 1\n");
205 | exit(FAILURE);
206 | }
207 | break;
208 | 
209 | // Sleep loop count
210 | case 'S':
211 | sleep_cnt = atoi(optarg);
212 | if (sleep_cnt < 0) {
213 | printf("sleep cnt, between loops, must be >= 0\n");
214 | exit(FAILURE);
215 | }
216 | break;
217 | 
218 | case 'h': /* -h for help */
219 | printf("usage: %s\n", usage);
220 | 
221 | exit(0);
222 | }
223 | }
224 | 
225 | if (debug) {
226 | printf("Loop cnt: %ld\n", loop_cnt);
227 | printf("Sleep loop cnt, between loops: %d\n", sleep_cnt);
228 | }
229 | 
230 | //
231 | // create the threads that will ping pong back and forth
232 | //
233 | pthread_t thread[test_thd_cnt];
234 | int i, ti;
235 | 
236 | for (i=0;i
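
--------------------------------------------------------------------------------
A note on the padding technique used above: the fix that false_sharing_example.c
applies with -DNO_FALSE_SHARING generalizes to any hot structure - keep fields
written by different threads on different 64-byte cachelines. The listing below
is a minimal stand-alone sketch of that idea; it is not part of this repository,
and the file name (sketch.c), output names, and the PAD_TO_CACHELINE / LOOPS
macros are made up for illustration. Built once with and once without
-DPAD_TO_CACHELINE, it gives a small before/after pair that can be examined the
same way the blog examines false_sharing.exe and no_false_sharing.exe, e.g.
with "perf c2c record" followed by "perf c2c report" (see perf-c2c-usage.out
above for the flags).

/*
 * sketch.c - minimal false-sharing illustration (hypothetical example,
 * not part of this repo). Two threads each increment their own counter.
 * Without padding, both counters sit on the same 64-byte cacheline and
 * the stores ping-pong the line between cores; with PAD_TO_CACHELINE
 * defined, each counter gets its own cacheline.
 *
 *   gcc -g -O2 -pthread sketch.c -o sharing.exe
 *   gcc -g -O2 -pthread -DPAD_TO_CACHELINE sketch.c -o padded.exe
 */
#include <pthread.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define LOOPS (100 * 1000 * 1000UL)

struct counters {
    volatile uint64_t a;
#if defined(PAD_TO_CACHELINE)
    char pad[64 - sizeof(uint64_t)];   /* push 'b' onto its own 64-byte line */
#endif
    volatile uint64_t b;
} __attribute__((aligned(64)));

static struct counters c;

static void *bump_a(void *arg)
{
    (void)arg;
    for (uint64_t i = 0; i < LOOPS; i++)
        c.a++;                         /* writer of one field ...            */
    return NULL;
}

static void *bump_b(void *arg)
{
    (void)arg;
    for (uint64_t i = 0; i < LOOPS; i++)
        c.b++;                         /* ... contends with the other writer */
    return NULL;
}

int main(void)
{
    pthread_t t1, t2;

    pthread_create(&t1, NULL, bump_a, NULL);
    pthread_create(&t2, NULL, bump_b, NULL);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);

    printf("a=%" PRIu64 " b=%" PRIu64 "\n", c.a, c.b);
    return 0;
}

Run under perf c2c, the unpadded build should surface one hot cacheline whose
HITMs come from the two store offsets (0x0 and 0x8), much as offsets 0x0-0x30
of cacheline 0x602180 dominate the Pareto table in c2c_example_report.out; the
padded build should largely drop out of the shared-cacheline tables.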