├── results-latencies └── readme ├── results-crossthread └── readme ├── requirements.txt ├── .gitignore ├── demo-crossthread ├── skylake-tlbsets-ht.pdf ├── skylake-tlbsets-ht.png ├── sandybridge-tlbsets-ht.pdf └── sandybridge-tlbsets-ht.png ├── c ├── Makefile ├── profile.h ├── tvlb.h └── tvlb-verbatim.c ├── py ├── log.py ├── perf.py ├── cpus.py ├── tlblib.py ├── crossthread.py └── tlb-latency.py ├── test └── fulltest.sh ├── README.md └── demo-latencies ├── sandybridge-latency.txt └── skylake-latencies.txt /results-latencies/readme: -------------------------------------------------------------------------------- 1 | results-latencies placeholder 2 | -------------------------------------------------------------------------------- /results-crossthread/readme: -------------------------------------------------------------------------------- 1 | results-crossthread placeholder 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scipy 2 | numpy 3 | sklearn 4 | matplotlib 5 | cpuid 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .tvlbsinglepagefile 2 | *.pyc 3 | c/tvlb-verbatim 4 | *.npy 5 | *.pdf 6 | *.png 7 | -------------------------------------------------------------------------------- /demo-crossthread/skylake-tlbsets-ht.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bilalahmaddev/tlbkit/master/demo-crossthread/skylake-tlbsets-ht.pdf -------------------------------------------------------------------------------- /demo-crossthread/skylake-tlbsets-ht.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bilalahmaddev/tlbkit/master/demo-crossthread/skylake-tlbsets-ht.png -------------------------------------------------------------------------------- /demo-crossthread/sandybridge-tlbsets-ht.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bilalahmaddev/tlbkit/master/demo-crossthread/sandybridge-tlbsets-ht.pdf -------------------------------------------------------------------------------- /demo-crossthread/sandybridge-tlbsets-ht.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bilalahmaddev/tlbkit/master/demo-crossthread/sandybridge-tlbsets-ht.png -------------------------------------------------------------------------------- /c/Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS=-Wall -O3 -D_GNU_SOURCE -Werror -pie -fpic -g 2 | LDFLAGS=-lpthread 3 | 4 | all: tvlb-verbatim 5 | 6 | tvlb-verbatim: tvlb-verbatim.c tvlb.h 7 | $(CC) -o tvlb-verbatim tvlb-verbatim.c $(LDFLAGS) $(CFLAGS) 8 | 9 | clean: 10 | rm -f *.o tvlb-verbatim *.png *.pyc 11 | 12 | -------------------------------------------------------------------------------- /py/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import cpuid 3 | 4 | def log(farg, *args): 5 | lgr.info(farg, *args) 6 | 7 | cpu_uarch=cpuid.cpu_microarchitecture()[0] 8 | fn='results-latencies/' + cpu_uarch + '.log' 9 | lgr = logging.getLogger('tlblatency') 10 | fh = logging.FileHandler(fn) 11 | lgr.setLevel(logging.DEBUG) 12 | fh.setLevel(logging.DEBUG) 13 | frmt = logging.Formatter('%(name)s - %(levelname)s - %(message)s') 14 | fh.setFormatter(frmt) 15 | lgr.addHandler(fh) 16 | sh=logging.StreamHandler() 17 | sh.setFormatter(frmt) 18 | lgr.addHandler(sh) 19 | 20 | log('logging to %s', fn) 21 | -------------------------------------------------------------------------------- /py/perf.py: -------------------------------------------------------------------------------- 1 | 2 | def out_to_fields(o): 3 | """ 4 | parse output of perf command in 'o' executed with -x for machine-readable 5 | output, and return a dict of counter(str) -> value(int) 6 | """ 7 | yset=dict() 8 | lines=[x for x in o.split('\n') if len(x) > 0 and x[0] != '#'] 9 | for l in lines: 10 | fields=l.split(',') 11 | if len(fields) < 3: 12 | print 'not seeing expected output here:', fields 13 | e=fields[2] 14 | v=int(fields[0]) 15 | yset[e] = v 16 | return yset 17 | 18 | -------------------------------------------------------------------------------- /test/fulltest.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | if [ ! -d test ] 4 | then echo 'please run me from the top dir.' 5 | exit 1 6 | fi 7 | 8 | for d in results-* 9 | do if [ ! -d $d ] 10 | then echo "expecting results-.. to be a dir" 11 | exit 1 12 | fi 13 | rm -f $d/* 14 | echo "$d placeholder" >$d/readme 15 | done 16 | 17 | rm -rf env 18 | make -C c clean 19 | make -C c 20 | virtualenv -p python2.7 env 21 | ./env/bin/pip install -r requirements.txt 22 | echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid 23 | ./env/bin/python py/crossthread.py 24 | ./env/bin/python py/tlb-latency.py 25 | 26 | -------------------------------------------------------------------------------- /py/cpus.py: -------------------------------------------------------------------------------- 1 | 2 | def corelist(allthreads=True): 3 | """ return array of processor tuples that share a core, i.e. sibling hyperthreads """ 4 | cpulines=open('/proc/cpuinfo').read().split('\n') 5 | package=None 6 | core=None 7 | thread=None 8 | 9 | d=dict() 10 | 11 | package=None 12 | logical_id=None 13 | 14 | for line in cpulines: 15 | fields=line.split(':') 16 | try: 17 | n=int(fields[1]) 18 | except: 19 | continue 20 | if 'processor' in line: 21 | assert logical_id == None 22 | logical_id=n 23 | elif 'physical id' in line: 24 | assert package == None 25 | package=n 26 | elif 'core id' in line: 27 | assert package != None 28 | assert logical_id != None 29 | full_core_id=(package,n) 30 | if not full_core_id in d: 31 | d[full_core_id] = [] 32 | if allthreads: 33 | d[full_core_id].append(logical_id) 34 | else: 35 | d[full_core_id]=logical_id 36 | package=None 37 | logical_id=None 38 | else: 39 | continue 40 | return [d[l] for l in sorted(d)] 41 | 42 | #l=corelist() 43 | #print(l) 44 | 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Demo 2 | ==== 3 | ![Skylake demo pic](demo-crossthread/skylake-tlbsets-ht.png?raw=true "Crossthread interference result on skylake") 4 | 5 | Layout 6 | ====== 7 | 8 | c: C code 9 | py: python code 10 | demo-crossthread: examples of crossthread interference output 11 | demo-latencies: examples of latency measurement output 12 | results-crossthread: output dir for crossthread code 13 | results-latencies: output dir for latency measurement code 14 | 15 | Setup 16 | ===== 17 | 18 | To run: 19 | 20 | 0. Have perf and a compiler installed 21 | 22 | 1. build the C program, set up python env, perf privileges for unprivileged users: 23 | 24 | ```console 25 | make -C c 26 | rm -rf env 27 | virtualenv -p python2.7 env 28 | ./env/bin/pip install -r requirements.txt 29 | echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid 30 | ``` 31 | 32 | Inter-hyperthread tlb interference shown using performance counters 33 | =================================================================== 34 | 35 | under some tlb size and structure assumptions (works on skylake), probe 36 | cross-thread tlb interference 37 | 38 | ```console 39 | ./env/bin/python py/crossthread.py 40 | ``` 41 | 42 | on skylake, the results should look like in the demo/ dir. 43 | 44 | Calculate TLB miss latency 45 | ============================= 46 | 47 | ```console 48 | ./env/bin/python py/tlb-latency.py 49 | ``` 50 | 51 | on skylake, the results should look like in demo/skylake-latencies.txt. 52 | 53 | 54 | Enjoy! 55 | 56 | Ben 57 | 58 | -------------------------------------------------------------------------------- /c/profile.h: -------------------------------------------------------------------------------- 1 | 2 | /* This code was gratefully taken from RevAnC, 3 | * specifically include/profile.h and 4 | * include/x86-64/profile.h, written by 5 | * Stephan van Schaik, see 6 | * https://github.com/vusec/revanc 7 | * -BJG 8 | */ 9 | 10 | /* This Source Code Form is subject to the terms of the Mozilla Public 11 | * License, v. 2.0. If a copy of the MPL was not distributed with this 12 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 13 | */ 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | typedef uint64_t cycles_t; 25 | 26 | static inline void code_barrier(void) 27 | { 28 | asm volatile("cpuid\n" ::: "%rax", "%rbx", "%rcx", "%rdx"); 29 | } 30 | 31 | static inline void data_barrier(void) 32 | { 33 | asm volatile("mfence\n" ::: "memory"); 34 | } 35 | 36 | static inline cycles_t rdtsc(void) 37 | { 38 | cycles_t cycles_lo, cycles_hi; 39 | 40 | asm volatile("rdtscp\n" : 41 | "=a" (cycles_lo), "=d" (cycles_hi) :: 42 | "%rcx"); 43 | 44 | return ((uint64_t)cycles_hi << 32) | cycles_lo; 45 | } 46 | 47 | 48 | static inline uint64_t profile_access(volatile char *p) 49 | { 50 | uint64_t past, now; 51 | 52 | data_barrier(); 53 | code_barrier(); 54 | past = rdtsc(); 55 | data_barrier(); 56 | 57 | *p; 58 | 59 | data_barrier(); 60 | now = rdtsc(); 61 | code_barrier(); 62 | data_barrier(); 63 | 64 | return now - past; 65 | } 66 | 67 | static inline uint64_t profile_call_return(void (*p)(void)) 68 | { 69 | uint64_t past, now; 70 | 71 | data_barrier(); 72 | code_barrier(); 73 | past = rdtsc(); 74 | data_barrier(); 75 | 76 | p(); 77 | 78 | data_barrier(); 79 | now = rdtsc(); 80 | code_barrier(); 81 | data_barrier(); 82 | 83 | return now - past; 84 | } 85 | 86 | #define PROFILE_ACCESS(code, cycles) do { \ 87 | uint64_t _past, _now; \ 88 | data_barrier(); \ 89 | code_barrier(); \ 90 | _past = rdtsc(); \ 91 | data_barrier(); \ 92 | do { code } while(0); \ 93 | data_barrier(); \ 94 | _now = rdtsc(); \ 95 | code_barrier(); \ 96 | data_barrier(); \ 97 | cycles = _now-_past; \ 98 | } while(0) 99 | -------------------------------------------------------------------------------- /c/tvlb.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _TVLB_H 3 | #define _TVLB_H 1 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | #ifndef NO_PTHREAD 21 | #include 22 | #include 23 | #endif 24 | 25 | 26 | #define PAGE 4096 27 | #define SHAREFILE ".tvlbsinglepagefile" 28 | #define SYNCFILE ".tvlbsyncfile" 29 | #define SECRETBITS 256 30 | 31 | static int createfile(const char *fn) 32 | { 33 | int fd; 34 | #ifndef NO_PTHREAD 35 | struct stat sb; 36 | char sharebuf[PAGE]; 37 | if(stat(fn, &sb) != 0 || sb.st_size != PAGE) { 38 | fd = open(fn, O_RDWR | O_CREAT | O_TRUNC, 0644); 39 | if(fd < 0) { 40 | perror("open"); 41 | fprintf(stderr, "createfile: couldn't create shared file %s\n", fn); 42 | exit(1); 43 | } 44 | if(write(fd, sharebuf, PAGE) != PAGE) { 45 | fprintf(stderr, "createfile: couldn't write shared file\n"); 46 | exit(1); 47 | } 48 | return fd; 49 | } 50 | 51 | assert(sb.st_size == PAGE); 52 | #endif 53 | 54 | fd = open(fn, O_RDWR, 0644); 55 | if(fd < 0) { 56 | perror(fn); 57 | fprintf(stderr, "createfile: couldn't open shared file\n"); 58 | exit(1); 59 | } 60 | return fd; 61 | 62 | } 63 | 64 | #ifndef NO_PTHREAD 65 | static int am_pinned = 0; 66 | static void pin_cpu(size_t i) 67 | { 68 | cpu_set_t cpu_set; 69 | pthread_t thread; 70 | 71 | thread = pthread_self(); 72 | 73 | assert (i >= 0); 74 | assert (i < CPU_SETSIZE); 75 | 76 | CPU_ZERO(&cpu_set); 77 | CPU_SET(i, &cpu_set); 78 | 79 | int v = pthread_setaffinity_np(thread, sizeof cpu_set, &cpu_set); 80 | if(v != 0) { perror("pthread_setaffinity_np"); exit(1); } 81 | fprintf(stderr, "# cpu %d\n", (int) i); 82 | am_pinned=1; 83 | } 84 | #endif 85 | 86 | #endif 87 | -------------------------------------------------------------------------------- /py/tlblib.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import subprocess 3 | import collections 4 | 5 | def popcount_py(x): 6 | return bin(x).count("1") 7 | 8 | def elem_id_l2(el,xorsize=7): 9 | """ return set id for a particular page number by XORing 10 | together the lowest 'xorsize' bits. the default (7) 11 | is the skylake addressing function. 12 | """ 13 | v1 = el % (2**xorsize) 14 | v2 = (el>>xorsize) % (2**xorsize) 15 | return v1 ^ v2 16 | 17 | def elem_id_l1(el): 18 | """ set number for L1 tlb on all known uarchs """ 19 | return el % 16 20 | 21 | def generate_set_l1(ident,size=8): 22 | assert ident >= 0 23 | l=[] 24 | x=ident 25 | while len(l) < size: 26 | l.append(x) 27 | x += 16 28 | assert len(l) == size 29 | return l 30 | 31 | def generate_set_l2(ident,size=13): 32 | assert ident >= 0 33 | assert ident < 256 34 | l=[] 35 | for x in range(2**18): 36 | assert len(l) < size 37 | if elem_id_l2(x) == ident: 38 | l.append(x) 39 | if len(l) == size: 40 | return l 41 | return l 42 | 43 | def generate_set_l2_general(ident,size=13): 44 | """ return list of page numbers that collide in L2 TLB, likely 45 | independent of addressing function """ 46 | assert ident >= 0 47 | assert ident < 256 48 | l=[] 49 | k=0 50 | for x in range(size): 51 | k+=1 52 | l.append(k * 2**16 + ident) 53 | assert len(l) == size 54 | return l 55 | 56 | class TestTLBLib(unittest.TestCase): 57 | def xorlist(self, inlist): 58 | """ xor successive elements together, just for the unit test. """ 59 | x = [inlist[i+1]^inlist[i] for i in range(len(inlist)-1)] 60 | return x 61 | def test_elem(self): 62 | """ test we generate a xor pattern for each set, based on experimental 63 | data 64 | """ 65 | my_xorlist=[129, 387, 129, 903, 129, 387, 129, 1935, 129, 387, 66 | 129, 903, 129, 387, 129, 3999, 129, 387, 129, 903, 129, 387, 67 | 129, 1935, 129, 387, 129, 903, 129, 387, 129, 8127, 129, 387, 68 | 129, 903, 129, 387, 129, 1935, 129, 387, 129, 903, 129, 387, 69 | 129, 3999, 129, 387, 129, 903, 129, 387, 129, 1935, 129, 387, 70 | 129, 903, 129, 387, 129, 16383, 129, 387, 129, 903, 129, 387, 71 | 129, 1935, 129, 387, 129, 903, 129, 387, 129, 3999, 129, 387, 72 | 129, 903, 129, 387, 129, 1935, 129, 387, 129, 903, 129, 387, 73 | 129, 8127, 129, 387, 129, 903, 129, 387, 129, 1935, 129, 387, 74 | 129, 903, 129, 387, 129, 3999, 129, 387, 129, 903, 129, 387, 75 | 129, 1935, 129, 387, 129, 903, 129, 387, 129] 76 | generated=set() 77 | for i in range(128): 78 | s=generate_set_l2(i,128) 79 | self.assertEqual(my_xorlist, self.xorlist(s)) 80 | for e in s: 81 | """ some internal consistency tests """ 82 | self.assertEqual(elem_id_l2(e), i) 83 | assert e not in generated 84 | assert e >= 0 85 | assert e < 2**14 86 | generated.add(e) 87 | self.assertEqual(len(generated), 16384) 88 | 89 | if __name__ == "__main__": 90 | unittest.main() 91 | -------------------------------------------------------------------------------- /demo-latencies/sandybridge-latency.txt: -------------------------------------------------------------------------------- 1 | tlblatency - INFO - logging to results-latencies/sandybridge.log 2 | tlblatency - INFO - 3 | tlblatency - INFO - * Probing set sizes 4 | tlblatency - INFO - 5 | tlblatency - INFO - calculating dtlb_load_misses.stlb_hit and cycles slope for l1 miss test size 1 6 | tlblatency - INFO - done. misses: 0.0, cycles: 96.3 7 | tlblatency - INFO - calculating dtlb_load_misses.stlb_hit and cycles slope for l1 miss test size 2 8 | tlblatency - INFO - done. misses: 0.1, cycles: 143.1 9 | tlblatency - INFO - calculating dtlb_load_misses.stlb_hit and cycles slope for l1 miss test size 3 10 | tlblatency - INFO - done. misses: 0.0, cycles: 191.1 11 | tlblatency - INFO - calculating dtlb_load_misses.stlb_hit and cycles slope for l1 miss test size 4 12 | tlblatency - INFO - done. misses: 5.4, cycles: 273.4 13 | tlblatency - INFO - Found 14 | tlblatency - INFO - 15 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 1 16 | tlblatency - INFO - done. misses: 0.0, cycles: 96.2 17 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 2 18 | tlblatency - INFO - done. misses: 0.0, cycles: 142.0 19 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 3 20 | tlblatency - INFO - done. misses: 0.0, cycles: 195.9 21 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 4 22 | tlblatency - INFO - done. misses: 5.0, cycles: 335.4 23 | tlblatency - INFO - Found 24 | tlblatency - INFO - 25 | tlblatency - INFO - L1 size: 4 L2 size: 4 26 | tlblatency - INFO - Assuming L1dTLB set size: 4 27 | tlblatency - INFO - Assuming L2dTLB/STLB set size: 4 28 | tlblatency - INFO - 29 | tlblatency - INFO - * Configuring miss and no-miss TLB sets of equal size 30 | tlblatency - INFO - 31 | tlblatency - INFO - l1 misses: [1, 17, 33, 49, 65] 32 | tlblatency - INFO - l1 no misses: [1, 17, 33, 50, 66] 33 | tlblatency - INFO - l2 misses: [65537, 131073, 196609, 262145, 327681] 34 | tlblatency - INFO - l2 no misses: [65537, 131073, 196609, 262146, 327682] 35 | tlblatency - INFO - 36 | tlblatency - INFO - * Probing miss rate and latency differences of miss and no-miss sets of equal size 37 | tlblatency - INFO - 38 | tlblatency - INFO - calculating dtlb_load_misses.stlb_hit and cycles slope for l1 miss list 39 | tlblatency - INFO - done. misses: 4.1, cycles: 268.0 40 | tlblatency - INFO - calculating dtlb_load_misses.stlb_hit and cycles slope for l1 no-miss list 41 | tlblatency - INFO - done. misses: 0.1, cycles: 243.0 42 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss list 43 | tlblatency - INFO - done. misses: 5.5, cycles: 340.0 44 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 no-miss list 45 | tlblatency - INFO - done. misses: 0.0, cycles: 239.3 46 | tlblatency - INFO - 47 | tlblatency - INFO - * Results 48 | tlblatency - INFO - 49 | tlblatency - INFO - L1TLB misses for miss/no-miss set: 4.10 vs 0.12. Cycles per iteration: 268.0 vs 243.0 50 | tlblatency - INFO - Extra misses: 3.98 extra cycles: 25.03 cycles per miss: 6.29 51 | tlblatency - INFO - L2TLB misses for miss/no-miss set: 5.49 vs 0.00. Cycles per iteration: 340.0 vs 239.3 52 | tlblatency - INFO - Extra misses: 5.49 extra cycles: 100.77 cycles per miss: 18.35 53 | -------------------------------------------------------------------------------- /c/tvlb-verbatim.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "profile.h" 12 | 13 | #define VTARGET 0x300000000000ULL 14 | 15 | #define CACHELINE 64 16 | #define SETS 128 17 | #define PAGE 4096 18 | 19 | 20 | void allocate_buffer(unsigned long long p, int fd) 21 | { 22 | /* allocate buffer that is a particular page number offset from the base, is RWX and contains usable instructions 23 | * in case we want to execute it. it all points to the same physical page so we don't have to worry about the effects 24 | * of the cache too much when calculating latency, but ought to be just seeing the TLB latency. 25 | */ 26 | assert(p >= 0); 27 | volatile char *target = (void *) (VTARGET+p*PAGE); 28 | volatile char *ret; 29 | ret = mmap((void *) target, PAGE, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_SHARED|MAP_FILE|MAP_FIXED, fd, 0); 30 | if(ret == MAP_FAILED) { 31 | perror("mmap"); 32 | exit(1); 33 | } 34 | if(ret != (volatile char *) target) { fprintf(stderr, "Wrong mapping\n"); exit(1); } 35 | *ret; 36 | memset((char *) ret, 0xc3, PAGE); /* RETQ instruction */ 37 | } 38 | 39 | #include "tvlb.h" 40 | 41 | int main(int argc, char *argv[]) 42 | { 43 | const char *progname = argv[0]; 44 | int fd = createfile(SHAREFILE); 45 | unsigned long long repetitions=1000; 46 | int c; 47 | int cpu=1; /* logical core */ 48 | int datapages = 0, codepages = 0; 49 | #define MAXPAGES 3000 50 | static unsigned long long pagelist_code[MAXPAGES], pagelist_data[MAXPAGES]; 51 | 52 | unsigned long long pageno; 53 | 54 | while ((c = getopt (argc, argv, "r:c:D:C:")) != -1) { 55 | switch (c) 56 | { 57 | case 'D': 58 | pageno = atoi(optarg); 59 | pagelist_data[datapages++] = pageno; 60 | allocate_buffer(pageno, fd); 61 | break; 62 | case 'C': 63 | pageno = atoi(optarg); 64 | pagelist_code[codepages++] = pageno; 65 | allocate_buffer(pageno, fd); 66 | break; 67 | case 'c': 68 | cpu=atoi(optarg); 69 | break; 70 | case 'r': 71 | repetitions = atoll(optarg); 72 | break; 73 | default: 74 | fprintf(stderr, "usage\n"); 75 | abort (); 76 | } 77 | } 78 | 79 | argc -= optind; 80 | argv += optind; 81 | 82 | if(argc > 0) { 83 | fprintf(stderr, "usage: %s [-c] [-r] [-Ddatapage] [-Ccodepage]\n", progname); 84 | exit(1); 85 | } 86 | 87 | pin_cpu(cpu); 88 | 89 | if(repetitions < 0) { fprintf(stderr, "args\n"); return 1; } 90 | fprintf(stderr, "# cpu %d repetitions %llu\n", cpu, repetitions); 91 | 92 | int a; 93 | int r; 94 | 95 | for(r = 0; r < repetitions; r++) { 96 | for(a = 0; a < datapages; a++) { 97 | unsigned long long p = pagelist_data[a]; 98 | volatile int *probe = (volatile int *) (VTARGET+(p*PAGE)); 99 | 100 | *probe; 101 | data_barrier(); // for timing info 102 | } 103 | 104 | for(a = 0; a < codepages; a++) { 105 | unsigned long long p = pagelist_code[a]; 106 | void (*probe)(void) = (void (*)(void)) (VTARGET+(p*PAGE)); 107 | probe(); 108 | } 109 | 110 | } 111 | 112 | return 0; 113 | } 114 | 115 | -------------------------------------------------------------------------------- /demo-latencies/skylake-latencies.txt: -------------------------------------------------------------------------------- 1 | tlblatency - INFO - logging to results-latencies/skylake.log 2 | tlblatency - INFO - 3 | tlblatency - INFO - * Probing set sizes 4 | tlblatency - INFO - 5 | tlblatency - INFO - calculating dtlb_load_misses.stlb_hit and cycles slope for l1 miss test size 1 6 | tlblatency - INFO - done. misses: 0.0, cycles: 95.5 7 | tlblatency - INFO - calculating dtlb_load_misses.stlb_hit and cycles slope for l1 miss test size 2 8 | tlblatency - INFO - done. misses: 0.0, cycles: 142.7 9 | tlblatency - INFO - calculating dtlb_load_misses.stlb_hit and cycles slope for l1 miss test size 3 10 | tlblatency - INFO - done. misses: 0.0, cycles: 189.5 11 | tlblatency - INFO - calculating dtlb_load_misses.stlb_hit and cycles slope for l1 miss test size 4 12 | tlblatency - INFO - done. misses: 5.0, cycles: 281.0 13 | tlblatency - INFO - Found 14 | tlblatency - INFO - 15 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 1 16 | tlblatency - INFO - done. misses: 0.0, cycles: 95.5 17 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 2 18 | tlblatency - INFO - done. misses: 0.0, cycles: 142.8 19 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 3 20 | tlblatency - INFO - done. misses: 0.0, cycles: 189.7 21 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 4 22 | tlblatency - INFO - done. misses: 0.0, cycles: 280.9 23 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 5 24 | tlblatency - INFO - done. misses: 0.0, cycles: 336.9 25 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 6 26 | tlblatency - INFO - done. misses: 0.0, cycles: 393.3 27 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 7 28 | tlblatency - INFO - done. misses: 0.0, cycles: 449.0 29 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 8 30 | tlblatency - INFO - done. misses: 0.0, cycles: 503.1 31 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 9 32 | tlblatency - INFO - done. misses: 0.0, cycles: 559.8 33 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 10 34 | tlblatency - INFO - done. misses: 0.0, cycles: 614.1 35 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 11 36 | tlblatency - INFO - done. misses: 0.0, cycles: 670.1 37 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss test size 12 38 | tlblatency - INFO - done. misses: 13.0, cycles: 1026.8 39 | tlblatency - INFO - Found 40 | tlblatency - INFO - 41 | tlblatency - INFO - L1 size: 4 L2 size: 12 42 | tlblatency - INFO - Assuming L1dTLB set size: 4 43 | tlblatency - INFO - Assuming L2dTLB/STLB set size: 12 44 | tlblatency - INFO - 45 | tlblatency - INFO - * Configuring miss and no-miss TLB sets of equal size 46 | tlblatency - INFO - 47 | tlblatency - INFO - l1 misses: [1, 17, 33, 49, 65] 48 | tlblatency - INFO - l1 no misses: [1, 17, 33, 50, 66] 49 | tlblatency - INFO - l2 misses: [65537, 131073, 196609, 262145, 327681, 393217, 458753, 524289, 589825, 655361, 720897, 786433, 851969] 50 | tlblatency - INFO - l2 no misses: [65537, 131073, 196609, 262145, 327681, 393217, 458753, 524289, 589825, 655361, 720897, 786434, 851970] 51 | tlblatency - INFO - 52 | tlblatency - INFO - * Probing miss rate and latency differences of miss and no-miss sets of equal size 53 | tlblatency - INFO - 54 | tlblatency - INFO - calculating dtlb_load_misses.stlb_hit and cycles slope for l1 miss list 55 | tlblatency - INFO - done. misses: 5.0, cycles: 280.8 56 | tlblatency - INFO - calculating dtlb_load_misses.stlb_hit and cycles slope for l1 no-miss list 57 | tlblatency - INFO - done. misses: 0.0, cycles: 235.7 58 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 miss list 59 | tlblatency - INFO - done. misses: 12.9, cycles: 1039.9 60 | tlblatency - INFO - calculating dtlb_load_misses.miss_causes_a_walk and cycles slope for l2 no-miss list 61 | tlblatency - INFO - done. misses: 0.0, cycles: 707.2 62 | tlblatency - INFO - 63 | tlblatency - INFO - * Results 64 | tlblatency - INFO - 65 | tlblatency - INFO - L1TLB misses for miss/no-miss set: 5.00 vs 0.00. Cycles per iteration: 280.8 vs 235.7 66 | tlblatency - INFO - Extra misses: 5.00 extra cycles: 45.09 cycles per miss: 9.02 67 | tlblatency - INFO - L2TLB misses for miss/no-miss set: 12.88 vs 0.00. Cycles per iteration: 1039.9 vs 707.2 68 | tlblatency - INFO - Extra misses: 12.88 extra cycles: 332.69 cycles per miss: 25.83 69 | -------------------------------------------------------------------------------- /py/crossthread.py: -------------------------------------------------------------------------------- 1 | 2 | import multiprocessing 3 | import subprocess 4 | import tlblib 5 | from scipy import stats 6 | import numpy 7 | import matplotlib 8 | matplotlib.use('Agg') 9 | import matplotlib.pyplot as plt 10 | import os 11 | import cpuid 12 | import cpus 13 | import perf 14 | 15 | cpu_uarch=cpuid.cpu_microarchitecture()[0] 16 | 17 | def test_cores(argtype,setsize,nsets,event,pdfname, level, iterate=50000): 18 | events=[event] 19 | print 'reading cpu info from /proc/cpuinfo' 20 | corelist=cpus.corelist() 21 | corepair=corelist[1] 22 | print 'picking 2nd pair of logical processors:', corepair 23 | c1, c2 = corepair 24 | misses=numpy.zeros((nsets,nsets)) 25 | numpyfn='results-crossthread/' + pdfname+'-'+str(iterate)+'.npy' 26 | if not os.path.isfile(numpyfn): 27 | for set_id1 in range(nsets): 28 | for set_id2 in range(nsets): 29 | if level == 1: 30 | pagelist1=tlblib.generate_set_l1(set_id1, setsize) 31 | pagelist2=tlblib.generate_set_l1(set_id2, setsize) 32 | elif level == 2: 33 | pagelist1=tlblib.generate_set_l2_general(set_id1, setsize) 34 | pagelist2=tlblib.generate_set_l2_general(set_id2, setsize) 35 | print pagelist1,pagelist2 36 | else: 37 | raise Exception('no') 38 | pagelist_args1 = [argtype+str(x) for x in pagelist1] 39 | pagelist_args2 = [argtype+str(x) for x in pagelist2] 40 | cmdlist=["perf", "stat", "-x,", "-e", ",".join(events), "./c/tvlb-verbatim", "-r", str(iterate)] 41 | print 'popen 1:', cmdlist+pagelist_args1, 'cpu', c1 42 | print 'popen 2:', cmdlist+pagelist_args2, 'cpu', c2 43 | p1 = subprocess.Popen(cmdlist+["-c"+str(c1)]+pagelist_args1, stderr=subprocess.STDOUT, stdout=subprocess.PIPE) 44 | p2 = subprocess.Popen(cmdlist+["-c"+str(c2)]+pagelist_args2, stderr=subprocess.STDOUT, stdout=subprocess.PIPE) 45 | o1, stderrdata1 = p1.communicate() 46 | o2, stderrdata2 = p2.communicate() 47 | if p1.returncode != 0: 48 | print 'stdout:', o1 49 | print 'stderr:', stderrdata1 50 | raise Exception('popen 1 failed') 51 | if p2.returncode != 0: 52 | print 'stdout:', o2 53 | print 'stderr:', stderrdata2 54 | raise Exception('popen 2 failed') 55 | yset1 = perf.out_to_fields(o1) 56 | yset2 = perf.out_to_fields(o2) 57 | print 'yset1:', yset1[event],yset2[event],c1,c2,set_id1,set_id2 58 | misses[set_id2,set_id1] = ((float(yset1[event])+yset2[event])/iterate)/2.0 59 | #print misses 60 | #print '%7d' % (misses/iterate), 61 | print 'cores:', c1, c2 62 | print misses 63 | numpy.save(numpyfn, misses) 64 | return numpy.load(numpyfn) 65 | 66 | if __name__ == "__main__": 67 | """ 68 | Do measurements by invoking test_cores() for every desired measurement (TLB type and level). 69 | Plot the result using matplotlib. 70 | """ 71 | numpy.set_printoptions(threshold=numpy.nan, linewidth=numpy.nan, precision=1) 72 | l2size=128 73 | 74 | # test_cores('-D',3,l2size,'dtlb_load_misses.miss_causes_a_walk', cpu_uarch + '-stlb-sets.pdf', 2) 75 | l1dtlb = test_cores('-D',3,16,'dtlb_load_misses.stlb_hit', cpu_uarch + '-dtlb-sets.pdf', 1) 76 | l1itlb = test_cores('-C',3,16,'itlb_misses.stlb_hit', cpu_uarch + '-itlb-sets.pdf', 1) 77 | 78 | f, axarr = plt.subplots(nrows=1, ncols=2) 79 | f.tight_layout() 80 | matplotlib.rc('font', size=5) 81 | plt.subplot(1,2,1, adjustable='box', aspect=0.8) 82 | pcm = plt.pcolormesh(l1dtlb) 83 | plt.xticks([0,4,8,12,16]) 84 | plt.yticks([0,4,8,12,16]) 85 | plt.gca().invert_yaxis() 86 | plt.xlabel('TLB set') 87 | plt.ylabel('TLB set') 88 | plt.title('L1 dtlb') 89 | 90 | plt.subplot(1,2,2, adjustable='box', aspect=0.8) 91 | pcm = plt.pcolormesh(l1itlb) 92 | plt.gca().invert_yaxis() 93 | plt.xticks([0,4,8]) 94 | plt.yticks([0,4,8]) 95 | plt.xlabel('TLB set') 96 | plt.ylabel('TLB set') 97 | plt.title('L1 itlb') 98 | 99 | f.subplots_adjust(right=0.8) 100 | cbar_ax = f.add_axes([0.85, 0.38, 0.05, 0.25]) 101 | f.colorbar(pcm, cax=cbar_ax) 102 | 103 | plt.savefig('results-crossthread/' + cpu_uarch + '-tlbsets-ht.pdf', bbox_inches='tight') 104 | plt.savefig('results-crossthread/' + cpu_uarch + '-tlbsets-ht.png', bbox_inches='tight') 105 | plt.close() 106 | 107 | -------------------------------------------------------------------------------- /py/tlb-latency.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import multiprocessing 4 | import subprocess 5 | import tlblib 6 | from scipy import stats 7 | import numpy 8 | import os 9 | import cpuid 10 | import cpus 11 | import perf 12 | from sklearn.linear_model import LinearRegression 13 | import log 14 | 15 | cpu_uarch=cpuid.cpu_microarchitecture()[0] 16 | 17 | """ 18 | dtlb_load_misses.stlb_hit 19 | dtlb_load_misses.miss_causes_a_walk 20 | """ 21 | 22 | def linear_regression(xx,yy): 23 | nx,ny = numpy.asarray(xx), numpy.asarray(yy) 24 | nx,ny = nx.reshape(-1, 1), ny.reshape(-1, 1) 25 | reg = LinearRegression().fit(nx, ny) 26 | slope = reg.coef_[0][0] 27 | return slope 28 | 29 | def hit_miss_comparison(m1,m2,c1,c2,name): 30 | log.log('%s misses for miss/no-miss set: %.2f vs %.2f. Cycles per iteration: %.1f vs %.1f', name, m1, m2, c1, c2) 31 | if m2 >= 1: 32 | raise Exception('The no-miss rate is 1 or larger, this likely means the assumptions (eviction set) of this experiment are wrong.') 33 | if m2 >= 0.3: 34 | raise Exception('miss rate is quite large for the no-miss set! Possibly wrong set or noisy data.') 35 | if m1 < 1: 36 | raise Exception('miss rate is quite small for the no-miss set! Possibly wrong set or noisy data.') 37 | if m1 <= m2: 38 | raise Exception('More misses in the miss vs no-miss case makes no sense. Wrong eviction set or noisy data.') 39 | if c1 <= c2: 40 | raise Exception('More cycles in the miss vs no-miss case makes no sense. Wrong eviction set or noisy data.') 41 | extra_misses = m1-m2 42 | extra_cycles = c1-c2 43 | log.log('Extra misses: %.2f extra cycles: %.2f cycles per miss: %.2f', extra_misses, extra_cycles, extra_cycles/extra_misses) 44 | 45 | def test_latency_and_miss_per_iteration(pagelist,event,name): 46 | cpu = 1 # just a single process so doesn't matter, but we should pin it 47 | xx,yy,ycycles=[],[],[] 48 | log.log('calculating %s and cycles slope for %s', event, name) 49 | sys.stdout.flush() 50 | for iterations in range(0,20000000,5000000): 51 | pagelist_args = ['-D'+str(x) for x in pagelist] 52 | cmdlist=["perf", "stat", "-x,", "-e", event+",cycles", "./c/tvlb-verbatim", "-r", str(iterations), "-c"+str(cpu)]+pagelist_args 53 | sys.stdout.flush() 54 | p1 = subprocess.Popen(cmdlist, stderr=subprocess.STDOUT, stdout=subprocess.PIPE) 55 | o1, stderrdata1 = p1.communicate() 56 | if p1.returncode != 0: 57 | print 'stdout:', o1 58 | print 'stderr:', stderrdata1 59 | raise Exception('command failed') 60 | yset1 = perf.out_to_fields(o1) 61 | xx.append(iterations) 62 | yy.append(yset1[event]) 63 | ycycles.append(yset1['cycles']) 64 | 65 | event_per_iteration = linear_regression(xx,yy) 66 | cycles_per_iteration = linear_regression(xx,ycycles) 67 | 68 | log.log('done. misses: %.1f, cycles: %.1f', event_per_iteration,cycles_per_iteration) 69 | 70 | return event_per_iteration,cycles_per_iteration 71 | 72 | if __name__ == "__main__": 73 | log.log('') 74 | log.log(' * Probing set sizes') 75 | log.log('') 76 | 77 | for l1_size in range(1,10): 78 | l1_pagelist_misses=tlblib.generate_set_l1(1,l1_size+1) 79 | l1misses1,l1cycles1 = test_latency_and_miss_per_iteration(l1_pagelist_misses, 'dtlb_load_misses.stlb_hit', 'l1 miss test size ' + str(l1_size)) 80 | if l1misses1 > 1: 81 | log.log('Found') 82 | log.log('') 83 | break 84 | for l2_size in range(1,16): 85 | l2_pagelist_misses=tlblib.generate_set_l2_general(1,l2_size+1) 86 | l2misses1,l2cycles1 = test_latency_and_miss_per_iteration(l2_pagelist_misses, 'dtlb_load_misses.miss_causes_a_walk', 'l2 miss test size ' + str(l2_size)) 87 | if l2misses1 > 1: 88 | log.log('Found') 89 | log.log('') 90 | break 91 | log.log('L1 size: %d L2 size: %d', l1_size, l2_size) 92 | 93 | l1_setsize=l1_size 94 | l2_setsize=l2_size 95 | 96 | log.log('Assuming L1dTLB set size: %d', l1_setsize) 97 | log.log('Assuming L2dTLB/STLB set size: %d', l2_setsize) 98 | 99 | l1_pagelist_misses=tlblib.generate_set_l1(1,l1_setsize+1) 100 | l1_pagelist_no_misses=list(l1_pagelist_misses) 101 | l1_pagelist_no_misses[-1] += 1 102 | l1_pagelist_no_misses[-2] += 1 103 | 104 | l2_pagelist_misses=tlblib.generate_set_l2_general(1,l2_setsize+1) 105 | l2_pagelist_no_misses=list(l2_pagelist_misses) 106 | l2_pagelist_no_misses[-1] += 1 107 | l2_pagelist_no_misses[-2] += 1 108 | 109 | log.log('') 110 | log.log(' * Configuring miss and no-miss TLB sets of equal size') 111 | log.log('') 112 | 113 | log.log('l1 misses: %s', l1_pagelist_misses) 114 | log.log('l1 no misses: %s', l1_pagelist_no_misses) 115 | log.log('l2 misses: %s', l2_pagelist_misses) 116 | log.log('l2 no misses: %s', l2_pagelist_no_misses) 117 | 118 | log.log('') 119 | log.log(' * Probing miss rate and latency differences of miss and no-miss sets of equal size') 120 | log.log('') 121 | 122 | l1misses1,l1cycles1 = test_latency_and_miss_per_iteration(l1_pagelist_misses, 'dtlb_load_misses.stlb_hit', 'l1 miss list') 123 | l1misses2,l1cycles2 = test_latency_and_miss_per_iteration(l1_pagelist_no_misses, 'dtlb_load_misses.stlb_hit', 'l1 no-miss list') 124 | l2misses1,l2cycles1 = test_latency_and_miss_per_iteration(l2_pagelist_misses, 'dtlb_load_misses.miss_causes_a_walk', 'l2 miss list') 125 | l2misses2,l2cycles2 = test_latency_and_miss_per_iteration(l2_pagelist_no_misses, 'dtlb_load_misses.miss_causes_a_walk', 'l2 no-miss list') 126 | 127 | log.log('') 128 | log.log(' * Results') 129 | log.log('') 130 | 131 | hit_miss_comparison(l1misses1,l1misses2,l1cycles1,l1cycles2, 'L1TLB') 132 | hit_miss_comparison(l2misses1,l2misses2,l2cycles1,l2cycles2, 'L2TLB') 133 | --------------------------------------------------------------------------------