├── .gitignore ├── LICENSE ├── Makefile ├── README.md └── collide.c /.gitignore: -------------------------------------------------------------------------------- 1 | collide 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Ryan Hitchman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: collide 2 | 3 | CFLAGS := -Os -g --std=gnu99 -Wall -D_GNU_SOURCE 4 | 5 | clean: 6 | rm -f collide 7 | 8 | check: collide 9 | # only two rdpmc insns, and 4 cpuid insns 10 | objdump -d collide | grep -c rdpmc | grep -q "^2$$" 11 | objdump -d collide | grep -c cpuid | grep -q "^4$$" 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | An attempt to reverse-engineer branch prediction on modern x86 processors (Haswell in particular). 2 | 3 | Inspired by [Matt Godbolt's](http://xania.org/201602/haswell-and-ivy-btb) investigations. 4 | 5 | It generates a function involving a long sequence of unconditional jumps. Theoretically, enough 6 | jumps will eventually exceed the Branch Prediction Unit's capacity, causing BPU mispredicts. 7 | Determining the sets of addresses that together force a mispredict should give information 8 | on how they are stored. 9 | 10 | In the current preliminary state, it finds that these addresses collide: 11 | 12 | 0x10003f377 13 | 0x1009bf376 14 | 0x1008cb379 15 | 0x100eef36d 16 | 0x10071736e 17 | 18 | Which seems very plausible! More work is necessary to understand the precise function. 
// collide.c -- probe the CPU's Branch Prediction Unit by executing chains of
// unconditional jumps and counting BACLEARS (front-end re-steer) events.
//
// NOTE(review): the header names in the original #include directives were
// lost in extraction (the dump shows bare "#include" lines).  This list is
// reconstructed from the identifiers the file actually uses -- verify
// against the upstream source.
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <err.h>
#include <getopt.h>

#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>

const int INSN_RET = 0xC3; // 1 byte
const int INSN_JMP = 0xE9; // 5 bytes: opcode + 4B displacement

//////////////////////////////////////////////
// xorshift128+ by Sebastiano Vigna
// from http://xorshift.di.unimi.it/xorshift128plus.c
uint64_t s[2] = {0, 1};

// Return the next pseudo-random 64-bit value from the xorshift128+ state s[].
uint64_t
xrand(void) {
    uint64_t s1 = s[0];
    const uint64_t s0 = s[1];
    s[0] = s0;
    s1 ^= s1 << 23; // a
    s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
    return s[1] + s0;
}

// Seed s[] from x via the splitmix64 generator, as the xorshift128+ author
// recommends (avoids correlated / low-entropy initial states).
void
xsrand(uint64_t x) {
    // splitmix64 generator -- http://xorshift.di.unimi.it/splitmix64.c
    for (int i = 0; i <= 1; i++) {
        uint64_t z = (x += UINT64_C(0x9E3779B97F4A7C15));
        z = (z ^ (z >> 30)) * UINT64_C(0xBF58476D1CE4E5B9);
        z = (z ^ (z >> 27)) * UINT64_C(0x94D049BB133111EB);
        s[i] = z ^ (z >> 31);
    }
}
/////////////////////////////////////

// Pin the calling thread to `cpu` so every measurement exercises one core's
// branch predictor.  Requires _GNU_SOURCE (see Makefile CFLAGS).
void
bind_to_cpu(int cpu)
{
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(cpu, &set);
    if (sched_setaffinity(0, sizeof(set), &set) < 0)
        err(EXIT_FAILURE, "Unable to set CPU affinity");
}

// Thin wrapper around the perf_event_open(2) syscall (glibc provides none).
long
perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
                int cpu, int group_fd, unsigned long flags)
{
    int ret;
    ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
                  group_fd, flags);
    return ret;
}

// Open a raw hardware counter for `event` on the calling thread,
// pinned so it is always scheduled on the PMU.  Exits on failure.
int
open_perf_counter(int event)
{
    int fd;
    struct perf_event_attr pe = { 0 };
    pe.size = sizeof(struct perf_event_attr);
    pe.type = PERF_TYPE_RAW;
    pe.config = event;
    pe.pinned = 1; // fail fast rather than multiplex with other counters
    fd = perf_event_open(&pe, 0, -1, -1, 0);
    if (fd == -1)
        err(EXIT_FAILURE, "Error opening leader %llx", pe.config);
    return fd;
}

// Map from Intel CPU model numbers (family 6) to the raw event that counts
// BPU re-steers on that microarchitecture.  models[] is zero-terminated.
struct cpu_model {
    char *name;
    int event;
    int models[5];
} cpu_models[] = {
    // These event numbers are from
    // Intel 64 and IA-32 Architectures Software Developer's Manual, Vol 3B
    // Descriptions:
    // "Number of front end re-steers due to BPU misprediction."
    // Sandy Bridge: "Counts the number of times the front end is resteered,
    // mainly when the BPU cannot provide a correct prediction and this is
    // corrected by other branch handling mechanisms at the front end."
    // Nehalem: "... This can occur if the code has many branches such that they
    // cannot be consumed by the BPU. Each BACLEAR asserted by the BAC
    // generates approximately an 8 cycle bubble in the instruction fetch
    // pipeline."
    // (NOTE: libpfm4 could supply these values as well)
    // BACLEARS.ANY:
    {"Skylake", 0x01E6, {0x4E, 0x5E}},
    {"Broadwell", 0x1FE6, {0x3D, 0x47, 0x56}}, // Undocumented event
    {"Haswell", 0x1FE6, {0x3C, 0x45, 0x46, 0x3F}},
    {"Ivy Bridge", 0x1FE6, {0x3A, 0x3E}},
    {"Sandy Bridge", 0x01E6, {0x2A, 0x2D}},
    // BACLEAR.CLEAR:
    {"Westmere", 0x01E6, {0x25, 0x2C, 0x2F}},
    {"Nehalem", 0x01E6, {0x1A, 0x1E, 0x1F, 0x2E}},
    {"Core 2", 0x00E6, {0x17, 0x1D}}, // BACLEARS
    { 0 } // terminator ({} is not valid before C23)
};

// Parse /proc/cpuinfo, verify the CPU is Intel family 6, and return the raw
// BACLEARS event number for its model.  Exits on any unsupported CPU.
int
determine_perf_event(void)
{
    FILE *cpuinfo = fopen("/proc/cpuinfo", "r");
    if (!cpuinfo)
        err(EXIT_FAILURE, "unable to open cpuinfo");
    int family = 0, model = 0;
    // fgets instead of getline: drops the _GNU_SOURCE-only call; every key
    // we care about fits easily in this buffer.
    char line[256];
    while (fgets(line, sizeof line, cpuinfo)) {
        char *saveptr = NULL;
        char *key = strtok_r(line, "\t:", &saveptr);
        char *value = strtok_r(NULL, "\t: ", &saveptr);
        if (key == NULL || value == NULL)
            break; // first blank line: done with the first processor block
        if (!strcmp("vendor_id", key)) {
            // BUG FIX: the original compared `key` (always "vendor_id")
            // against "GenuineIntel\n" with the condition inverted, so the
            // vendor check never rejected anything.  Compare the value and
            // bail when it is NOT Intel.
            if (strcmp(value, "GenuineIntel\n") != 0)
                errx(EXIT_FAILURE, "only works for Intel");
        } else if (!strcmp("cpu family", key)) {
            family = atoi(value);
        } else if (!strcmp("model", key)) {
            model = atoi(value);
        }
    }
    fclose(cpuinfo);
    if (family != 6)
        errx(EXIT_FAILURE, "unknown cpu family %d (expected 6)", family);
    for (int i = 0; cpu_models[i].name; i++) {
        for (int *cpu_model = cpu_models[i].models; *cpu_model; cpu_model++) {
            if (*cpu_model == model) {
                int event = cpu_models[i].event;
                printf("# CPU: %s (%02X_%02XH => event %04X)\n",
                       cpu_models[i].name, family, model, event);
                return event;
            }
        }
    }
    errx(EXIT_FAILURE, "unknown CPU model %d", model);
}

// Read performance counter `ctr` directly in user space (no syscall).
static uint64_t
rdpmc(uint32_t ctr)
{
    uint32_t low, high;
    __asm__ volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (ctr));
    return (uint64_t)low | (((uint64_t)high) << 32);
}

// CPUID is a serializing instruction: nothing before it may retire after it,
// nothing after it may execute before it.  Used to fence rdpmc measurements.
static void
serialize(void) {
    __asm__ __volatile__("xor %%eax, %%eax\n\tcpuid" : : : "rax", "rbx", "rcx", "rdx");
}

typedef void (*voidfunc)(void);

// Count the events counter `counter` records during one call of func(),
// with warm-up calls first and cpuid fences around each rdpmc.
long
count_perf(voidfunc func, int counter)
{
    // warm up
    func(); func(); func(); func(); func();
    func(); func(); func(); func(); func();

    serialize(); // prevent instructions prior to here from crossing the rdpmc
    uint64_t before = rdpmc(counter);
    serialize(); // prevent instructions after the rdpmc from jumping ahead
    func();
    serialize(); // ensure all the func() work is done
    uint64_t after = rdpmc(counter);
    serialize(); // prevent any instructions jumping ahead of the rdpmc

    return after - before;
}

// Return the minimum of repeated runs of count_perf(func),
// or the first result at or below thresh.
189 | long 190 | count_perf_min_below(voidfunc func, int iters, int thresh, int counter) 191 | { 192 | long min_count = LONG_MAX; 193 | for (int i = 0; i < iters; i++) { 194 | long count = count_perf(func, counter); 195 | if (count < min_count) 196 | min_count = count; 197 | if (count <= thresh) 198 | return count; // early exit 199 | } 200 | return min_count; 201 | } 202 | 203 | // Return the minimum of repeated runs of count_perf(func) 204 | long 205 | count_perf_min(voidfunc func, int iters, int counter) 206 | { 207 | return count_perf_min_below(func, iters, 0, counter); 208 | } 209 | 210 | void 211 | write_jump(uint8_t *buf, uint64_t addr, uint64_t target) 212 | { 213 | int64_t offset = target - addr - 5; 214 | assert(INT32_MIN <= offset && offset <= INT32_MAX); 215 | assert(offset <= -10 || 0 <= offset); 216 | buf[addr] = INSN_JMP; 217 | buf[addr+1] = offset & 0xFF; 218 | buf[addr+2] = (offset >> 8) & 0xFF; 219 | buf[addr+3] = (offset >> 16) & 0xFF; 220 | buf[addr+4] = (offset >> 24) & 0xFF; 221 | } 222 | 223 | // returns true if putting a jump instruction 224 | // at addr would squash another jump instruction 225 | int 226 | already_used(uint8_t *buf, int addr) 227 | { 228 | for (int i = -4; i <= 4; i++) { 229 | if (addr + i < 0) 230 | continue; 231 | if (buf[addr + i] == INSN_JMP) 232 | return 1; 233 | } 234 | return 0; 235 | } 236 | 237 | void 238 | usage(char **argv) 239 | { 240 | errx(2, "usage: %s [-b BITS] [-s SEED] [-j JUMPS] [-r RUNS] [-m MASK_HEX]" 241 | " [-c CPU]", 242 | argv[0]); 243 | } 244 | 245 | int 246 | main(int argc, char **argv) 247 | { 248 | int opt; 249 | int cpu = 0, nbits = 31, jumps = 0, runs = 500; 250 | uint64_t seed = 0; 251 | 252 | // specify a set of bits that will be zero in each jump 253 | uint32_t clear_mask = 0; 254 | 255 | while ((opt = getopt(argc, argv, "hs:j:b:r:m:c:")) != -1) { 256 | errno = 0; 257 | char *endptr = NULL; 258 | switch (opt) { 259 | case 's': seed = strtoll(optarg, &endptr, 10); break; 260 | case 'b': 
nbits = strtol(optarg, &endptr, 10); break; 261 | case 'c': runs = strtol(optarg, &endptr, 10); break; 262 | case 'j': jumps = strtol(optarg, &endptr, 10); break; 263 | case 'r': runs = strtol(optarg, &endptr, 10); break; 264 | case 'm': clear_mask = strtol(optarg, &endptr, 16); break; 265 | case 'h': 266 | default: usage(argv); 267 | } 268 | if (endptr == optarg || *endptr != '\0') 269 | err(EXIT_FAILURE, "bad number '%s'", optarg); 270 | if (errno) 271 | err(EXIT_FAILURE, "error parsing '%s'", optarg); 272 | } 273 | if (optind != argc) 274 | usage(argv); 275 | 276 | 277 | const uint64_t BUF_SIZE = 1ULL << nbits; 278 | 279 | // pessimistic lower bound 280 | int max_jumps = (1ULL << (nbits - __builtin_popcount(clear_mask))) / 9; 281 | if (max_jumps > 100000) 282 | max_jumps = 100000; 283 | 284 | #define CHECK_RANGE(var, name, lo, hi) \ 285 | if (var < lo || var > hi) \ 286 | errx(EXIT_FAILURE, name " must be in range [%d, %d]", lo, hi); 287 | 288 | CHECK_RANGE(nbits, "BITS", 8, 31); 289 | CHECK_RANGE(jumps, "JUMPS", 0, max_jumps); 290 | CHECK_RANGE(runs, "RUNS", 1, 1000000); 291 | 292 | if (jumps != 0) { 293 | max_jumps = jumps; 294 | } 295 | 296 | bind_to_cpu(cpu); 297 | int fd = open_perf_counter(determine_perf_event()); 298 | 299 | struct perf_event_mmap_page *event_buf = (struct perf_event_mmap_page*)mmap( 300 | NULL, getpagesize(), PROT_READ, MAP_SHARED, fd, 0); 301 | if (event_buf == MAP_FAILED) 302 | err(EXIT_FAILURE, "unable to mmap event_buf"); 303 | int counter = event_buf->index - 1; 304 | 305 | // Create a function from a series of unconditional jumps 306 | 307 | uint8_t *buf = mmap((void*)0x100000000LL, 308 | BUF_SIZE, 309 | PROT_READ | PROT_WRITE | PROT_EXEC, 310 | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, 311 | -1, 0); 312 | if (buf == MAP_FAILED) 313 | err(EXIT_FAILURE, "unable to mmap"); 314 | 315 | xsrand(seed); 316 | int last = xrand() % (BUF_SIZE - 5) & ~clear_mask; 317 | 318 | int jump_addrs[max_jumps]; 319 | jump_addrs[0] = last; 320 | 
buf[last] = INSN_RET; 321 | voidfunc func = (voidfunc)buf + jump_addrs[0]; 322 | 323 | for (int i = 1; i < max_jumps; i++) { 324 | int target; 325 | do { 326 | target = (xrand() % (BUF_SIZE - 5)) & ~clear_mask; 327 | } while (already_used(buf, target) || abs(target - last) < 5); 328 | write_jump(buf, last, target); 329 | buf[target] = INSN_RET; 330 | jump_addrs[i] = target; 331 | last = target; 332 | if (jumps == 0 && count_perf_min(func, runs, counter) > 0) { 333 | jumps = i; 334 | break; 335 | } 336 | } 337 | 338 | printf("# -j%d -b%d -s%ld", jumps, nbits, seed); 339 | if (clear_mask) 340 | printf(" -m%04x", clear_mask); 341 | printf("\n"); 342 | 343 | long clears = count_perf_min(func, runs * 10, counter); 344 | printf("BACLEARS: %ld\n", clears); 345 | 346 | if (clears == 0) { 347 | printf("Bailing: no event on every iteration\n"); 348 | return 0; 349 | } 350 | 351 | int mask = 0; 352 | int expected = 0; 353 | 354 | // Try to find which jumps are causing mispredicts. 355 | printf("N addr clears\n"); 356 | for (int i = 0; i < jumps; i++) { 357 | // skip this jump 358 | if (i == 0 && jumps > 1) { 359 | func += jump_addrs[1] - jump_addrs[0]; 360 | } else if (i == jumps - 1) { 361 | buf[jump_addrs[i - 1]] = INSN_RET; 362 | } else { 363 | write_jump(buf, jump_addrs[i - 1], jump_addrs[i + 1]); 364 | } 365 | long modified_clears = count_perf_min(func, runs, counter); 366 | if (modified_clears != clears) { 367 | uintptr_t addr = (uintptr_t)buf + jump_addrs[i]; 368 | printf("%03d %8lx %ld\n", i + 1, addr, modified_clears); 369 | if (mask == 0) { 370 | mask = BUF_SIZE - 1; 371 | expected = addr; 372 | } else { 373 | mask ^= (mask & addr) ^ expected; 374 | expected &= mask; 375 | } 376 | } 377 | // undo 378 | if (i == 0 && jumps > 1) { 379 | func -= jump_addrs[1] - jump_addrs[0]; 380 | } else { 381 | write_jump(buf, jump_addrs[i - 1], jump_addrs[i]); 382 | } 383 | } 384 | printf("mask: %08x\n", mask); 385 | 386 | munmap(event_buf, getpagesize()); 387 | close(fd); 388 | } 
389 | --------------------------------------------------------------------------------