├── KNOWN-ISSUES
├── MANUAL
├── Makefile
├── cycles.h
├── simple-pmu.c
└── tcyc.c

/KNOWN-ISSUES:
--------------------------------------------------------------------------------
Breaks suspend/resume on recent kernels. Sorry for now.

Needs updates for newer kernels.

--------------------------------------------------------------------------------
/MANUAL:
--------------------------------------------------------------------------------
simple-pmu
----------

This is a simple Intel PMU driver that allows the fixed counters to be
read from user space.

It is useful for measuring cycles without worrying about Turbo mode and
p-states. Essentially this is a replacement for RDTSC for performance
analysis, with lower overhead than classic RDTSC.

It can only be used by writing code.

It doesn't attempt to compete with any of the big PMU boys like
perf or oprofile; it just aims to be an RDTSC replacement.

This currently works only on modern Intel CPUs (anything with
"arch_perfmon" in /proc/cpuinfo).

Usage
-----

Install the kernel module:

	make
	(or make KDIR=/your/kernel/source or objdir)
	su
	make modules_install

Install the header files:

	make install

Application usage
-----------------

Load the kernel module:

	modprobe simple-pmu

Measuring cycles:

Copy cycles.h to somewhere your application can find it with -I.

	#define _GNU_SOURCE 1
	...
	#include "cycles.h"

	...

	counter_t a, b;

	pin_cpu(NULL);
	if (perfmon_available() == 0) ... exit ...

	sync_core();
	a = unhalted_core();

	b = unhalted_core();
	sync_core();
	printf("%llu cycles\n", b - a);

The sync_core()s are needed to prevent an out-of-order CPU from
moving instructions outside the measurement window.
pin_cpu() pins the thread to the current CPU, because CPU changes
could cause mismeasurements. You can call unpin_cpu() afterwards
to unpin. For this you need to pass a cpu_set_t to pin_cpu() and
pass the same cpu_set_t to unpin_cpu() afterwards (see the sketch
at the end of the Notes section below).

Measuring retired instructions:

	#include "cycles.h"

	counter_t a, b;

	pin_cpu(NULL);
	if (perfmon_available() == 0) ... exit ...

	sync_core();
	a = insn_retired();

	b = insn_retired();
	sync_core();
	printf("%llu instructions\n", b - a);

unhalted_ref() is also available. Since Nehalem it does nothing.
Before Nehalem it counted front-side bus cycles.

rdtsc() and rdtscp() (RDTSC with synchronization) are also available;
however, for those all the caveats described at the beginning apply.

Kernel interface
----------------

The export of the various counters can be controlled through
/sys/devices/system/simple-pmu/simple-pmu0/

ring
	The ring the counters are exported to. Defaults to 3.

rdpmc_fixed
	If non-zero, the fixed counters are started.

Notes
-----
tcyc.c is a simple demo program showing (nearly) all the cycles.h
features. You can build it with "make tcyc".

#define _GNU_SOURCE 1 must be defined at the beginning of the
source files that include cycles.h.

There is currently no allocation for the fixed counters, which
are also used by perf and other profilers. Don't use those in parallel.
Interaction with oprofile is fine.
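
The cpu_set_t variant of pin_cpu()/unpin_cpu() described under
"Measuring cycles" can be used roughly as in the following minimal
sketch (illustrative only; error handling is left to the application):

	#define _GNU_SOURCE 1
	#include "cycles.h"

	cpu_set_t oldcpus;

	if (pin_cpu(&oldcpus) < 0)
		... handle error ...

	/* measure with sync_core()/unhalted_core() etc. as shown above */

	unpin_cpu(&oldcpus);	/* restores the original affinity mask */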

Author
------

simple-pmu was written by Andi Kleen.

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
KDIR = /lib/modules/`uname -r`/build
obj-m := simple-pmu.o
M := make -C ${KDIR} M=`pwd`

all:
	${M} modules

install:
	${M} modules_install

clean:
	${M} clean
	rm -f tcyc

tcyc: tcyc.c cycles.h
	gcc -o tcyc tcyc.c -Wall -O2

--------------------------------------------------------------------------------
/cycles.h:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2011 Intel Corporation
 * Author: Andi Kleen
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that: (1) source code distributions
 * retain the above copyright notice and this paragraph in its entirety, (2)
 * distributions including binary code include the above copyright notice and
 * this paragraph in its entirety in the documentation or other materials
 * provided with the distribution
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Library for efficient self monitoring on modern Intel CPUs using the
 * simple-pmu driver.
 */
#ifndef CYCLES_H
#define CYCLES_H 1

#if defined(_SCHED_H) && !defined(__USE_GNU)
#error "Add #define _GNU_SOURCE 1 at beginning of source file"
#endif

#define _GNU_SOURCE 1
#include <sched.h>
#include <unistd.h>

#define force_inline __attribute__((always_inline))

typedef unsigned long long counter_t;

static inline void p_cpuid(unsigned in,
			   unsigned *a, unsigned *b, unsigned *c, unsigned *d)
{
	asm("cpuid" : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
	    : "0" (in));
}

static inline unsigned p_cpuid_a(unsigned in)
{
	unsigned a, b, c, d;
	p_cpuid(in, &a, &b, &c, &d);
	return a;
}

/* Keep the CPU from moving instructions across the measurement boundary */
static inline force_inline void sync_core(void)
{
	asm volatile("lfence" ::: "memory");
}

/* Read performance counter 'in' with RDPMC */
static inline force_inline unsigned long long p_rdpmc(unsigned in)
{
	unsigned d, a;

	asm volatile("rdpmc" : "=d" (d), "=a" (a) : "c" (in) : "memory");
	return ((unsigned long long)d << 32) | a;
}

/* Return non-zero when the simple-pmu driver is loaded and the CPU
   supports architectural perfmon */
static inline int perfmon_available(void)
{
	unsigned eax;

	if (access("/sys/devices/system/simple-pmu/simple-pmu0", R_OK) < 0)
		return 0;
	if (p_cpuid_a(0) < 10)
		return 0;
	eax = p_cpuid_a(10);
	if ((eax & 0xff) == 0)
		return 0;
	return (eax >> 8) & 0xff;
}

enum {
	FIXED_SELECT = (1U << 30),
	FIXED_INST_RETIRED_ANY = 0,
	FIXED_CPU_CLK_UNHALTED_CORE = 1,
	FIXED_CPU_CLK_UNHALTED_REF = 2,
};

static inline force_inline counter_t unhalted_core(void)
{
	return p_rdpmc(FIXED_SELECT|FIXED_CPU_CLK_UNHALTED_CORE);
}

static inline force_inline counter_t unhalted_ref(void)
{
	return p_rdpmc(FIXED_SELECT|FIXED_CPU_CLK_UNHALTED_REF);
}

static inline force_inline counter_t insn_retired(void)
{
	return p_rdpmc(FIXED_SELECT|FIXED_INST_RETIRED_ANY);
}

/* Lots of caveats apply when you use these */

static inline force_inline unsigned long long rdtsc(void)
{
#ifdef __i386__
	unsigned long long s;
	asm volatile("rdtsc" : "=A" (s) :: "memory");
	return s;
#else
	unsigned low, high;
	asm volatile("rdtsc" : "=a" (low), "=d" (high) :: "memory");
	return ((unsigned long long)high << 32) | low;
#endif
}

static inline force_inline unsigned long long rdtscp(void)
{
#ifdef __i386__
	unsigned long long s;
	asm volatile("rdtscp" : "=A" (s) :: "ecx", "memory");
	return s;
#else
	unsigned low, high;
	asm volatile("rdtscp" : "=a" (low), "=d" (high) :: "ecx", "memory");
	return ((unsigned long long)high << 32) | low;
#endif
}

/* Pin the current thread to the CPU it is running on.
   If oldcpus is non-NULL the previous affinity mask is saved there. */
static inline int pin_cpu(cpu_set_t *oldcpus)
{
	int cpu = sched_getcpu();
	cpu_set_t cpus;
	CPU_ZERO(&cpus);
	CPU_SET(cpu, &cpus);
	if (oldcpus)
		sched_getaffinity(0, sizeof(cpu_set_t), oldcpus);
	return sched_setaffinity(0, sizeof(cpu_set_t), &cpus);
}

/* Restore the affinity mask saved by pin_cpu() */
static inline void unpin_cpu(cpu_set_t *oldcpus)
{
	sched_setaffinity(0, sizeof(cpu_set_t), oldcpus);
}

#endif

--------------------------------------------------------------------------------
/simple-pmu.c:
--------------------------------------------------------------------------------
/*
 * Copyright (C) 2010, 2011 Intel Corporation
 * Author: Andi Kleen
 *
 * This software may be redistributed and/or modified under the terms of
 * the GNU General Public License ("GPL") version 2 only as published by the
 * Free Software Foundation.
 *
 * Simple PMU driver for no overhead self-monitoring.
 * Enable fixed counters on Intel CPUs and let them be read by RDPMC in ring 3.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/version.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>
#include <linux/mutex.h>
#include <linux/pm.h>
#include <linux/bitops.h>
#include <asm/msr.h>
#include <asm/nmi.h>
#include <asm/processor.h>

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
#define SMP_CALL_ARG 1, 1
#else
#define SMP_CALL_ARG 1
#endif

static unsigned long counter_mask;
static int num_counter;

static int ring = 3;
static int rdpmc_fixed = 1;

enum rflags {
	R_UNINIT = 1 << 0,
	R_RESERVE = 1 << 1,
};

static const struct fixed_ctr {
	int cpuid;
	unsigned msr;
} fixed_ctr[] = {
	{ 1, MSR_CORE_PERF_FIXED_CTR0 }, /* INST RETIRED.ANY */
	{ 0, MSR_CORE_PERF_FIXED_CTR1 }, /* CLK_UNHALTED_CORE */
	{ 2, MSR_CORE_PERF_FIXED_CTR2 }, /* CLK_UNHALTED_REF */
};

/* Enable or disable per CPU PMU state */
static void simple_pmu_cpu_init(void *arg)
{
	int enable = arg != NULL;
	int i;
	u64 gc, fc;
	u32 cr4;
	unsigned r = (ring & 0x3);
	int err;

	printk("simple pmu cpu init cpu %d %d\n", smp_processor_id(), enable);

	err = rdmsrl_safe(MSR_CORE_PERF_FIXED_CTR_CTRL, &fc);
	err |= rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &gc);
	for (i = 0; i < num_counter; i++)
		if (test_bit(i, &counter_mask)) {
			fc &= ~(0xfUL << (4*i));
			if (enable) {
				fc |= r << (4*i);
				gc |= 1UL << (32 + i);
			} else {
				gc &= ~(1UL << (32 + i));
			}
			wrmsr_safe(fixed_ctr[i].msr, 0, 0);
		}
	err |= wrmsr_safe(MSR_CORE_PERF_FIXED_CTR_CTRL, (u32)fc, fc >> 32);
	err |= wrmsr_safe(MSR_CORE_PERF_GLOBAL_CTRL, (u32)gc, gc >> 32);

	if (err) {
		pr_err("CPU %d: simple PMU msr access failed\n",
		       smp_processor_id());
		return;
	}

	/* Allow (or forbid) user space RDPMC on this CPU */
	cr4 = read_cr4();
	if (enable)
		cr4 |= X86_CR4_PCE;
	else
		cr4 &= ~X86_CR4_PCE;
	write_cr4(cr4);
}

static int
simple_pmu_cpuhandler(struct notifier_block *nb, unsigned long action, void *v)
{
	unsigned long cpu = (unsigned long)v;
	void *enable;

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		enable = (void *)1L;
		break;
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_DOWN_PREPARE:
		enable = NULL;
		break;
	default:
		return NOTIFY_OK;
	}
	smp_call_function_single(cpu, simple_pmu_cpu_init,
				 enable, SMP_CALL_ARG);
	return NOTIFY_DONE;
}

static struct notifier_block cpu_notifier = {
	.notifier_call = simple_pmu_cpuhandler,
};

/* Layout of CPUID leaf 0xa EAX */
struct a_ebx {
	unsigned version : 8;
	unsigned num_counter : 8;
	unsigned width : 8;
	unsigned mask_bitlength : 8;
};

/* Query CPUID leaf 0xa for the available fixed counters */
static void query_cpu(void)
{
	union {
		u32 val;
		struct a_ebx f;
	} eax;
	u32 ebx, edx, tmp;
	int i;
	u32 mask;

	cpuid(0xa, &eax.val, &ebx, &tmp, &edx);

	num_counter = min_t(unsigned, ARRAY_SIZE(fixed_ctr), edx & 0xf);
	mask = (~ebx) & ((1UL << eax.f.mask_bitlength)-1);

	counter_mask = 0;
	for (i = 0; i < num_counter; i++)
		if ((1U << fixed_ctr[i].cpuid) & mask)
			__set_bit(i, &counter_mask);
}

static void reserve_counters(void)
{
	int i;
	int other;

	query_cpu();

	other = 0;
	for (i = 0; i < num_counter; i++)
		if (test_bit(i, &counter_mask)) {
			if (!reserve_perfctr_nmi(fixed_ctr[i].msr)) {
				__clear_bit(i, &counter_mask);
				other++;
			}
		}

	pr_info("Simple-PMU: %d fixed counters used, CPU has %d total\n",
		num_counter - other, num_counter);
}

static void unreserve_counters(void)
{
	int i;

	for (i = 0; i < num_counter; i++)
		if (test_bit(i, &counter_mask))
			release_perfctr_nmi(fixed_ctr[i].msr);
}

static void restart(enum rflags rflags)
{
	static DEFINE_MUTEX(restart_lock);
	static int prev;
	int enable;

	mutex_lock(&restart_lock);
	enable = rdpmc_fixed;
	if ((rflags & R_UNINIT) && ((prev && enable) || !enable)) {
		on_each_cpu(simple_pmu_cpu_init, NULL, SMP_CALL_ARG);
		if (rflags & R_RESERVE)
			unreserve_counters();
	}
	if (enable) {
		if (rflags & R_RESERVE)
			reserve_counters();
		on_each_cpu(simple_pmu_cpu_init, (void *)1L, SMP_CALL_ARG);
	}
	prev = enable;
	mutex_unlock(&restart_lock);
}

static int old_state;

static int simple_pmu_suspend(struct sys_device *dev, pm_message_t state)
{
	printk("simple_pmu_suspend\n");
	old_state = rdpmc_fixed;
	rdpmc_fixed = 0;
	restart(R_UNINIT);
	return 0;
}

static int simple_pmu_resume(struct sys_device *dev)
{
	printk("simple_pmu_resume\n");
	rdpmc_fixed = old_state;
	restart(0);
	return 0;
}

struct spmu_attr {
	struct sysdev_attribute attr;
	int *var;
};

static ssize_t
spmu_attr_store(struct sys_device *c, struct sysdev_attribute *a,
		const char *buf, size_t size)
{
	struct spmu_attr *sa = container_of(a, struct spmu_attr, attr);
	char *end;
	long new = simple_strtol(buf, &end, 0);
	if (end == buf || new > INT_MAX || new < INT_MIN)
		return -EINVAL;
	*(int *)(sa->var) = new;
	restart(R_RESERVE|R_UNINIT);
	return size;
}

static ssize_t
spmu_attr_show(struct sys_device *c, struct sysdev_attribute *a, char *buf)
{
	struct spmu_attr *sa = container_of(a, struct spmu_attr, attr);
	return snprintf(buf, PAGE_SIZE, "%d", *(sa->var));
}

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,34)
#define SPMU_ATTR(name)							\
static struct spmu_attr name##_attr;					\
static ssize_t name##_store(struct sys_device *c, const char *buf,	\
			    size_t size)				\
{									\
	return spmu_attr_store(c, &name##_attr.attr, buf, size);	\
}									\
									\
static ssize_t name##_show(struct sys_device *c, char *buf)		\
{									\
	return spmu_attr_show(c, &name##_attr.attr, buf);		\
}									\
static struct spmu_attr name##_attr = {					\
	_SYSDEV_ATTR(name, 0644, name##_show, name##_store),		\
	&name								\
};
#else
#define SPMU_ATTR(name)							\
static struct spmu_attr name##_attr = {					\
	_SYSDEV_ATTR(name, 0644, spmu_attr_show, spmu_attr_store),	\
	&name,								\
}
#endif

SPMU_ATTR(rdpmc_fixed);
SPMU_ATTR(ring);

static struct sysdev_attribute *spmu_attr[] = {
	&ring_attr.attr,
	&rdpmc_fixed_attr.attr,
	NULL
};

static struct sysdev_class spmu_sysdev_class = {
	.name = "simple-pmu",
};

static struct sysdev_driver spmu_sysdev_driver = {
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
	/* FIXME: need a replacement. */
	.suspend = simple_pmu_suspend,
	.resume = simple_pmu_resume
#endif
};

static struct sys_device spmu_sysdev = {
	.cls = &spmu_sysdev_class,
};

static int simple_pmu_init(void)
{
	int err;
	int i;

	if (!boot_cpu_has(X86_FEATURE_ARCH_PERFMON) || cpuid_eax(0) < 0xa)
		return -ENODEV;

	err = sysdev_class_register(&spmu_sysdev_class);
	if (err)
		return err;

	err = sysdev_register(&spmu_sysdev);
	if (err)
		goto err_class;

	err = sysdev_driver_register(&spmu_sysdev_class, &spmu_sysdev_driver);
	if (err)
		goto err_sysdev;

	for (i = 0; spmu_attr[i] && !err; i++)
		err = sysdev_create_file(&spmu_sysdev, spmu_attr[i]);
	if (err)
		goto error_file;

	restart(R_RESERVE);
	register_cpu_notifier(&cpu_notifier);
	return 0;

error_file:
	while (--i >= 0)
		sysdev_remove_file(&spmu_sysdev, spmu_attr[i]);
	sysdev_driver_unregister(&spmu_sysdev_class, &spmu_sysdev_driver);
err_sysdev:
	sysdev_unregister(&spmu_sysdev);
err_class:
	sysdev_class_unregister(&spmu_sysdev_class);
	return err;
}

static void simple_pmu_exit(void)
{
	int i;

	for (i = 0; spmu_attr[i]; i++)
		sysdev_remove_file(&spmu_sysdev, spmu_attr[i]);
	sysdev_unregister(&spmu_sysdev);
	sysdev_driver_unregister(&spmu_sysdev_class, &spmu_sysdev_driver);
	sysdev_class_unregister(&spmu_sysdev_class);
	unregister_cpu_notifier(&cpu_notifier);
	rdpmc_fixed = 0;
	restart(R_UNINIT|R_RESERVE);
}

module_init(simple_pmu_init);
module_exit(simple_pmu_exit);
MODULE_LICENSE("GPL");
--------------------------------------------------------------------------------
/tcyc.c:
--------------------------------------------------------------------------------
/* Demo program for simple-pmu */
#define _GNU_SOURCE 1
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "cycles.h"

/* Small measured workload */
static inline void kernel(void)
{
	asm volatile("nop ; nop ; nop ; nop ; nop");
	asm volatile("nop ; nop ; nop ; nop ; nop");
}

int main(void)
{
	counter_t a, b;

	if (pin_cpu(NULL) < 0) {
		printf("Cannot pin CPU\n");
		exit(1);
	}
	if (perfmon_available() <= 0) {
		printf("no fixed perfmon available\n");
		exit(1);
	}
	sync_core();
	a = unhalted_core();
	kernel();
	b = unhalted_core();
	sync_core();
	printf("unhalted cycles %llu\n", b-a);

#if 0 /* gone on Nehalem */
	sync_core();
	a = unhalted_ref();
	kernel();
	b = unhalted_ref();
	sync_core();
	printf("reference cycles %llu\n", b-a);
#endif

	sync_core();
	a = insn_retired();
	kernel();
	b = insn_retired();
	sync_core();
	printf("instructions retired %llu\n", b-a);

	sync_core();
	a = rdtsc();
	kernel();
	b = rdtsc();
	printf("rdtsc tick %llu\n", b-a);

#if 0 /* Not available before Nehalem */
	a = rdtscp();
	kernel();
	b = rdtscp();
	printf("rdtscp tick %llu\n", b-a);
#endif
	return 0;
}
--------------------------------------------------------------------------------