├── .gitignore
├── LICENSE
├── LowOverheadTimersTests
│   ├── README.md
│   ├── SetupPowerLevelCounters.sh
│   ├── SetupUserKernelCounters.sh
│   ├── build_timer_tests.sh
│   ├── counter_test_epilog.c
│   ├── counter_test_epilog_32.c
│   ├── counter_test_prolog.c
│   ├── counter_test_prolog_32.c
│   ├── run_timer_test_ensemble.sh
│   ├── summarize.sh
│   └── test_timer_overhead.c
├── README.md
├── low_overhead_timers.c
└── low_overhead_timers.h

/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.s
3 | *.optrpt
4 | *.exe
5 | log.*
6 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2018, John D McCalpin and University of Texas at Austin
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 
--------------------------------------------------------------------------------
/LowOverheadTimersTests/README.md:
--------------------------------------------------------------------------------
 1 | Test driver and scripts for the low_overhead_timers.c timer library.
 2 | 
 3 | The program "test_timer_overhead.c" tests all of the interfaces in
 4 | "low_overhead_timers.c" with 64 repeated calls, then reports the minimum,
 5 | average, and maximum counter deltas (excluding the first iteration).
 6 | The program also includes tests of the same counters read directly using
 7 | inline assembly macros for comparison.  The test_timer_overhead code is set
 8 | up so that it can be compiled in either inline or separate compilation mode.
 9 | 
10 | The script "build_timer_tests.sh" will compile four versions of the
11 | test_timer_overhead program -- one with inlining and one with separate
12 | compilation, for each of the Intel (icc) and GNU (gcc) compilers.
13 | 
14 | The script "run_timer_test_ensemble.sh" will run each of these versions 10
15 | times and save the results.
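For reference, the measurement pattern is simply a series of back-to-back
timer reads followed by statistics on the consecutive differences.  A minimal
sketch of that pattern, using only the rdtscp() interface and compiled/linked
against low_overhead_timers.c (the array and variable names here are
illustrative -- they are not the ones used in test_timer_overhead.c):

    #include <stdio.h>
    #include "low_overhead_timers.h"

    #define NSAMPLES 64

    int main(void)
    {
        unsigned long samples[NSAMPLES];
        unsigned long delta, min = ~0UL, max = 0, sum = 0;
        int i;

        for (i = 0; i < NSAMPLES; i++)
            samples[i] = rdtscp();              /* back-to-back timer reads */

        for (i = 2; i < NSAMPLES; i++) {        /* skip the first delta, which is often slow */
            delta = samples[i] - samples[i-1];
            if (delta < min) min = delta;
            if (delta > max) max = delta;
            sum += delta;
        }
        printf("rdtscp overhead: min %lu  avg %.1f  max %lu (TSC cycles)\n",
               min, (double)sum / (NSAMPLES - 2), max);
        return 0;
    }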
16 | 
17 | The script "summarize.sh" will take a result file produced by
18 | "run_timer_test_ensemble.sh" and compute the average of the average
19 | values (after excluding the slowest result of the ensemble).
20 | 
21 | Building for icc
22 | IMPORTANT NOTES:
23 | 1. Choose a target architecture that does not include 256-bit or 512-bit SIMD support.
24 |    (Any SSE target will do -- the icc default for Linux is -msse2.)
25 | 2. The compiler flag -nolib-inline prevents the compiler from generating calls to
26 |    memset() or memcpy() that might contain 256-bit or 512-bit instructions.
27 | 
28 | Building for gcc
29 | 1. Choose a target architecture that does not include 256-bit or 512-bit SIMD support.
30 |    (Any SSE target will do -- gcc defaults to -msse or -msse2.)
31 | 2. The flag -fno-tree-loop-distribute-patterns prevents the compiler from
32 |    generating most calls to memset() or memcpy() (which might include 256-bit
33 |    or 512-bit instructions).
34 | 
35 | Other compilers should be similar to gcc, but have not been carefully tested.
36 | On Mac OS X, the clang compiler does not understand the gcc "-fno-tree-loop-distribute-patterns"
37 | flag, but otherwise appears to work.
38 | 
39 | 
--------------------------------------------------------------------------------
/LowOverheadTimersTests/SetupPowerLevelCounters.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # You will need to put in the path to the msrtools binaries here
 4 | # if they are not in your default path.
 5 | # The msrtools commands almost always require root privileges.
 6 | WRMSR=wrmsr
 7 | 
 8 | # -------------------------------------------------------------------
 9 | # PMC0 on each core is set to record actual cycles not halted (in
10 | # either user or kernel mode).  This event is "architectural", so
11 | # it should work on almost all Intel processors.
12 | # -------------------------------------------------------------------
13 | # The next three events are specific to the Intel Xeon Scalable
14 | # Processor family (Skylake Xeon).
15 | # --> These event encodings might produce non-zero results on
16 | #     other Intel processors (Event 0x28 was used for L1D cache
17 | #     writebacks to L2 on Nehalem/Westmere/SandyBridge/IvyBridge),
18 | #     but these specific values are intended for SKX only.
19 | # -------------------------------------------------------------------
20 | # The desired result for the test_timer_overhead program using these
21 | # Skylake Xeon counters is that PMC0 ("actual cycles not halted")
22 | # matches PMC1 ("core power level 0 cycles"), and that PMC2
23 | # ("core power level 1 cycles") and PMC3 ("core power level 2 cycles")
24 | # counters are zero.
25 | # If PMC2 and PMC3 are *not* zero, it means the power control unit in
26 | # the processor halted the core while it adjusted the voltage
27 | # and activated the 256-bit or 512-bit pipelines.  The timing
28 | # of this halt is unpredictable, and the duration of the halt
29 | # can be 20,000 or more core cycles.
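#
# For reference, the values written below follow the standard IA32_PERFEVTSELx
# layout (these are the event-select MSRs at addresses 0x186-0x189):
#   bits  7:0   event select          bits 15:8  unit mask (umask)
#   bit  16     count in user mode (USR)
#   bit  17     count in kernel mode (OS)
#   bit  22     enable the counter (EN)
# For example, 0x0043003c = event 0x3c, umask 0x00, USR+OS+EN -- the
# architectural "unhalted core cycles" event counted in both user and
# kernel mode.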
30 | # ------------------------------------------------------------------- 31 | 32 | $WRMSR -a 0x186 0x0043003c # actual cycles not halted 33 | $WRMSR -a 0x187 0x00430728 # core power level 0 cycles 34 | $WRMSR -a 0x188 0x00431828 # core power level 1 cycles 35 | $WRMSR -a 0x189 0x00432028 # core power level 2 cycles 36 | -------------------------------------------------------------------------------- /LowOverheadTimersTests/SetupUserKernelCounters.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # You will need to put in the path to the msrtools binaries here 4 | # if they are not in your default path. 5 | # The msrtools commands almost always require root privileges. 6 | WRMSR=wrmsr 7 | 8 | # ------------------------------------------------------------------------- 9 | # This set of events is intended to help find instances in which the 10 | # timer overhead test was contaminated by OS activity. 11 | # 12 | # PMC0 is set to measure actual cycles not halted in user or kernel mode. 13 | # This is an "architectural" event that should work on all Intel processors. 14 | # PMC1 is set to measure actual cycles not halted in kernel mode. 15 | # This is an "architectural" event that should work on all Intel processors. 16 | # PMC2 is set to measure instructions retired in kernel mode. 17 | # This is an "architectural" event that should work on all Intel processors. 18 | # PMC3 is set to measure interrupts received (Skylake and later cores only) 19 | # PMC3 may report non-zero results on earlier Intel processors, but 20 | # those results will mean something unrelated to what I want to measure 21 | # here. 22 | # 23 | # A "clean" result has non-zero results in PMC0 and zero results in the 24 | # other three counters. 25 | # For processors earlier than Skylake, ignore the results for PMC3. 26 | # ------------------------------------------------------------------------- 27 | 28 | $WRMSR -a 0x186 0x0043003c # actual cycles not halted (user + kernel) 29 | $WRMSR -a 0x187 0x0042003c # actual cycles not halted (kernel only) 30 | $WRMSR -a 0x188 0x004200c0 # instructions retired (kernel only) 31 | $WRMSR -a 0x189 0x004301cb # interrupts received 32 | -------------------------------------------------------------------------------- /LowOverheadTimersTests/build_timer_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | which icc >& /dev/null 4 | if [ $? -ne 0 ] 5 | then 6 | echo "Intel icc compiler not found, skipping...." 7 | else 8 | echo "compiling externally linked version with icc" 9 | icc --version 10 | icc -O2 -msse2 -nolib-inline -DUSE_PAUSE -c ../low_overhead_timers.c -o low_overhead_timers_icc.o 11 | icc -O2 -msse2 -nolib-inline -DUSE_PAUSE -I.. test_timer_overhead.c low_overhead_timers_icc.o -qopt-report=5 -o timer_ovhd_external.icc.exe 12 | mv test_timer_overhead.optrpt test_timer_overhead.external.optrpt 13 | icc -O2 -msse2 -nolib-inline -DUSE_PAUSE -I.. test_timer_overhead.c -S -o timer_ovhd_external.icc.s 14 | 15 | echo "compiling inlined version with icc" 16 | icc -O2 -msse2 -nolib-inline -DUSE_PAUSE -DINLINE_TIMERS -I.. test_timer_overhead.c -qopt-report=5 -o timer_ovhd_inline.icc.exe 17 | mv test_timer_overhead.optrpt test_timer_overhead.inline.optrpt 18 | icc -O2 -msse2 -nolib-inline -DUSE_PAUSE -DINLINE_TIMERS -I.. test_timer_overhead.c -S -o timer_ovhd_inline.icc.s 19 | fi 20 | 21 | which gcc >& /dev/null 22 | if [ $? 
-ne 0 ] 23 | then 24 | echo "GNU gcc compiler not found, skipping...." 25 | else 26 | echo "compiling externally linked version with gcc" 27 | gcc --version 28 | gcc -O2 -msse2 -fno-tree-loop-distribute-patterns -DUSE_PAUSE -c ../low_overhead_timers.c -o low_overhead_timers_gcc.o 29 | gcc -O2 -msse2 -fno-tree-loop-distribute-patterns -DUSE_PAUSE -I.. test_timer_overhead.c low_overhead_timers_gcc.o -o timer_ovhd_external.gcc.exe 30 | gcc -O2 -msse2 -fno-tree-loop-distribute-patterns -DUSE_PAUSE -I.. test_timer_overhead.c -fverbose-asm -S -o timer_ovhd_external.gcc.s 31 | 32 | echo "compiling inlined version with gcc" 33 | gcc -O2 -msse2 -fno-tree-loop-distribute-patterns -DUSE_PAUSE -DINLINE_TIMERS -I.. test_timer_overhead.c -o timer_ovhd_inline.gcc.exe 34 | gcc -O2 -msse2 -fno-tree-loop-distribute-patterns -DUSE_PAUSE -DINLINE_TIMERS -I.. test_timer_overhead.c -fverbose-asm -S -o timer_ovhd_inline.gcc.s 35 | fi 36 | -------------------------------------------------------------------------------- /LowOverheadTimersTests/counter_test_epilog.c: -------------------------------------------------------------------------------- 1 | // Boilerplate code that goes after the sample loop for 2 | // tests that store the full 64 bits of each result. 3 | // 4 | // 1. Collect final values of fixed-function core counters, 5 | // programmable core performance counters, and TSC 6 | // to monitor behavior over the entire sample loop. 7 | // 2. Compute min/max/avg deltas on the individual measurements 8 | // taken in the loop (excluding the first delta, which is 9 | // sometimes slow). 10 | // 3. Compute core utilization and average frequency for the 11 | // entire sample loop. 12 | // 4. Print out statistics on the deltas measured within 13 | // the loop, along with core utilization and avg frequency. 14 | // 5. Print out the deltas of the programmable counters 15 | // over the entire sample loop. 16 | // 17 | // Note that this code makes no attempt to detect or correct 18 | // for wraparound of the fixed or programmable performance 19 | // counters -- either for the entire loop or for the individual 20 | // measurements within the loop! 21 | // 22 | gen_cyc_end = rdpmc_actual_cycles(); 23 | gen_ref_end = rdpmc_reference_cycles(); 24 | tsc_end = rdtscp(); 25 | for (i=0; i>24) construct is used to inhibit replacement 7 | // of this loop by a call to "memset()", which may contain 8 | // unwanted SIMD instructions that change the processor 9 | // power level.) 10 | // 2. Collect initial values of fixed-function core counters, 11 | // programmable core performance counters, and TSC 12 | // to monitor behavior over the entire sample loop. 13 | 14 | for (j=0; j>24); 15 | for (i=0; i>24) construct is used to inhibit replacement 7 | // of this loop by a call to "memset()", which may contain 8 | // unwanted SIMD instructions that change the processor 9 | // power level.) 10 | // 2. Collect initial values of fixed-function core counters, 11 | // programmable core performance counters, and TSC 12 | // to monitor behavior over the entire sample loop. 13 | 14 | for (j=0; j>24); 15 | for (i=0; i> log.test_${MODE}_${COMPILER} 19 | done 20 | echo "" 21 | else 22 | echo "executable ./timer_ovhd_${MODE}.${COMPILER}.exe not found, skipping...." 
23 | fi 24 | done 25 | done 26 | -------------------------------------------------------------------------------- /LowOverheadTimersTests/summarize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # compute the average of the average values for each of these timer events in a specified output file 4 | # extra cruft to sort the results and discard the slowest one 5 | 6 | for COUNTER in multiline_inline_rdtscp rdtsc inline_rdtsc_64 inline_rdtsc_32 rdtscp inline_rdtscp_64 inline_rdtscp_32 full_rdtscp rdpmc_instructions rdpmc_actual_cycles inline_rdpmc_actual_cycles_64 inline_rdpmc_actual_cycles_32 rdpmc inline_rdpmc_programmable_64 inline_rdpmc_programmable_32 rdpmc_reference_cycles inline_rdpmc_reference_cycles_64 inline_rdpmc_reference_cycles_32 7 | do 8 | echo -n "${COUNTER} " 9 | grep "^${COUNTER} " $1 | awk 'START {max=0} {s+=$5; if($5>max) max=$5} END {print (s-max)/(NR-1)}' 10 | done 11 | 12 | -------------------------------------------------------------------------------- /LowOverheadTimersTests/test_timer_overhead.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | # ifndef MIN 12 | # define MIN(x,y) ((x)<(y)?(x):(y)) 13 | # endif 14 | # ifndef MAX 15 | # define MAX(x,y) ((x)>(y)?(x):(y)) 16 | # endif 17 | 18 | // ----------IMPORTANT ----------- 19 | // Use the INLINE_TIMERS preprocessor variable to determine 20 | // whether the source code to the timers is included here 21 | // or just the headers (for separate compilation and linking). 22 | // 23 | #ifdef INLINE_TIMERS 24 | #include "low_overhead_timers.c" 25 | #else 26 | #include "low_overhead_timers.h" 27 | #endif 28 | 29 | #define inline_rdpmc(hi,low,counter) \ 30 | __asm__ volatile("rdpmc" : "=a" (low), "=d" (hi) : "c" (counter)); 31 | 32 | #define inline_rdtsc(hi,low) \ 33 | __asm__ volatile("rdtsc": "=a" (low), "=d" (hi)); 34 | 35 | #define inline_rdtscp(hi,low,aux) \ 36 | __asm__ volatile("rdtscp": "=a" (low), "=d" (hi), "=c" (aux)); 37 | 38 | # define NUM_CORE_COUNTERS 2 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | #define FATAL(fmt,args...) do { \ 47 | ERROR(fmt, ##args); \ 48 | exit(1); \ 49 | } while (0) 50 | 51 | #define ERROR(fmt,args...) \ 52 | fprintf(stderr, fmt, ##args) 53 | 54 | #define NSAMPLES 64 55 | #define NHALFSAMPLES 256 56 | 57 | unsigned long values64[NSAMPLES]; 58 | unsigned int values32[NSAMPLES]; 59 | unsigned int highlow32[NHALFSAMPLES]; 60 | 61 | 62 | #define INNERTIMES (10000) 63 | #define MIDDLETIMES (1000) 64 | 65 | // This function can be extracted for standalone use.... 66 | // If USE_PAUSE is not defined, it will spin very quickly 67 | // (>1 billion increments per second). 68 | // If USE_PAUSE is defined, it will spin very slowly -- 69 | // between ~4 cycles and ~26 cycles per PAUSE instruction depending 70 | // on the processor model. 
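// A minimal sketch of a spin loop of the general shape just described, using
// the MIDDLETIMES/INNERTIMES bounds defined above (the actual spin_function
// follows below; the name spin_function_sketch and this body are illustrative
// only, not the code used by this test program):
unsigned long spin_function_sketch(unsigned long counter)
{
    int middle, inner;
    for (middle = 0; middle < MIDDLETIMES; middle++) {
        for (inner = 0; inner < INNERTIMES; inner++) {
#ifdef USE_PAUSE
            __asm__ volatile("pause");   // slows each iteration to roughly 4-26 core cycles
#endif
            counter++;                   // without PAUSE, ~1 increment per cycle
        }
    }
    return counter;
}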
71 | unsigned long spin_function(unsigned long initial_counter) 72 | { 73 | int middle, inner; 74 | #pragma nounroll 75 | for (middle=0; middle>12; 57 | *core = c & 0xFFFUL; 58 | 59 | return (a | (d << 32)); 60 | } 61 | 62 | 63 | extern inline __attribute__((always_inline)) int get_core_number() 64 | { 65 | unsigned long a, d, c; 66 | 67 | __asm__ volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c)); 68 | 69 | return ( c & 0xFFFUL ); 70 | } 71 | 72 | extern inline __attribute__((always_inline)) int get_socket_number() 73 | { 74 | unsigned long a, d, c; 75 | 76 | __asm__ volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c)); 77 | 78 | return ( (c & 0xF000UL)>>12 ); 79 | } 80 | 81 | 82 | extern inline __attribute__((always_inline)) unsigned long rdpmc_instructions() 83 | { 84 | unsigned long a, d, c; 85 | 86 | c = (1UL<<30); 87 | __asm__ volatile("rdpmc" : "=a" (a), "=d" (d) : "c" (c)); 88 | 89 | return (a | (d << 32)); 90 | } 91 | 92 | extern inline __attribute__((always_inline)) unsigned long rdpmc_actual_cycles() 93 | { 94 | unsigned long a, d, c; 95 | 96 | c = (1UL<<30)+1; 97 | __asm__ volatile("rdpmc" : "=a" (a), "=d" (d) : "c" (c)); 98 | 99 | return (a | (d << 32)); 100 | } 101 | 102 | extern inline __attribute__((always_inline)) unsigned long rdpmc_reference_cycles() 103 | { 104 | unsigned long a, d, c; 105 | 106 | c = (1UL<<30)+2; 107 | __asm__ volatile("rdpmc" : "=a" (a), "=d" (d) : "c" (c)); 108 | 109 | return (a | (d << 32)); 110 | } 111 | 112 | extern inline __attribute__((always_inline)) unsigned long rdpmc(int c) 113 | { 114 | unsigned long a, d; 115 | 116 | __asm__ volatile("rdpmc" : "=a" (a), "=d" (d) : "c" (c)); 117 | 118 | return (a | (d << 32)); 119 | } 120 | 121 | // number of core performance counters per logical processor 122 | // varies by model and mode of operation (HT often splits the 123 | // counters across threads). 124 | // The number of counters per logical processor is contained in 125 | // bits 15:8 of EAX after executing the CPUID instruction 126 | // with an initial EAX value of 0x0a (optional input in ECX is not used). 127 | int get_num_core_counters() 128 | { 129 | unsigned int eax, ebx, ecx, edx; 130 | unsigned int leaf, subleaf; 131 | int width; 132 | 133 | leaf = 0x0000000a; 134 | subleaf = 0x0; 135 | __asm__ __volatile__ ("cpuid" : \ 136 | "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "a" (leaf), "c" (subleaf)); 137 | 138 | return((eax & 0x0000ff00) >> 8); 139 | } 140 | 141 | // core performance counter width varies by processor 142 | // the width is contained in bits 23:16 of the EAX register 143 | // after executing the CPUID instruction with an initial EAX 144 | // argument of 0x0a (subleaf 0x0 in ECX). 145 | int get_core_counter_width() 146 | { 147 | unsigned int eax, ebx, ecx, edx; 148 | unsigned int leaf, subleaf; 149 | int width; 150 | 151 | leaf = 0x0000000a; 152 | subleaf = 0x0; 153 | __asm__ __volatile__ ("cpuid" : \ 154 | "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "a" (leaf), "c" (subleaf)); 155 | 156 | return((eax & 0x00ff0000) >> 16); 157 | } 158 | 159 | // fixed-function performance counter width varies by processor 160 | // the width is contained in bits 12:5 of the EDX register 161 | // after executing the CPUID instruction with an initial EAX 162 | // argument of 0x0a (subleaf 0x0 in ECX). 
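//
// A short usage sketch for the fixed-function counter readers defined above.
// It assumes the fixed counters have already been enabled (IA32_FIXED_CTR_CTRL
// and IA32_PERF_GLOBAL_CTRL) and that user-mode RDPMC is permitted; the helper
// name "measured_ipc" and the function-pointer interface are illustrative only.
static double measured_ipc(void (*work)(void))
{
    unsigned long inst0, cyc0, inst1, cyc1;

    inst0 = rdpmc_instructions();
    cyc0  = rdpmc_actual_cycles();
    work();                                     // region being measured
    inst1 = rdpmc_instructions();
    cyc1  = rdpmc_actual_cycles();

    return (double)(inst1 - inst0) / (double)(cyc1 - cyc0);
}
//
// The routine below implements the fixed-counter-width query described
// in the comment above.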
163 | int get_fixed_counter_width() 164 | { 165 | unsigned int eax, ebx, ecx, edx; 166 | unsigned int leaf, subleaf; 167 | int width; 168 | 169 | leaf = 0x0000000a; 170 | subleaf = 0x0; 171 | __asm__ __volatile__ ("cpuid" : \ 172 | "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "a" (leaf), "c" (subleaf)); 173 | 174 | return((edx & 0x00001fe0) >> 5); 175 | } 176 | 177 | // assume that these functions will automatically do the right thing if they are 178 | // included more than once.... 179 | #include 180 | #include 181 | 182 | // Utility routine to compute counter differences taking into account rollover 183 | // when the performance counter width is not known at compile time. 184 | // Use the "get_counter_width()" function to get the counter width on the 185 | // current system, then use that as the third argument to this function. 186 | // 64-bit counters don't generally roll over, but I added a special case 187 | // for this 188 | unsigned long corrected_pmc_delta(unsigned long end, unsigned long start, int pmc_width) 189 | { 190 | unsigned long error_return=0xffffffffffffffff; 191 | unsigned long result; 192 | // sanity checks 193 | if ((pmc_width <= 0) || (pmc_width > 64)) { 194 | fprintf(stderr,"ERROR: corrected_pmc_delta() called with illegal performance counter width %d\n",pmc_width); 195 | return(error_return); 196 | } 197 | // Due to the specifics of unsigned arithmetic, for pmc_width == sizeof(unsigned long), 198 | // the simple calculation (end-start) gives the correct delta even if the counter has 199 | // rolled (leaving end < start). 200 | if (pmc_width == 64) { 201 | return (end - start); 202 | } else { 203 | // for pmc_width < sizeof(unsigned long), rollover must be detected and corrected explicitly 204 | if (end >= start) { 205 | result = end - start; 206 | } else { 207 | // I think this works independent of ordering, but this makes the most intuitive sense 208 | result = (end + (1UL<0; base--){ 264 | if (buffer[base] == 0x7a) { 265 | // printf("Found z at location %d\n",base); 266 | if (buffer[base-1] == 0x48) { 267 | // printf("Found H at location %d\n",base-1); 268 | if (buffer[base-2] == 0x47) { 269 | // printf("Found G at location %d\n",base-2); 270 | // printf(" -- need to extract string now\n"); 271 | i = base-3; 272 | stop = base-3; 273 | // printf("begin reverse search at stop character location %d\n",i); 274 | while(buffer[i] != 0x20) { 275 | // printf("found a non-blank character %c (%x) at location %d\n",buffer[i],buffer[i],i); 276 | i--; 277 | } 278 | start = i+1; 279 | length = stop - start + 1; 280 | k = length+1; 281 | // for (j=stop; j
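//
// Once the nominal frequency has been recovered from the "GHz" field being
// parsed above, a TSC delta converts to elapsed time by a simple division.
// A minimal sketch, assuming the nominal TSC frequency is available in Hz
// (the helper name and argument convention here are illustrative only):
static double elapsed_seconds(unsigned long tsc_start, unsigned long tsc_end, double tsc_hz)
{
    return (double)(tsc_end - tsc_start) / tsc_hz;
}
//
// Deltas of the programmable counters should be computed with
// corrected_pmc_delta(), using the width reported by get_core_counter_width(),
// e.g.:
//     int pmc_width = get_core_counter_width();
//     unsigned long delta = corrected_pmc_delta(end_count, start_count, pmc_width);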