├── .gitignore
├── LICENSE
├── MSR_ArchPerfMon_v3.h
├── MSR_Architectural.h
├── MSR_defs.h
├── Makefile
├── README.md
├── SF_test_offsets.c
├── SKX_IMC_BusDeviceFunctionOffset.h
├── SetupCoreCounters.sh
├── SnoopFilterMapper.c
├── low_overhead_timers.c
├── low_overhead_timers.h
├── run_ensemble.sh
├── ssum.c
└── va2pa_lib.c


/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.s
3 | *.optrpt
4 | *.exe
5 | log.*
6 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2018, John D McCalpin and the University of Texas at Austin
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/MSR_ArchPerfMon_v3.h:
--------------------------------------------------------------------------------
 1 | //-------------------------------------------------
 2 | // Intel Arch SW Developer's Manual, Volume 3, document 325384-060, September 2016
 3 | //-------------------------------------------------
 4 | // Part 1: Architectural performance monitoring version 3, Volume 3B, section 18.2
 5 | //    and Section 35.1
 6 | //
 7 | // Name, MSR_Address.  Every address is a long constant (L suffix).
 8 | //-------------------------------------------------
 9 | #ifndef MSR_ARCHPERFMON_V3_H
10 | #define MSR_ARCHPERFMON_V3_H
11 | 
12 | // Programmable performance counters
13 | #define IA32_PMC0 0xC1L
14 | #define IA32_PMC1 0xC2L
15 | #define IA32_PMC2 0xC3L
16 | #define IA32_PMC3 0xC4L
17 | #define IA32_PMC4 0xC5L
18 | #define IA32_PMC5 0xC6L
19 | #define IA32_PMC6 0xC7L
20 | #define IA32_PMC7 0xC8L
21 | // Event select registers, one per programmable counter
22 | #define IA32_PERFEVTSEL0 0x186L
23 | #define IA32_PERFEVTSEL1 0x187L
24 | #define IA32_PERFEVTSEL2 0x188L
25 | #define IA32_PERFEVTSEL3 0x189L
26 | #define IA32_PERFEVTSEL4 0x18AL
27 | #define IA32_PERFEVTSEL5 0x18BL
28 | #define IA32_PERFEVTSEL6 0x18CL
29 | #define IA32_PERFEVTSEL7 0x18DL
30 | // Status and control registers, listed in address order
31 | #define IA32_PERF_STATUS 0x198L
32 | #define IA32_PERF_CTL 0x199L
33 | #define IA32_THERM_STATUS 0x19CL
34 | #define IA32_MISC_ENABLE 0x1A0L
35 | // Fixed-function counters and the global enable/status/overflow controls
36 | #define IA32_FIXED_CTR0 0x309L
37 | #define IA32_FIXED_CTR1 0x30AL
38 | #define IA32_FIXED_CTR2 0x30BL
39 | #define IA32_FIXED_CTR_CTRL 0x38DL
40 | #define IA32_PERF_GLOBAL_STATUS 0x38EL
41 | #define IA32_PERF_GLOBAL_CTRL 0x38FL
42 | #define IA32_PERF_GLOBAL_OVF_CTRL 0x390L
43 | 
44 | #endif // MSR_ARCHPERFMON_V3_H
45 | 


--------------------------------------------------------------------------------
/MSR_Architectural.h:
--------------------------------------------------------------------------------
 1 | // -----------------------------------------------------------------
 2 | // Part 2: Performance-related MSRs from "Architectural MSRs" 
 3 | //   (Volume 3B, Table 35-2) excludes those listed above in
 4 | //   "Architectural Performance Monitoring"
 5 | //
 6 | // Name, MSR_Address
 7 | #ifndef MSR_ARCHITECTURAL_H
 8 | #define MSR_ARCHITECTURAL_H
 9 | 
10 | #define IA32_TIME_STAMP_COUNTER 0x10L
11 | #define IA32_MPERF 0xE7L
12 | #define IA32_APERF 0xE8L
13 | #define IA32_CLOCK_MODULATION 0x19AL
14 | #define IA32_ENERGY_PERF_BIAS 0x1B0L
15 | #define IA32_PACKAGE_THERM_STATUS 0x1B1L
16 | #define IA32_DEBUGCTL 0x1D9L
17 | #define IA32_PLATFORM_DCA_CAP 0x1F8L
18 | #define IA32_CPU_DCA_CAP 0x1F9L
19 | #define IA32_DCA_0_CAP 0x1FAL
20 | #define IA32_PERF_CAPABILITIES 0x345L
21 | #define IA32_PEBS_ENABLE 0x3F1L
22 | // IA32_A_PMCn occupy eight consecutive addresses: 0x4C1 (n=0) through 0x4C8 (n=7).
23 | // Each counter must have its own address; PMC1 and PMC2 previously shared 0x4C2.
24 | #define IA32_A_PMC0 0x4C1L
25 | #define IA32_A_PMC1 0x4C2L
26 | #define IA32_A_PMC2 0x4C3L
27 | #define IA32_A_PMC3 0x4C4L
28 | #define IA32_A_PMC4 0x4C5L
29 | #define IA32_A_PMC5 0x4C6L
30 | #define IA32_A_PMC6 0x4C7L
31 | #define IA32_A_PMC7 0x4C8L
32 | // IA32_TSC_AUX is listed at C000_0103H in the SDM architectural MSR table.
33 | #define IA32_TSC_AUX 0xC0000103L
34 | 
35 | #endif // MSR_ARCHITECTURAL_H
36 | 


--------------------------------------------------------------------------------
/MSR_defs.h:
--------------------------------------------------------------------------------
  1 | //-------------------------------------------------
  2 | // MSR list for performance monitoring utility for Xeon E5 v3 (Haswell EP, 06_3F, Hikari/Wrangler/Lonestar5)
  3 | // Revision 0.2, 2017-03-02
  4 | // John D. McCalpin, mccalpin@tacc.utexas.edu
  5 | //-------------------------------------------------
  6 | // This is a shortened version of Xeon_E5_v3_Perf_MSRs.txt that just includes
  7 | // the MSR names and numbers in cpp #define format...
  8 | // Since I use the MSR numbers primarily in "pread()" and "pwrite()" calls,
  9 | // I will define all of these as signed long (64-bit) integers.
 10 | // https://www.gnu.org/software/libc/manual/html_node/File-Position-Primitive.html
 11 | //-------------------------------------------------
 12 | // Intel Arch SW Developer's Manual, Volume 3, document 325384-060, September 2016
 13 | //-------------------------------------------------
 14 | // Part 1: Architectural performance monitoring version 3, Volume 3B, section 18.2 and Section 35.1
 15 | //-------------------------------------------------
 16 | #include "MSR_ArchPerfMon_v3.h"
 17 | 
 18 | // -----------------------------------------------------------------
 19 | // Part 2: Performance-related MSRs from "Architectural MSRs" 
 20 | //   (Volume 3B, Table 35-2) excludes those listed above in
 21 | //   "Architectural Performance Monitoring"
 22 | //
 23 | #include "MSR_Architectural.h"
 24 | 
 25 | // -----------------------------------------------------------------
 26 | // Part 3: MSRs from Volume 3B, Tables 34-5 & 34-6 for Nehalem & Westmere, plus Table 34-8 for Xeon 5600
 27 | // Skipped here....
 28 | 
 29 | // -----------------------------------------------------------------
 30 | // Part 4: MSRs from Volume 3B, Table 34-10, Sandy Bridge 06_2Ah and 06_2Dh
 31 | // All Architectural MSRs are included (including Architectural Perf Monitoring, v3)
 32 | //   06_2Dh is Xeon E5 (Stampede)
 33 | //   06_2Ah is Xeon E3 (Scorpion)
 34 | // Stampede:
 35 | //   DisplayFamily_DisplayModel 06_2Dh
 36 | //   Family 06, ExtendedFamily 00, Model 13, ExtendedModel 02
 37 | //   cpuinfo model 45 (decimal)
 38 | 
 39 | // Update 2012-10-02: Downloaded new revision of Volume 3C (326019-044, August 2012)
 40 | // and reviewed Sandy Bridge MSRs in Section 35.7, Table 35-11
 41 | 
 42 | // Note: IA32_PERF_STATUS and MSR_PERF_STATUS are both 0x198, but the former only includes P-state, while the latter includes both P-state and voltage info
 43 | // Note: All versions of Volume 3B screw up 0x1AD, calling it decimal 428, when it is actually 429.
 44 | //       Nehalem/Westmere have both 0x1AC and 0x1AD, while Sandy Bridge only defines one of the two
 45 | 
 46 | // Name, MSR, Access, (~WriteMask), Notes
 47 | 
 48 | // Group 1: addresses shared by Nehalem/Westmere and Sandy Bridge
 49 | #define MSR_PLATFORM_INFO              0xCEL
 50 | #define MSR_PKG_CST_CONFIG_CONTROL     0xE2L
 51 | #define MSR_TEMPERATURE_TARGET         0x1A2L
 52 | #define MSR_MISC_FEATURE_CONTROL       0x1A4L
 53 | #define MSR_OFFCORE_RSP_0              0x1A6L
 54 | #define MSR_OFFCORE_RSP_1              0x1A7L
 55 | #define MSR_MISC_PWR_MGMT              0x1AAL
 56 | #define MSR_TURBO_POWER_CURRENT_LIMIT  0x1ACL
 57 | #define MSR_TURBO_RATIO_LIMIT          0x1ADL
 58 | #define MSR_POWER_CTL                  0x1AFL
 59 | #define MSR_PEBS_LD_LAT                0x3F6L
 60 | #define MSR_PKG_C3_RESIDENCY           0x3F8L
 61 | #define MSR_PKG_C6_RESIDENCY           0x3F9L
 62 | #define MSR_PKG_C7_RESIDENCY           0x3FAL
 63 | #define MSR_CORE_C3_RESIDENCY          0x3FCL
 64 | #define MSR_CORE_C6_RESIDENCY          0x3FDL
 65 | 
 66 | // Group 2: same register as on Westmere, but Sandy Bridge adds info fields to it
 67 | #define MSR_PERF_STATUS                0x198L
 68 | 
 69 | // Group 3: registers introduced with Sandy Bridge
 70 | #define MSR_CORE_C7_RESIDENCY          0x3FEL
 71 | #define MSR_RAPL_POWER_UNIT            0x606L
 72 | #define MSR_PKG_C2_RESIDENCY           0x60DL
 73 | #define MSR_PKG_POWER_LIMIT            0x610L
 74 | #define MSR_PKG_ENERGY_STATUS          0x611L
 75 | #define MSR_PKG_POWER_INFO             0x614L
 76 | #define MSR_PP0_POWER_LIMIT            0x638L
 77 | #define MSR_PP0_ENERGY_STATUS          0x639L
 78 | #define MSR_PP0_POLICY                 0x63AL
 79 | #define MSR_PP0_PERF_STATUS            0x63BL
 80 | 
 81 | // -----------------------------------------------------------------
 82 | // Part 5: MSRs from August 2012 Volume 3C, Table 35-12, Extra MSRs for Sandy Bridge 06_2Ah 
 83 | // Skipped here....
 84 | 
 85 | // -----------------------------------------------------------------
 86 | // Part 6: MSRs from August 2012 Volume 3C, Table 35-12, Extra MSRs for Sandy Bridge 06_2Dh 
 87 | // All Architectural MSRs are included (including Architectural Perf Monitoring, v3)
 88 | //   06_2Dh is Xeon E5 (Stampede)
 89 | 
 90 | // Name, MSR, Access, (~WriteMask), Notes
 91 | #define MSR_PKG_PERF_STATUS     0x613L
 92 | #define MSR_RAPL_PERF_STATUS    0x614L   // NOTE(review): same address as MSR_PKG_POWER_INFO defined earlier in this file -- confirm against the SDM
 93 | #define MSR_DRAM_POWER_LIMIT    0x618L
 94 | #define MSR_DRAM_ENERGY_STATUS  0x619L
 95 | #define MSR_DRAM_PERF_STATUS    0x61BL
 96 | #define MSR_DRAM_POWER_INFO     0x61CL
 97 | 
 98 | #define U_MSR_PMON_FIXED_CTL    0x703L   // control register for the fixed counter below (per its name)
 99 | #define U_MSR_PMON_FIXED_CTR    0x704L
100 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | CC = icc
 2 | #CFLAGS = -DIMC_COUNTS -DCHA_COUNTS -DMAP_L3 -sox -g -O -xCORE-AVX512
 3 | CFLAGS = -DIMC_COUNTS -DCHA_COUNTS -sox -g -O -xCORE-AVX512
 4 | 
 5 | # Files that SF_test_offsets.c pulls in with #include.  They are prerequisites of
 6 | # the object file (a change must trigger a recompile), not of the link step.
 7 | SF_TEST_INCLUDES = low_overhead_timers.c SKX_IMC_BusDeviceFunctionOffset.h \
 8 |                    MSR_defs.h MSR_ArchPerfMon_v3.h MSR_Architectural.h
 9 | 
10 | .PHONY: default clean
11 | 
12 | default: SnoopFilterMapper
13 | 
14 | # NOTE(review): SnoopFilterMapper.c is assumed to #include low_overhead_timers.c the
15 | # same way SF_test_offsets.c does -- confirm.
16 | SnoopFilterMapper.o: SnoopFilterMapper.c low_overhead_timers.c
17 | 	$(CC) $(CFLAGS) -qopenmp -c SnoopFilterMapper.c
18 | 
19 | ssum.o: ssum.c
20 | 	$(CC) -sox -g -O -xCORE-AVX512 -qopt-zmm-usage=high -c ssum.c
21 | 
22 | SnoopFilterMapper: SnoopFilterMapper.o ssum.o va2pa_lib.o
23 | 	$(CC) $(CFLAGS) -qopenmp SnoopFilterMapper.o ssum.o va2pa_lib.o -o SnoopFilterMapper
24 | 
25 | SF_test_offsets.o: SF_test_offsets.c $(SF_TEST_INCLUDES)
26 | 	$(CC) $(CFLAGS) -qopenmp -DRANDOMOFFSETS -DMYHUGEPAGE_1GB -c SF_test_offsets.c
27 | 
28 | SF_test_offsets: SF_test_offsets.o ssum.o va2pa_lib.o
29 | 	$(CC) $(CFLAGS) -qopenmp SF_test_offsets.o ssum.o va2pa_lib.o -o SF_test_offsets
30 | 
31 | SnoopFilterMapper_THP.o: SnoopFilterMapper.c low_overhead_timers.c
32 | 	$(CC) $(CFLAGS) -qopenmp -DMYHUGEPAGE_THP -c SnoopFilterMapper.c -o SnoopFilterMapper_THP.o
33 | 
34 | SnoopFilterMapper_THP: SnoopFilterMapper_THP.o ssum.o va2pa_lib.o
35 | 	$(CC) $(CFLAGS) -qopenmp SnoopFilterMapper_THP.o ssum.o va2pa_lib.o -o SnoopFilterMapper_THP
36 | 
37 | clean:
38 | 	rm -f *.o
39 | 	rm -f SnoopFilterMapper SnoopFilterMapper_THP SF_test_offsets
40 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # SKX\_SF\_Conflicts -- READ.ME file
  2 | 
  3 | This is remarkably messy code, including way too much processor-specific customization, but it does provide a very compact way to demonstrate Snoop Filter Conflicts on Intel Skylake Xeon processors.
  4 | 
  5 | There are two main programs here:
  6 | 
  7 | 1. SnoopFilterMapper.c
  8 | 1. SF\_test\_offsets.c
  9 | 
 10 | These are very similar codes that are set up to repeatedly sum a contiguous, nominally
 11 | L2-containable array, with extensive performance counter monitoring, to look for evidence
 12 | of Snoop Filter Conflicts on Intel Xeon Scalable processors (a.k.a., Skylake Xeon, or Skylake Server).
 13 | 
 14 | * **SnoopFilterMapper** is specialized for use with 2MiB large pages (either pre-allocated
 15 |  or Transparent Huge Pages).  It is intended to be run hundreds or thousands of times
 16 |  (getting a different set of physical addresses each time), allowing post-processing 
 17 |  of results to investigate the relationship between L2 miss rates (which should be near
 18 |  zero) and Snoop Filter Evictions.
 19 | 
 20 | * **SF\_Test\_Offsets** is specialized for use with 1 GiB large pages (which must be pre-allocated
 21 |  at system boot time).  It is intended to be run many times with different offsets
 22 |  provided on the command line.  The offset determines the start of the contiguous, L2-
 23 |  containable array relative to the beginning of each of the 1 GiB pages used.
 24 |  In this case, ensembles of runs are not needed, since the largest L2-containable array
 25 |  size is much smaller than a 1 GiB page (1 MiB per core times 28 cores = 28 MiB), 
 26 |  ensuring that the contiguous virtual address range used corresponds to a contiguous
 27 |  physical address range.  
 28 | 
 29 | ## SnoopFilterMapper
 30 | This outline lists the major operations executed by the **SnoopFilterMapper** code
 31 | 
 32 | 1.  Allocates a 2GiB array
 33 |     *  Options for 1GiB pages (#ifdef MYHUGEPAGE\_1GB) or pre-allocated 2MiB pages (default) or 2MiB Transparent Huge Pages (#ifdef MYHUGEPAGE_THP).
 34 | 1.  Initialize/instantiate full 2GiB array.
 35 | 1.  Grab the physical addresses of each page in the array (either 2 values or 1024 values) and save in an array.
 36 | 1.  Check CPUID to see if the processor model is correct.
 37 | 1.  Optionally prepare to use the IMC counters (#ifdef IMC\_COUNTS)
 38 |     1.  mmap /dev/mem and check for the correct SKX VID/DID for bus 0, device 5, function 0
 39 | 1.  Optionally set up the CHA counters (#ifdef CHA\_COUNTS)
 40 |     1.  open one /dev/cpu/\*/msr device in each socket
 41 |     1.  read and print the four programmable core performance counters for the core in socket 0
 42 |     1.  program the counters and filter in each of the CHAs (NUM\_CHA\_USED is hardcoded)
 43 | 1.  Optionally program the IMC counters (#ifdef IMC\_COUNTS)
 44 | 1.  Optionally determines the mapping of addresses to L3 numbers (#ifdef MAP\_L3)
 45 |     1.  Mostly written for 2MiB pages.
 46 |     1.  For the first PAGES\_MAPPED pages:
 47 |         1.  Check to see if mapping file already exists for the 2MiB page physical address
 48 |         1.  If exists, read the file
 49 |     1.  else, for each cache line
 50 |         1.  Read the L3 counts, access the line many times, read the L3 counts
 51 |     1.  Sanity check results -- if good, store L3 mapping, if bad, repeat.
 52 |     1.  After mapping all lines in the page, write the mapping file for later use.
 53 |     1.  After all pages are mapped, add up the number of lines mapped to each CHA.  (Not needed once it is shown that short contiguous ranges cover all the CHAs almost-uniformly.)
 54 | 1.  Run an OpenMP parallel "warm-up" loop of AVX512 instructions to try to make sure the cores have spun up the AVX512 units and boosted the cores to the correct frequency.
 55 | 1.  Optionally read the initial values of the IMC counters (#ifdef IMC\_COUNTS)
 56 | 1.  Optionally read the initial values of the CHA counters (#ifdef CHA\_COUNTS)
 57 | 1.  Code Under Test
 58 |     1.  save start\_tsc() (in OpenMP master thread)
 59 |     1.  First OpenMP loop: 
 60 |         1.  Check core number for each thread using RDTSCP TSC_AUX value 
 61 |         1.  NOTE: implies KMP_AFFINITY="granularity=fine"
 62 |     1.  Read initial values of programmable core counters on each core used.
 63 |     1.  Second OpenMP loop:
 64 |         1.  read initial value of fixed-function counters on each core in use
 65 |         1.  Repeat call to ssum() "inner\_repetitions" times (with individualized array start/stop values per thread).
 66 |         1.  read final value of fixed-function counters on each core in use
 67 |         1.  NOTE: these counter reads are inside the OpenMP barrier, so they can be used to detect load imbalance.
 68 |     1.  Third OpenMP loop:
 69 |         1.  Check core number for each thread using RDTSCP TSC\_AUX value 
 70 |         1.  Read final values of programmable core counters on each core used.
 71 |     1.  save end\_tsc() (in OpenMP master thread)
 72 | 1.  Optionally read the final values of the CHA counters (#ifdef CHA\_COUNTS)
 73 | 1.  Optionally read the final values of the IMC counters (#ifdef IMC\_COUNTS)
 74 | 1.  Post-Processing
 75 |     1.  Compute package sums of core counters
 76 |     1.  Optionally compute package sums of CHA counters
 77 |     1.  Optionally compute package sums of IMC counters
 78 |     1.  Compute utilization, average frequency, and IPC for each thread (inside the OpenMP barriers).
 79 |     1.  Compute snoop filter eviction rate (assumes SF EVICTS are in CHA counter 0)
 80 | 
 81 | ## SF\_Test\_Offsets
 82 | The **SF\_test\_offsets** code is very similar to **SnoopFilterMapper**, but is specialized to run the code under test using contiguous memory at various offsets from the base of each 1GiB page.
 83 | 
 84 | * This is probably similar enough to merge with SnoopFilterMapper.c.
 85 | * The code includes "#ifdef SIMPLE\_OMP\_LOOP" to run the reduction in scalar mode instead of using the external AVX512 ssum() routine.
 86 |  * The OpenMP scalar reduction mode loses the "between the barriers" fixed-function core counter data, but retains all the other performance counter data.
 87 | 
 88 | ## Porting Notes
 89 | 
 90 | 1. The main functionality of the codes is enabled/disabled through preprocessor variables
 91 |    * MAP\_L3 -- if defined, causes the code to use CHA counters to attempt to map each cache line in the contiguous array to one of the L3 cache slices.   This code is complex and slow (about 6 seconds for the 32768 cache lines in a 2MiB page), and is not needed when using this code to look for snoop filter conflicts.
 92 |      * Requires CHA\_COUNTS to be defined, which requires root privileges for access to the /dev/cpu/\*/msr device drivers.
 93 |    * CHA\_COUNTS -- if defined, causes the code to program the hardware performance counters in each CHA to measure four specific events, and to read these counts before and after the code under test.
 94 |      * This must be defined to directly measure Snoop Filter Evictions.
 95 |      * Requires root privileges for access to the /dev/cpu/\*/msr device drivers.
 96 |     * The events are:
 97 |       1. Snoop Filter Evictions: SF\_EVICTION.ALL (sum of M, E, S states)
 98 |       2. L3 Data Read Lookups: LLC\_LOOKUP.DATA\_READ (requires CHA\_FILTER0[26:17])
 99 |       3. L2 Writebacks to L3: LLC\_LOOKUP.DATA\_WRITE (requires CHA\_FILTER0[26:17])
100 |       4. L3 Writebacks to Memory: LLC\_VICTIMS.TOTAL (MESF) (does not count clean victims)
101 |     * The CHA\_FILTER0 in each CHA is programmed to count all L3 lookups (hit or miss, any state), but not to count SF lookups.
102 |    * IMC\_COUNTS -- if defined, causes the code to program the IMC counters in each DRAM channel to measure four events, and to read these counts before and after the code under test.
103 |      * This is not required to measure snoop filter conflicts.
104 |      * With the L2-contained array access kernel, these counters don't provide any useful information, but they are still in the code for historical reasons.
105 |      * If this variable is defined, more portability checks include:
106 |        * The file SKX\_IMC\_BusDeviceFunctionOffset.h contains (potentially) machine-specific bus numbers for the PCI configuration space used to access the memory controller performance counters that will need to be checked and/or updated.
107 |        * The variable `mmconfig_base` is set to 0x80000000 in the main program.  This value is used to map all of PCI configuration space to a local pointer for access to the memory controller performance counters.
108 |        * The easiest way to find the correct value for your system is `grep MMCONFIG /proc/iomem`
109 |      * The events are:
110 |        1. DRAM cache line reads: CAS\_COUNT.READS
111 |        2. DRAM cache line writes: CAS\_COUNT.WRITES
112 |        3. DRAM bank "activate" operations: ACT.ALL
113 |        4. DRAM bank "precharge" operations due to bank conflicts: PRE\_COUNT.MISS
114 | 2. There are a stupid number of machine-specific defines in the code:
115 |    * The ARRAYSIZE is set to 2GiB by default.
116 |    * Only a fraction of this is typically used for the contiguous, L2-resident array accesses, but the large size allows for two 1 GiB pages or 1024 2 MiB pages.  These large sizes are useful when the code is being used to determine the mapping of physical addresses to L3 slices.
117 |    * MYPAGESIZE is typically set to 2097152L for **SnoopFilterMapper** and to 1073741824UL for **SF\_Test\_offsets**.
118 |    * PAGES\_MAPPED tells the code how many pages to look at when mapping physical addresses to L3 slices (assuming MAP\_L3 is defined).  This is typically set to 1/2 the number of cores in use, so that the code will only be mapping the cache lines that are used in the contiguous L2-contained summation kernel.
119 |    * NUM\_SOCKETS is set to two for measurement on 2-socket systems, but the typical use case only uses socket 0.
120 |    * NUM\_IMC\_CHANNELS is set to 6, which is the correct number of channels per socket for all of the Xeon Scalable Processor models.  This would need to be reduced to 4 for testing the "Xeon W-21xx" processors.
121 |    * NUM\_CHA\_BOXES is set to 28.  All Xeon Scalable Processors have MSR addresses for all 28 CHAs, even if some (or many) are disabled, so there should be no need to change this.
122 |    * NUM\_CHA\_USED is set to 24, which is the number of "active" CHAs in the Xeon Platinum 8160 processor.   This should be changed to the correct number of active L3 slices for other SKX processor models.  
123 |    * Inactive CHAs will return zero on all performance counts, so both the individual results and the socket-wide sums should be correct as long as NUM\_CHA\_USED is at least as large as the number of CHA/L3 slices actually active.
124 |    * MAXCORES is set to 96, which is the number of logical processors on a two-socket Xeon Platinum 8160 system with HyperThreading enabled (2 sockets * 24 physical cores/socket * 2 logical processors/physical core = 96).  This is used for array sizing, so it only needs to be changed if the actual number of cores used is larger than this value.
125 |    * CORES\_USED is set to 24, which is the number of physical cores in a Xeon Platinum 8160 processor.  The code assumes that this variable matches the OpenMP thread count, with loop structures set up to execute one iteration per OpenMP thread.  
126 |      * This will need to be changed for testing other core counts.
127 |      * The runtime environment must be consistent with the value used in the compilation!!!
128 |      * More notes on the runtime environment are included below....
129 |    * RANDOMOFFSETS does not really mean what it says.... It is not used in **SnoopFilterMapper**, but is used in **SF\_Test\_offsets**.  When defined, the code expects an integer argument on the command line.  The argument is interpreted as the number of 64-bit array elements above the base of each 1 GiB page to start the contiguous, L2-containable array accesses.    This ensures that the contiguous virtual address range used maps to a contiguous physical address range.
130 |    * SIMPLE\_OMP\_LOOP in **SF\_Test\_offsets** switches the code under test from an AVX512-optimized external summation routine (ssum.c) to a simple OpenMP sum reduction. 
131 |      * With the AVX-512 code, the 512-bit loads ensure that each cache line is consumed by a single load operation, so there is no possibility that the cache line can be evicted from the cache while it is still in use.
132 |      * With the simple OpenMP reduction, the Intel 18 compiler generates scalar code, so 8 load operations are required to process each cache line.  In this case, it is possible for a line to be evicted (by a Snoop Filter Eviction) before it has been completely processed.  When this happens, there will be more than one L2 miss associated with processing that cache line one time, and the overall L2 miss rate will increase.
133 | 3. Run time environment
134 |    * I use the Intel OpenMP runtime environment variable `KMP_HW_SUBSET=1s,24c,1t` to limit the execution to 24 cores on 1 socket.  
135 |      * With this environment variable definition, there is no need to set `OMP_NUM_THREADS`.
136 |      * The code assumes that threads are not allowed to migrate between the two thread contexts (logical processors) on each physical core (when HyperThreading is enabled).  This can be enforced by the `1t` option to KMP\_HW\_SUBSET, or by adding the `granularity=fine` to the KMP\_AFFINITY environment variable.
137 |    * The code expects the fixed-function and programmable core performance counters to be enabled and configured correctly on each core that is used:
138 |    * IA32\_PERF\_GLOBAL\_CTRL (MSR 0x38f) should be set to 0x70000000f to enable the three fixed-function counters and four programmable counters per core.  (With HyperThreading disabled, setting this MSR to 0x7000000ff enables all eight programmable counters on each core).
139 |    * IA32\_FIXED\_CTR\_CTRL (MSR 0x38d) should be set to 0x333 to enable the three fixed-function counters to count in both user and kernel mode.  This may require disabling the NMI watchdog, which typically uses one of these counters.
140 |    * The script "SetupCoreCounters.sh" sets the core counters to a useful set:
141 |      1. MEM\_INST\_RETIRED.ALL\_LOADS (0x004381d0)
142 |      2. L1D.REPLACEMENTS (0x00430151)
143 |      3. L2\_RQSTS.MISS (0x00433f24)
144 |      4. L2\_LINES\_IN.ALL (0x00431ff1)
145 |    * If CHA\_COUNTS is defined, the code will print out the core performance counter event select registers on core 0, but does not check to see if the values are the expected ones.
146 | 
147 | 
148 | 


--------------------------------------------------------------------------------
/SF_test_offsets.c:
--------------------------------------------------------------------------------
   1 | // John D. McCalpin, mccalpin@tacc.utexas.edu
   2 | static char const rcsid[] = "$Id: SF_test_offsets.c,v 1.4 2018/05/17 22:20:24 mccalpin Exp mccalpin $";
   3 | 
   4 | // include files
   5 | #include <stdio.h>				// printf, etc
   6 | #include <stdint.h>				// standard integer types, e.g., uint32_t
   7 | #include <signal.h>				// for signal handler
   8 | #include <stdlib.h>				// exit() and EXIT_FAILURE
   9 | #include <string.h>				// strerror() function converts errno to a text string for printing
  10 | #include <fcntl.h>				// for open()
  11 | #include <errno.h>				// errno support
  12 | #include <assert.h>				// assert() function
  13 | #include <unistd.h>				// sysconf() function, sleep() function
  14 | #include <sys/mman.h>			// support for mmap() function
  15 | #include <linux/mman.h>			// required for 1GiB page support in mmap()
  16 | #include <math.h>				// for pow() function used in RAPL computations
  17 | #include <time.h>
  18 | #include <sys/time.h>			// for gettimeofday
  19 | 
  20 | # define ARRAYSIZE 2147483648L		// 2^31 bytes = 2 GiB
  21 | 
  22 | #ifdef MYHUGEPAGE_1GB
  23 | // 1 GiB pages
  24 | #define MYPAGESIZE 1073741824UL		// 2^30 bytes
  25 | #define NUMPAGES 32L				// sizes page_pointers[] and pageframenumber[]
  26 | #define PAGES_MAPPED 32L			// sizes cha_by_page[] and paddr_by_page[]; this code is not working correctly for 1GiB pages, but I already know the answers....
  27 | #else		// 2 MiB pages (used whenever MYHUGEPAGE_1GB is not defined)
  28 | #define MYPAGESIZE 2097152L		// 2^21 bytes
  29 | #define NUMPAGES 1024L			// 1024 pages * 2 MiB = ARRAYSIZE; sizes page_pointers[] and pageframenumber[]
  30 | #define PAGES_MAPPED 14L		// sizes cha_by_page[] and paddr_by_page[]
  31 | #endif
  32 | 
  33 | #define SPECIAL_VALUE (-1)		// NOTE(review): not referenced in the visible part of this file -- confirm its use
  34 | 
  35 | // Prototypes for routines implemented in va2pa_lib.c (linked in as va2pa_lib.o)
  36 | void print_pagemap_entry(unsigned long long pagemap_entry);
  37 | unsigned long long get_pagemap_entry( void * va );
  38 | 
  39 | int dumpall;			// when set to 1, will cause dump of lots of stuff for debugging
  40 | int report;				// NOTE(review): purpose not visible in this part of the file
  41 | int nwraps;				// track number of performance counter wraps
  42 | 
  43 | double *array;					// pointer to the mmap'ed data array; page size is MYPAGESIZE (1GiB only when MYHUGEPAGE_1GB is defined)
  44 | double *page_pointers[NUMPAGES];		// one pointer for each page allocated
  45 | uint64_t pageframenumber[NUMPAGES];	// one PFN (page frame number) entry for each page allocated
  46 | 
  47 | // constant value defines
  48 | # define NUM_SOCKETS 2				// first dimension of every per-socket array below
  49 | # define NUM_IMC_CHANNELS 6			// includes channels on all IMCs in a socket
  50 | # define NUM_IMC_COUNTERS 5			// 0-3 are the 4 programmable counters, 4 is the fixed-function DCLK counter
  51 | # define NUM_CHA_BOXES 28			// second dimension of cha_counts (tiles per socket)
  52 | # define NUM_CHA_USED 28			// sizes lines_by_cha[]
  53 | # define NUM_CHA_COUNTERS 4			// counters per CHA tile
  54 | 
  55 | long imc_counts[NUM_SOCKETS][NUM_IMC_CHANNELS][NUM_IMC_COUNTERS][2];	// including the fixed-function (DCLK) counter as the final entry; last index presumably before/after as in cha_counts
  56 | long imc_pkg_sums[NUM_SOCKETS][NUM_IMC_COUNTERS];						// sum across channels for each chip
  57 | char imc_event_name[NUM_SOCKETS][NUM_IMC_CHANNELS][NUM_IMC_COUNTERS][32];		// reserve 32 characters for the IMC event names for each socket, channel, counter
  58 | uint32_t imc_perfevtsel[NUM_IMC_COUNTERS];			// expected control settings for the counters
  59 | uint32_t imc_vid_did[3];							// PCIe configuration space vendor and device IDs for the IMC blocks 
  60 | long cha_counts[NUM_SOCKETS][NUM_CHA_BOXES][NUM_CHA_COUNTERS][2];		// 2 sockets, 28 tiles per socket, 4 counters per tile, 2 times (before and after)
  61 | uint32_t cha_perfevtsel[NUM_CHA_COUNTERS];			// one entry per CHA counter (presumably control settings, by analogy with imc_perfevtsel -- confirm)
  62 | long cha_pkg_sums[NUM_SOCKETS][NUM_CHA_COUNTERS];	// one sum per socket and counter (presumably across CHA tiles, by analogy with imc_pkg_sums -- confirm)
  63 | 
  64 | #define MAXCORES 112				// first dimension of core_counters[] and fixed_counters[]
  65 | #define CORES_USED 24				// README: must match the OpenMP thread count used at run time
  66 | // New feature -- core counters.
  67 | // upgrade to include counters for all cores 
  68 | long core_counters[MAXCORES][4][2];					// [core][counter][before/after]: MAXCORES slots, 4 counters each; original note: 24 cores & 24 threads on one socket
  69 | long fixed_counters[MAXCORES][4][2];				// [core][counter][before/after]: 4 fixed-function core counters (Instr, CoreCyc, RefCyc, TSC)
  70 | long core_pkg_sums[NUM_SOCKETS][4];					// four core counters
  71 | long fixed_pkg_sums[NUM_SOCKETS][4];				// four fixed-function counters per core (Instr, CoreCyc, RefCyc, TSC)
  72 | 
  73 | int8_t cha_by_page[PAGES_MAPPED][32768];				// L3 numbers for each of the 32,768 cache lines in each of the first PAGES_MAPPED 2MiB pages (2 MiB / 32768 = 64 bytes per line)
  74 | uint64_t paddr_by_page[PAGES_MAPPED];					// physical addresses of the base of each of the first PAGES_MAPPED 2MiB pages used
  75 | long lines_by_cha[NUM_CHA_USED];			// bulk count of lines assigned to each CHA
  76 | 
  77 | #ifdef DEBUG
  78 | FILE *log_file;					// log file for debugging -- should not be needed in production
  79 | #endif
  80 | unsigned int *mmconfig_ptr;         // must be pointer to 32-bit int so compiler will generate 32-bit loads and stores
  81 | 
  82 | struct timeval tp;		// seconds and microseconds from gettimeofday; mysecond() below declares its own local of the same name
  83 | struct timezone tzp;	// required, but not used here.
  84 | 
  85 | double ssum(double *a, long vl);	// external routine built from ssum.c (see Makefile); presumably sums vl doubles starting at a -- confirm
  86 | 
  87 | double mysecond()
  88 | {
  89 |         // Returns the wall-clock time from gettimeofday() as seconds in a double.
  90 |         struct timeval now;
  91 | 
  92 |         // The timezone argument is obsolete, so pass NULL; the status is deliberately ignored.
  93 |         (void) gettimeofday(&now, NULL);
  94 |         return ( (double) now.tv_sec + (double) now.tv_usec * 1.e-6 );
  95 | }
  96 | 
  97 | # ifndef MIN
  98 | # define MIN(x,y) ((x)<(y)?(x):(y))
  99 | # endif
 100 | # ifndef MAX
 101 | # define MAX(x,y) ((x)>(y)?(x):(y))
 102 | # endif
 103 | 
 104 | 
 105 | #include "low_overhead_timers.c"
 106 | 
 107 | 
 108 | #include "SKX_IMC_BusDeviceFunctionOffset.h"
 109 | #include "MSR_defs.h"
 110 | // ===========================================================================================================================================================================
 111 | // Convert PCI(bus:device.function,offset) to uint32_t array index
 112 | uint32_t PCI_cfg_index(unsigned int Bus, unsigned int Device, unsigned int Function, unsigned int Offset)
 113 | {
 114 |     uint32_t byteaddress;
 115 |     uint32_t index;
 116 |     assert (Device >= 0);
 117 |     assert (Function >= 0);
 118 |     assert (Offset >= 0);
 119 |     assert (Device < (1<<5));
 120 |     assert (Function < (1<<3));
 121 |     assert (Offset < (1<<12));
 122 |     byteaddress = (Bus<<20) | (Device<<15) | (Function<<12) | Offset;
 123 |     index = byteaddress / 4;
 124 |     return ( index );
 125 | }
 126 | 
 127 | // ===========================================================================================================================================================================
 128 | int main(int argc, char *argv[])
 129 | {
 130 | 	// local declarations
 131 | 	// int cpuid_return[4];
 132 | 	int i;
 133 | 	int retries;
 134 | 	int zeros;
 135 | 	int rc;
 136 | 	int core_pmc_width, fixed_pmc_width;            // these will be looked up using CPUID to use in overflow/wraparound correction
 137 |     int uncore_pmc_width=48;                        // all the uncore stuff is model-dependent, but most are 48 bits
 138 | 	ssize_t rc64;
 139 | 	char description[100];
 140 | 	size_t len;
 141 | 	long arraylen;
 142 | 	long l2_contained_size, inner_repetitions;
 143 | 	unsigned long pagemapentry;
 144 | 	unsigned long paddr, basephysaddr;
 145 | 	unsigned long pagenum, basepagenum;
 146 | 	uint32_t bus, device, function, offset, ctl_offset, ctr_offset, value, index;
 147 | 	uint32_t socket, imc, channel, counter, controller;
 148 | 	long count,delta;
 149 | 	long j,k,page_number,page_base_index,line_number;
 150 | 	long jstart[CORES_USED], jend[CORES_USED], mycore, vl[CORES_USED];
 151 | 	uint32_t low_0, high_0, low_1, high_1;
 152 | 	char filename[100];
 153 | 	int pkg, tile;
 154 | 	int nr_cpus;
 155 | 	uint64_t msr_val, msr_num;
 156 | 	int mem_fd;
 157 | 	int msr_fd[2];				// one for each socket
 158 | 	int proc_in_pkg[2];			// one Logical Processor number for each socket
 159 | 	uid_t my_uid;
 160 | 	gid_t my_gid;
 161 | 	double sum,expected;
 162 | 	double t0, t1;
 163 | 	double avg_cycles;
 164 | 	unsigned long tsc_start, tsc_end;
 165 | 	float TSC_GHz;
 166 | 	double sf_evict_rate;
 167 | 	double bandwidth;
 168 |     unsigned long mmconfig_base=0x80000000;		// DOUBLE-CHECK THIS ON NEW SYSTEMS!!!!!   grep MMCONFIG /proc/iomem | awk -F- '{print $1}'
 169 |     unsigned long mmconfig_size=0x10000000;
 170 | 	double private_sum,partial_sums[CORES_USED];
 171 | 	long iters,iteration_counts[CORES_USED];
 172 | 	long BaseOffset;
 173 | 
 174 | 	TSC_GHz = get_TSC_frequency()/1.0e9;  
 175 |     core_pmc_width = get_core_counter_width();
 176 |     fixed_pmc_width = get_fixed_counter_width();
 177 | 
 178 | 	BaseOffset = 0;
 179 | #ifdef RANDOMOFFSETS
 180 | 	if (argc != 2) {
 181 | 		printf("Must Provide a Random Offset cache line offset value (an integer between 0 and 2^24-375000 (16,402,216))\n");
 182 | 		exit(1);
 183 | 	} else {
 184 | 		BaseOffset = atol(argv[1]);
 185 | 		printf("Random Cache Line Offset is %ld\n",BaseOffset);
 186 | 		BaseOffset = BaseOffset*8;
 187 | 		printf("Starting index for summation is %ld\n",BaseOffset);
 188 | 	}
 189 | #endif
 190 | 
 191 | 	retries = 0;
 192 | 	zeros = 0;
 193 | 	report = 1;
 194 | 	dumpall = 0;
 195 | 	nwraps = 0;
 196 | 	l2_contained_size = 125000 * CORES_USED;		// about 95% of the L2 space in the cores used
 197 | 	// l2_contained_size = 87380 * CORES_USED;		// with 24 cores, this gives almost exactly 16 MiB
 198 | 	for (i=0; i<CORES_USED; i++) {
 199 | 		iters = 0;
 200 | 		jstart[i] = BaseOffset + i*l2_contained_size/CORES_USED;
 201 | 		jend[i] = jstart[i] + l2_contained_size/CORES_USED;
 202 | 		vl[i] = jend[i]-jstart[i];
 203 | 		printf("thread %d jstart %ld jend %ld vl %ld\n",i,jstart[i],jend[i],vl[i]);
 204 | 
 205 | 		partial_sums[i] = 0.0;
 206 | 		iteration_counts[i] = 0;
 207 | 		for (counter=0; counter<4; counter++) {
 208 | 			core_counters[i][counter][0] = SPECIAL_VALUE;
 209 | 			core_counters[i][counter][1] = SPECIAL_VALUE;
 210 | 			fixed_counters[i][counter][0] = SPECIAL_VALUE;
 211 | 			fixed_counters[i][counter][1] = SPECIAL_VALUE;
 212 | 		}
 213 | 	}
 214 | 	// initialize the array that will hold the L3 numbers for each cache line for each of the first PAGES_MAPPED 2MiB pages
 215 | 	for (i=0; i<PAGES_MAPPED; i++) {
 216 | 		for (line_number=0; line_number<32768; line_number++) {
 217 | 			cha_by_page[i][line_number] = -1; 	// special value -- if set properly, all values should be in the range of 0..23
 218 | 		}
 219 | 	}
 220 | 
 221 | 	// allocate working array on a huge pages -- either 1GiB or 2MiB
 222 | 	len = NUMPAGES * MYPAGESIZE;
 223 | #ifdef MYHUGEPAGE_1GB
 224 | 	array = (double*) mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB | MAP_HUGE_1GB, -1, 0 );
 225 | #else
 226 | 	array = (double*) mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB, -1, 0 );
 227 | #endif
 228 | 	if (array == (void *)(-1)) {
 229 |         perror("ERROR: mmap of array a failed! ");
 230 |         exit(1);
 231 |     }
 232 | 	// initialize working array
 233 | 	arraylen = NUMPAGES * MYPAGESIZE/sizeof(double);
 234 | #pragma omp parallel for
 235 | 	for (j=0; j<arraylen; j++) {
 236 | 		array[j] = 1.0;
 237 | 	}
 238 | 	// initialize page_pointers to point to the beginning of each page in the array
 239 | 	// then get and print physical addresses for each
 240 | #ifdef VERBOSE
 241 | 	printf(" Page    ArrayIndex            VirtAddr        PagemapEntry         PFN           PhysAddr\n");
 242 | #endif
 243 | 	for (j=0; j<NUMPAGES; j++) {
 244 | 		k = j*MYPAGESIZE/sizeof(double);
 245 | 		page_pointers[j] = &array[k];
 246 | 		pagemapentry = get_pagemap_entry(&array[k]);
 247 | 		pageframenumber[j] = (pagemapentry & (unsigned long) 0x007FFFFFFFFFFFFF);
 248 | #ifdef VERBOSE
 249 | 		printf(" %.5ld   %.10ld  %#18lx  %#18lx  %#18lx  %#18lx\n",j,k,&array[k],pagemapentry,pageframenumber[j],(pageframenumber[j]<<12));
 250 | #endif
 251 | 	}
 252 | 	printf("PAGE_ADDRESSES ");
 253 | 	for (j=0; j<PAGES_MAPPED; j++) {
 254 | 		basephysaddr = pageframenumber[j] << 12;
 255 | 		paddr_by_page[j] = basephysaddr;
 256 | 		printf("0x%.12lx ",paddr_by_page[j]);
 257 | 	}
 258 | 	printf("\n");
 259 | 
 260 | 
 261 | 	// initialize arrays for counter data
 262 | 	for (socket=0; socket<NUM_SOCKETS; socket++) {
 263 | 		for (channel=0; channel<NUM_IMC_CHANNELS; channel++) {
 264 | 			for (counter=0; counter<NUM_IMC_COUNTERS; counter++) {
 265 | 				imc_counts[socket][channel][counter][0] = 0;
 266 | 				imc_counts[socket][channel][counter][1] = 0;
 267 | 			}
 268 | 		}
 269 | 		for (tile=0; tile<NUM_CHA_USED; tile++) {
 270 | 			lines_by_cha[tile] = 0;
 271 | 			for (counter=0; counter<4; counter++) {
 272 | 				cha_counts[socket][tile][counter][0] = 0;
 273 | 				cha_counts[socket][tile][counter][1] = 0;
 274 | 			}
 275 | 		}
 276 | 	}
 277 | 
 278 | 	// get the host name, assume that it is of the TACC standard form, and use this as part
 279 | 	// of the log file name....  Standard form is "c263-109.stampede2.tacc.utexas.edu", so
 280 | 	// truncating at the first "." is done by writing \0 to character #8.
 281 | 	len = 100;	
 282 | 	rc = gethostname(description, len);
 283 | 	if (rc != 0) {
 284 | 		fprintf(stderr,"ERROR when trying to get hostname\n");
 285 | 		exit(-1);
 286 | 	}
 287 | 	description[8] = 0;		// assume hostname of the form c263-109.stampede2.tacc.utexas.edu -- truncate after first period
 288 | 
 289 | 	my_uid = getuid();
 290 | 	my_gid = getgid();
 291 | 
 292 | #ifdef DEBUG
 293 | 	sprintf(filename,"log.%s.perf_counters",description);
 294 | 	// sprintf(filename,"log.perf_counters");
 295 | 	log_file = fopen(filename,"w+");
 296 | 	if (log_file == 0) {
 297 | 		fprintf(stderr,"ERROR %s when trying to open log file %s\n",strerror(errno),filename);
 298 | 		exit(-1);
 299 | 	}
 300 | 
 301 | 	fprintf(log_file,"DEBUG: my uid is %d, my gid is %d\n",my_uid,my_gid);
 302 | 
 303 | 	rc = chown(filename,my_uid,my_gid);
 304 | 	if (rc == 0) {
 305 | 		fprintf(log_file,"DEBUG: Successfully changed ownership of log file to %d %d\n",my_uid,my_gid);
 306 | 	} else {
 307 | 		fprintf(stderr,"ERROR: Attempt to change ownership of log file failed -- bailing out\n");
 308 | 		exit(-1);
 309 | 	}
 310 | #endif
 311 | 
 312 | 	//========================================================================================================================
 313 | 	// initial checks
 314 | 	// 		is this a supported core?  (CPUID Family/Model)
 315 | 	//      Every processor that I am going to see will be Family 0x06 (no ExtFamily needed).
 316 | 	//      The DisplayModel field is (ExtModel<<4)+Model and should be 0x3F for all Xeon E5 v3 systems
 317 | 	int leaf = 1;
 318 | 	int subleaf = 0;
 319 | 	uint32_t eax, ebx, ecx, edx;
 320 | 	__asm__ __volatile__ ("cpuid" : \
 321 | 		  "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "a" (leaf), "c" (subleaf));
 322 | 
 323 | 	// Alternate form: 
 324 | 	// 		The compiler cpuid intrinsics are not documented by Intel -- they use the Microsoft format
 325 | 	// 			described at https://msdn.microsoft.com/en-us/library/hskdteyh.aspx
 326 | 	// 			__cpuid(array to hold eax,ebx,ecx,edx outputs, initial eax value)
 327 | 	// 			__cpuidex(array to hold eax,ebx,ecx,edx outputs, initial eax value, initial ecx value)
 328 | 	//      CPUID function 0x01 returns the model info in eax.
 329 | 	//      		27:20 ExtFamily	-- expect 0x00
 330 | 	//      		19:16 ExtModel	-- expect 0x3 for HSW, 0x5 for SKX
 331 | 	//      		11:8  Family	-- expect 0x6
 332 | 	//      		7:4   Model		-- expect 0xf for HSW, 0x5 for SKX
 333 | 	// __cpuid(&cpuid_return[0], 1);
 334 | 	// uint32_t ModelInfo = cpuid_return[0] & 0x0fff0ff0;	// mask out the reserved and "stepping" fields, leaving only the based and extended Family/Model fields
 335 | 
 336 | 	uint32_t ModelInfo = eax & 0x0fff0ff0;	// mask out the reserved and "stepping" fields, leaving only the based and extended Family/Model fields
 337 | 	if (ModelInfo != 0x00050650) {				// expected values for Skylake Xeon
 338 | 		fprintf(stderr,"ERROR -- this does not appear to be the correct processor type!!!\n");
 339 | 		fprintf(stderr,"ERROR -- Expected CPUID(0x01) Family/Model bits = 0x%x, but found 0x%x\n",0x00050650,ModelInfo);
 340 | 		exit(1);
 341 | 	}
 342 | 
 343 | #ifdef IMC_COUNTS
 344 | 	// ===================================================================================================================
 345 | 	// ------------------ REQUIRES ROOT PERMISSIONS ------------------
 346 | 	// open /dev/mem for PCI device access and mmap() a pointer to the beginning
 347 | 	// of the 256 MiB PCI Configuration Space.
 348 | 	// 		check VID/DID for uncore bus:device:function combinations
 349 | 	//   Note that using /dev/mem for PCI configuration space access is required for some devices on KNL.
 350 | 	//   It is not required on other systems, but it is not particularly inconvenient either.
 351 | 	sprintf(filename,"/dev/mem");
 352 | #ifdef DEBUG
 353 | 	fprintf(log_file,"opening %s\n",filename);
 354 | #endif
 355 | 	mem_fd = open(filename, O_RDWR);
 356 | 	if (mem_fd == -1) {
 357 | 		fprintf(stderr,"ERROR %s when trying to open %s\n",strerror(errno),filename);
 358 | 		exit(-1);
 359 | 	}
 360 | 	int map_prot = PROT_READ | PROT_WRITE;
 361 | 	mmconfig_ptr = mmap(NULL, mmconfig_size, map_prot, MAP_SHARED, mem_fd, mmconfig_base);
 362 |     if (mmconfig_ptr == MAP_FAILED) {
 363 |         fprintf(stderr,"cannot mmap base of PCI configuration space from /dev/mem: address %lx\n", mmconfig_base);
 364 |         exit(2);
 365 | #ifdef DEBUG
 366 |     } else {
 367 | 		fprintf(log_file,"Successful mmap of base of PCI configuration space from /dev/mem at address %lx\n", mmconfig_base);
 368 | #endif
 369 | 	}
 370 |     close(mem_fd);      // OK to close file after mmap() -- the mapping persists until unmap() or program exit
 371 | 
 372 | 	// New simple test that does not need to know the uncore bus numbers here...
 373 | 	// Skylake bus 0, Function 5, offset 0 -- Sky Lake-E MM/Vt-d Configuration Registers
 374 | 	//
 375 | 	// simple test -- should return "20248086" on Skylake Xeon EP -- DID 0x2024, VID 0x8086
 376 | 	bus = 0x00;
 377 | 	device = 0x5;
 378 | 	function = 0x0;
 379 | 	offset = 0x0;
 380 | 	index = PCI_cfg_index(bus, device, function, offset);
 381 |     value = mmconfig_ptr[index];
 382 | 	if (value != 0x20248086) {
 383 | 		fprintf(stderr,"ERROR: Bus %x device %x function %x offset %x expected %x, found %x\n",bus,device,function,offset,0x20248086,value);
 384 | 		exit(3);
 385 | #ifdef DEBUG
 386 | 	} else {
 387 | 		fprintf(log_file,"DEBUG: Well done! Bus %x device %x function %x offset %x returns expected value of %x\n",bus,device,function,offset,value);
 388 | #endif
 389 | 	}
 390 | #endif
 391 | 
 392 | #ifdef CHA_COUNTS
 393 | 	// ===================================================================================================================
 394 | 	// open the MSR driver using one core in socket 0 and one core in socket 1
 395 | 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 396 |     proc_in_pkg[0] = 0;                 // logical processor 0 is in socket 0 in all TACC systems
 397 |     proc_in_pkg[1] = nr_cpus-1;         // logical processor N-1 is in socket 1 in all TACC 2-socket systems
 398 | 	for (pkg=0; pkg<2; pkg++) {
 399 | 		sprintf(filename,"/dev/cpu/%d/msr",proc_in_pkg[pkg]);
 400 | 		msr_fd[pkg] = open(filename, O_RDWR);
 401 | 		if (msr_fd[pkg] == -1) {
 402 | 			fprintf(stderr,"ERROR %s when trying to open %s\n",strerror(errno),filename);
 403 | 			exit(-1);
 404 | 		}
 405 | 	}
 406 | 	for (pkg=0; pkg<2; pkg++) {
 407 | 		pread(msr_fd[pkg],&msr_val,sizeof(msr_val),IA32_TIME_STAMP_COUNTER);
 408 | 		fprintf(stdout,"DEBUG: TSC on core %d socket %d is %ld\n",proc_in_pkg[pkg],pkg,msr_val);
 409 | 	}
 410 | 
 411 | 	pread(msr_fd[0],&msr_val,sizeof(msr_val),0x186);
 412 | 	printf("Core PerfEvtSel0 0x%lx\n",msr_val);
 413 | 	pread(msr_fd[0],&msr_val,sizeof(msr_val),0x187);
 414 | 	printf("Core PerfEvtSel1 0x%lx\n",msr_val);
 415 | 	pread(msr_fd[0],&msr_val,sizeof(msr_val),0x188);
 416 | 	printf("Core PerfEvtSel2 0x%lx\n",msr_val);
 417 | 	pread(msr_fd[0],&msr_val,sizeof(msr_val),0x189);
 418 | 	printf("Core PerfEvtSel3 0x%lx\n",msr_val);
 419 | 
 420 | 
 421 | 	// Program the CHA mesh counters
 422 | 	//   Each CHA has a block of 16 MSRs reserved, of which 12 are used
 423 | 	//   The base for each CHA is 0xE00 + 0x10*CHA
 424 | 	//   Within each block:
 425 | 	//   	Unit Control is at offset 0x00
 426 | 	//   	CTL0, 1, 2, 3 are at offsets 0x01, 0x02, 0x03, 0x04
 427 | 	//   	CTR0, 1, 2, 3 are at offsets 0x08, 0x09, 0x0a, 0x0b
 428 | 	//   For the moment I think I can ignore the filter registers at offsets 0x05 and 0x06
 429 | 	//     and the status register at offset 0x07
 430 | 	//   The control register needs bit 22 set to enabled, then bits 15:8 as Umask and 7:0 as EventSelect
 431 | 	//   Mesh Events:
 432 | 	//   	HORZ_RING_BL_IN_USE = 0xab
 433 | 	//   		LEFT_EVEN = 0x01
 434 | 	//   		LEFT_ODD = 0x02
 435 | 	//   		RIGHT_EVEN = 0x04
 436 | 	//   		RIGHT_ODD = 0x08
 437 | 	//   	VERT_RING_BL_IN_USE = 0xaa
 438 | 	//   		UP_EVEN = 0x01
 439 | 	//   		UP_ODD = 0x02
 440 | 	//   		DN_EVEN = 0x04
 441 | 	//   		DN_ODD = 0x08
 442 | 	//   For starters, I will combine even and odd and create 4 events
 443 | 	//   	0x004003ab	HORZ_RING_BL_IN_USE.LEFT
 444 | 	//   	0x00400cab	HORZ_RING_BL_IN_USE.RIGHT
 445 | 	//   	0x004003aa	VERT_RING_BL_IN_USE.UP
 446 | 	//   	0x00400caa	VERT_RING_BL_IN_USE.DN
 447 | 
 448 | 	// first set to try....
 449 | 	cha_perfevtsel[0] = 0x004003ab;		// HORZ_RING_BL_IN_USE.LEFT
 450 | 	cha_perfevtsel[1] = 0x00400cab;		// HORZ_RING_BL_IN_USE.RIGHT
 451 | 	cha_perfevtsel[2] = 0x004003aa;		// VERT_RING_BL_IN_USE.UP
 452 | 	cha_perfevtsel[3] = 0x00400caa;		// VERT_RING_BL_IN_USE.DN
 453 | 
 454 | 	// second set to try....
 455 | //	cha_perfevtsel[0] = 0x004001ab;		// HORZ_RING_BL_IN_USE.LEFT_EVEN
 456 | //	cha_perfevtsel[1] = 0x004002ab;		// HORZ_RING_BL_IN_USE.LEFT_ODD
 457 | //	cha_perfevtsel[2] = 0x004004ab;		// HORZ_RING_BL_IN_USE.RIGHT_EVEN
 458 | //	cha_perfevtsel[3] = 0x004008ab;		// HORZ_RING_BL_IN_USE.RIGHT_ODD
 459 | 
 460 | 	// Snoop Filter Eviction counters
 461 | 	cha_perfevtsel[0] = 0x0040073d;		// SF_EVICTION S,E,M states
 462 | 	cha_perfevtsel[1] = 0x00400334;		// LLC_LOOKUP.DATA_READ	<-- requires CHA_FILTER0[26:17]
 463 | 	cha_perfevtsel[2] = 0x00400534;		// LLC_LOOKUP.DATA_WRITE (WB from L2) <-- requires CHA_FILTER0[26:17]
 464 | 	cha_perfevtsel[3] = 0x0040af37;		// LLC_VICTIMS.TOTAL (MESF) (does not count clean victims)
 465 | 	uint64_t cha_filter0 = 0x01e20000;		// set bits 24,23,22,21,17 FMESI -- all LLC lookups, no SF lookups
 466 | 
 467 | 	printf("CHA PerfEvtSel0 0x%lx\n",cha_perfevtsel[0]);
 468 | 	printf("CHA PerfEvtSel1 0x%lx\n",cha_perfevtsel[1]);
 469 | 	printf("CHA PerfEvtSel2 0x%lx\n",cha_perfevtsel[2]);
 470 | 	printf("CHA PerfEvtSel3 0x%lx\n",cha_perfevtsel[3]);
 471 | 	printf("CHA FILTER0 0x%lx\n",cha_filter0);
 472 | 
 473 | #ifdef VERBOSE
 474 | 	printf("VERBOSE: programming CHA counters\n");
 475 | #endif
 476 | 
 477 | 	for (pkg=0; pkg<2; pkg++) {
 478 | 		for (tile=0; tile<NUM_CHA_USED; tile++) {
 479 | 			msr_num = 0xe00 + 0x10*tile;		// box control register -- set enable bit
 480 | 			msr_val = 0x00400000;
 481 | 			pwrite(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
 482 | 			msr_num = 0xe00 + 0x10*tile + 1;	// ctl0
 483 | 			msr_val = cha_perfevtsel[0];
 484 | 			pwrite(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
 485 | 			msr_num = 0xe00 + 0x10*tile + 2;	// ctl1
 486 | 			msr_val = cha_perfevtsel[1];
 487 | 			pwrite(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
 488 | 			msr_num = 0xe00 + 0x10*tile + 3;	// ctl2
 489 | 			msr_val = cha_perfevtsel[2];
 490 | 			pwrite(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
 491 | 			msr_num = 0xe00 + 0x10*tile + 4;	// ctl3
 492 | 			msr_val = cha_perfevtsel[3];
 493 | 			pwrite(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
 494 | 			msr_num = 0xe00 + 0x10*tile + 5;	// filter0
 495 | 			msr_val = cha_filter0;				// bits 24:21,17 FMESI -- all LLC lookups, not not SF lookups
 496 | 			pwrite(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
 497 | 		}
 498 | 	}
 499 | #ifdef VERBOSE
 500 | 	printf("VERBOSE: finished programming CHA counters\n");
 501 | #endif
 502 | #endif
 503 | 
 504 | #ifdef IMC_COUNTS
 505 | 	// ===================================================================================================================
 506 | 	// Read the current programming of the IMC counters and look for the standard values (in this order)
 507 | 	//     CAS_COUNT.READS		Event 0x04, Umask 0x03
 508 | 	//     CAS_COUNT.WRITES		Event 0x04, Umask 0x0C
 509 | 	//     ACT.ALL				Event 0x01, Umask 0x0B
 510 | 	//     PRE_COUNT.MISS		Event 0x02, Umask 0x01
 511 | 	//     DCLK
 512 | 
 513 | #ifdef VERBOSE
 514 | 	printf("Preparing to program IMC counters\n");
 515 | #endif
 516 | 	// expected values of IMC performance counter event select control registers
 517 | 	imc_perfevtsel[0] = 0x00400304;		// CAS_COUNT.READS
 518 | 	imc_perfevtsel[1] = 0x00400C04;		// CAS_COUNT.WRITES
 519 | 	imc_perfevtsel[2] = 0x00400B01;		// ACT_COUNT.ALL
 520 | 	imc_perfevtsel[3] = 0x00400102;		// PRE_COUNT.MISS
 521 | 	imc_perfevtsel[4] = 0x00400000;		// DCLK
 522 | 
 523 | 	imc_vid_did[0] = 0x20428086;		// all channel 0 devices are 2042
 524 | 	imc_vid_did[1] = 0x20468086;		// all channel 1 devices are 2046
 525 | 	imc_vid_did[2] = 0x204a8086;		// all channel 2 devices are 204a
 526 | 
 527 | 	printf("IMC PerfEvtSel0 0x%lx\n",imc_perfevtsel[0]);
 528 | 	printf("IMC PerfEvtSel1 0x%lx\n",imc_perfevtsel[1]);
 529 | 	printf("IMC PerfEvtSel2 0x%lx\n",imc_perfevtsel[2]);
 530 | 	printf("IMC PerfEvtSel3 0x%lx\n",imc_perfevtsel[3]);
 531 | 	printf("IMC PerfEvtSel4 0x%lx\n",imc_perfevtsel[4]);
 532 | 
 533 | 	// print the full wall-clock time in seconds and microseconds
 534 | 	// assume both components of tp struct are longs.
 535 | 	fprintf(stdout,"# %s\n", rcsid);
 536 |     i = gettimeofday(&tp,&tzp);
 537 | 	fprintf(stdout,"%ld %ld\n", tp.tv_sec,tp.tv_usec);
 538 | 
 539 | 	for (socket=0; socket<NUM_SOCKETS; socket++) {
 540 | 		bus = IMC_BUS_Socket[socket];
 541 | #ifdef VERBOSE
 542 | 		printf("VERBOSE: socket %d bus %d\n",socket,bus);
 543 | #endif
 544 | 		for (channel=0; channel<NUM_IMC_CHANNELS; channel++) {
 545 | 			device = IMC_Device_Channel[channel];
 546 | 			function = IMC_Function_Channel[channel];
 547 | #ifdef VERBOSE
 548 | 			printf("VERBOSE: channel %d device %d function %d\n",channel, device, function);
 549 | #endif
 550 | 			// check to make sure this is the correct device
 551 | 			offset = 0x0;
 552 | 			index = PCI_cfg_index(bus, device, function, offset);
 553 | 			value = mmconfig_ptr[index];
 554 | 			if ( value != imc_vid_did[channel%3]) {
 555 | 				fprintf(stderr,"WARNING!!!! socket %d, channel %d has vid_did %x but should be %x\n",socket,channel,value,imc_vid_did[channel%3]);
 556 | 			}
 557 | 			for (counter=0; counter<NUM_IMC_COUNTERS; counter++) {
 558 | 				// check to see if this unit is programmed correctly and reprogram if needed
 559 | 				offset = IMC_PmonCtl_Offset[counter];
 560 | 				index = PCI_cfg_index(bus, device, function, offset);
 561 | 				value = mmconfig_ptr[index];
 562 | 				if ( value != imc_perfevtsel[counter]) {
 563 | 					fprintf(stderr,"WARNING!!!! socket %d, channel %d has perfevtsel %x but should be %x -- reprogramming\n",socket,channel,value,imc_perfevtsel[counter]);
 564 | 					mmconfig_ptr[index] = imc_perfevtsel[counter];
 565 | 				}
 566 | 
 567 | 			}
 568 | 		}
 569 | 	}
 570 | #endif
 571 | 
 572 | // ========= END OF PERFORMANCE COUNTER SETUP ========================================================================
 573 | 
 574 | #ifdef MAP_L3
 575 | // ============== BEGIN L3 MAPPING TESTS ==============================
 576 | // For each of the PAGES_MAPPED 2MiB pages:
 577 | //   1. Use "access()" to see if the mapping file already exists.
 578 | //		If exists:
 579 | //   		2. Use "stat()" to make sure the file is the correct size
 580 | //   		   If right size:
 581 | //   		   	3. Read the contents into the 32768-element int8_t array of L3 numbers.
 582 | //   		   Else (wrong size):
 583 | //   		   	4. Abort and tell the user to fix it manually.
 584 | //   	Else (not exists):
 585 | //   		4. Call the mapping function to re-compute the map
 586 | //   		5. Create mapping file
 587 | //   		6. Save data in mapping file
 588 | //   		7. Close output file
 589 | 
 590 | 	FILE *ptr_mapping_file;
 591 | 	int needs_mapping;
 592 | 	int good, good_old, good_new, pass1, pass2, pass3, found, numtries;
 593 | 	int min_count, max_count, sum_count, old_cha;
 594 | 	double avg_count, goodness1, goodness2, goodness3;
 595 | 	int globalsum = 0;
 596 | 	long totaltries = 0;
 597 | 	int NFLUSHES = 1000;
 598 | 	for (page_number=0; page_number<PAGES_MAPPED; page_number++) {
 599 | 		needs_mapping=0;
 600 | 		sprintf(filename,"PADDR_0x%.12lx.map",paddr_by_page[page_number]);
 601 | 		i = access(filename, F_OK);
 602 | 		if (i == -1) {								// file does not exist
 603 | 			printf("DEBUG: Mapping file %s does not exist -- will create file after mapping cache lines\n",filename);
 604 | 			needs_mapping = 1;
 605 | 		} else {									// file exists
 606 | 			i = access(filename, R_OK);
 607 | 			if (i == -1) {							// file exists without read permissions
 608 | 				printf("ERROR: Mapping file %s exists, but without read permission\n",filename);
 609 | 				exit(1);
 610 | 			} else {								// file exists with read permissions
 611 | 				ptr_mapping_file = fopen(filename,"r");
 612 | 				if (!ptr_mapping_file) {
 613 | 					printf("ERROR: Failed to open Mapping File %s, should not happen\n",filename);
 614 | 					exit(2);
 615 | 				}
 616 | 				k = fread(&cha_by_page[page_number][0],(size_t) 32768,(size_t) 1,ptr_mapping_file);
 617 | 				if (k != 1) {					// incorrect read length
 618 | 					printf("ERROR: Read from Mapping File %s, returned the wrong record count %ld expected 1\n",filename,k);
 619 | 					exit(3);
 620 | 				} else {							// correct read length
 621 | 					printf("DEBUG: Mapping File read for %s succeeded -- skipping mapping for this page\n",filename);
 622 | 					needs_mapping = 0;
 623 | 				}
 624 | 			}
 625 | 		}
 626 | 		if (needs_mapping == 1) {
 627 | 			// code imported from SystemMirrors/Hikari/MemSuite/InterventionLatency/L3_mapping.c
 628 | #ifdef VERBOSE
 629 | 			printf("DEBUG: here I need to perform the mapping for paddr 0x%.12lx, and then save the file\n",paddr_by_page[page_number]);
 630 | #endif
 631 | 			page_base_index = page_number*262144;		// index of element at beginning of current 2MiB page
 632 | 			for (line_number=0; line_number<32768; line_number++) {
 633 | 				good = 0;
 634 | 				good_old = 0;
 635 | 				good_new = 0;
 636 | 				numtries = 0;
 637 | #ifdef VERBOSE
 638 | 				if (line_number%64 == 0) {
 639 | 					pagemapentry = get_pagemap_entry(&array[page_base_index+line_number*8]);
 640 | 					printf("DEBUG: page_base_index %ld line_number %ld index %ld pagemapentry 0x%lx\n",page_base_index,line_number,page_base_index+line_number*8,pagemapentry);
 641 | 				}
 642 | #endif
 643 | 				do  {               // -------------- Inner Repeat Loop until results pass "goodness" tests --------------
 644 | 					numtries++;
 645 | 					if (numtries > 100) {
 646 | 						printf("ERROR: No good results for line %d after %d tries\n",line_number,numtries);
 647 | 						exit(101);
 648 | 					}
 649 | 					totaltries++;
 650 | 				// 1. read L3 counters before starting test
 651 | 				for (tile=0; tile<NUM_CHA_USED; tile++) {
 652 | 					msr_num = 0xe00 + 0x10*tile + 0x8 + 1;				// counter 1 is the LLC_LOOKUPS.READ event
 653 | 					pread(msr_fd[0],&msr_val,sizeof(msr_val),msr_num);
 654 | 					cha_counts[0][tile][1][0] = msr_val;					//  use the array I have already declared for cha counts
 655 | 					// printf("DEBUG: page %ld line %ld msr_num 0x%x msr_val %ld cha_counter1 %lu\n",
 656 | 					//		page_number,line_number,msr_num,msr_val,cha_counts[0][tile][1][0]);
 657 | 				}
 658 | 
 659 | 				// 2. Access the line many times
 660 | 				sum = 0;
 661 | 				for (i=0; i<NFLUSHES; i++) {
 662 | 					sum += array[page_base_index+line_number*8];
 663 | 					_mm_mfence();
 664 | 					_mm_clflush(&array[page_base_index+line_number*8]);
 665 | 					_mm_mfence();
 666 | 				}
 667 | 				globalsum += sum;
 668 | 
 669 | 				// 3. read L3 counters after loads are done
 670 | 				for (tile=0; tile<NUM_CHA_USED; tile++) {
 671 | 					msr_num = 0xe00 + 0x10*tile + 0x8 + 1;				// counter 1 is the LLC_LOOKUPS.READ event
 672 | 					pread(msr_fd[0],&msr_val,sizeof(msr_val),msr_num);
 673 | 					cha_counts[0][tile][1][1] = msr_val;					//  use the array I have already declared for cha counts
 674 | 				}
 675 | 
 676 | #ifdef VERBOSE
 677 | 				for (tile=0; tile<NUM_CHA_USED; tile++) {
 678 | 					printf("DEBUG: page %ld line %ld cha_counter1_after %lu cha_counter1 before %lu delta %lu\n",
 679 | 							page_number,line_number,cha_counts[0][tile][1][1],cha_counts[0][tile][1][0],cha_counts[0][tile][1][1]-cha_counts[0][tile][1][0]);
 680 | 				}
 681 | #endif
 682 | 
 683 | 				//   CHA counter 1 set to LLC_LOOKUP.READ
 684 | 				//
 685 | 				//  4. Determine which L3 slice owns the cache line and
 686 | 				//  5. Save the CHA number in the cha_by_page[page][line] array
 687 | 
 688 | 				// first do a rough quantitative checks of the "goodness" of the data
 689 | 				//		goodness1 = max/NFLUSHES (pass if >95%)
 690 | 				// 		goodness2 = min/NFLUSHES (pass if <20%)
 691 | 				//		goodness3 = avg/NFLUSHES (pass if <40%)
 692 | 				max_count = 0;
 693 | 				min_count = 1<<30;
 694 | 				sum_count = 0;
 695 | 				for (tile=0; tile<NUM_CHA_USED; tile++) {
 696 | 					max_count = MAX(max_count, cha_counts[0][tile][1][1]-cha_counts[0][tile][1][0]);
 697 | 					min_count = MIN(min_count, cha_counts[0][tile][1][1]-cha_counts[0][tile][1][0]);
 698 | 					sum_count += cha_counts[0][tile][1][1]-cha_counts[0][tile][1][0];
 699 | 				}
 700 | 				avg_count = (double)(sum_count - max_count) / (double)(NUM_CHA_USED);
 701 | 				goodness1 = (double) max_count / (double) NFLUSHES;
 702 | 				goodness2 = (double) min_count / (double) NFLUSHES;
 703 | 				goodness3 =          avg_count / (double) NFLUSHES;
 704 | 				// compare the goodness parameters with manually chosen limits & combine into a single pass (good=1) or fail (good=0)
 705 | 				pass1 = 0;
 706 | 				pass2 = 0;
 707 | 				pass3 = 0;
 708 | 				if ( goodness1 > 0.95 ) pass1 = 1;
 709 | 				if ( goodness2 < 0.20 ) pass2 = 1;
 710 | 				if ( goodness3 < 0.40 ) pass3 = 1;
 711 | 				good_new = pass1 * pass2 * pass3;
 712 | #ifdef VERBOSE
 713 | 				printf("GOODNESS: line_number %ld max_count %d min_count %d sum_count %d avg_count %f goodness1 %f goodness2 %f goodness3 %f pass123 %d %d %d\n",
 714 | 								  line_number, max_count, min_count, sum_count, avg_count, goodness1, goodness2, goodness3, pass1, pass2, pass3);
 715 | 				if (good_new == 0) printf("DEBUG: one or more of the sanity checks failed for line=%ld: %d %d %d goodness values %f %f %f\n",
 716 | 					line_number,pass1,pass2,pass3,goodness1,goodness2,goodness3);
 717 | #endif
 718 | 
 719 | 				// test to see if more than one CHA reports > 0.95*NFLUSHES events
 720 | 				found = 0;
 721 | 				old_cha = -1;
 722 | 				int min_counts = (NFLUSHES*19)/20;
 723 | 				for (tile=0; tile<NUM_CHA_USED; tile++) {
 724 | 					if (cha_counts[0][tile][1][1]-cha_counts[0][tile][1][0] >= min_counts) {
 725 | 						old_cha = cha_by_page[page_number][line_number];
 726 | 						cha_by_page[page_number][line_number] = tile;
 727 | 						found++;
 728 | #ifdef VERBOSE
 729 | 						if (found > 1) {
 730 | 							printf("WARNING: Multiple (%d) CHAs found using counter 1 for cache line %ld, index %ld: old_cha %d new_cha %d\n",found,line_number,page_base_index+line_number*8,old_cha,cha_by_page[page_number][line_number]);
 731 | 						}
 732 | #endif
 733 | 					}
 734 | 				}
 735 | 				if (found == 0) {
 736 | 					good_old = 0;
 737 | #ifdef VERBOSE
 738 | 					printf("WARNING: no CHA entry has been found for line %ld!\n",line_number);
 739 | 					printf("DEBUG dump for no CHA found\n");
 740 | 					for (tile=0; tile<NUM_CHA_USED; tile++) {
 741 | 						printf("CHA %d LLC_LOOKUP.READ          delta %ld\n",tile,(cha_counts[0][tile][1][1]-cha_counts[0][tile][1][0]));
 742 | 					}
 743 | #endif
 744 | 				} else if (found == 1) {
 745 | 					good_old = 1;
 746 | 				} else {
 747 | 					good_old = 0;
 748 | #ifdef VERBOSE
 749 | 					printf("DEBUG dump for multiple CHAs found\n");
 750 | 					for (tile=0; tile<NUM_CHA_USED; tile++) {
 751 | 						printf("CHA %d LLC_LOOKUP.READ          delta %ld\n",tile,(cha_counts[0][tile][1][1]-cha_counts[0][tile][1][0]));
 752 | 					}
 753 | #endif
 754 | 				}
 755 | 				good = good_new * good_old;         // trigger a repeat if either the old or new tests failed
 756 | 				}
 757 | 				while (good == 0);
 758 | #if 0
 759 | 				// 6. save the cache line number in the appropriate the cbo_indices[cbo][#lines] array
 760 | 				// 7. increment the corresponding cbo_num_lines[cbo] array entry
 761 | 				this_cbo = cha_by_page[page_number][line_number];
 762 | 				if (this_cbo == -1) {
 763 | 					printf("ERROR: cha_by_page[%ld][%ld] has not been set!\n",page_number,line_number);
 764 | 					exit(80);
 765 | 				}
 766 | 				cbo_indices[this_cbo][cbo_num_lines[this_cbo]] = line_number;
 767 | 				cbo_num_lines[this_cbo]++;
 768 | #endif
 769 | 			}
 770 | 			// I have not overwritten the filename, but I will rebuild it here just in case I add something stupid in between....
 771 | 			sprintf(filename,"PADDR_0x%.12lx.map",paddr_by_page[page_number]);
 772 | 			ptr_mapping_file = fopen(filename,"w");
 773 | 			if (!ptr_mapping_file) {
 774 | 				printf("ERROR: Failed to open Mapping File %s for writing -- aborting\n",filename);
 775 | 				exit(4);
 776 | 			}
 777 | 			// first try -- write one record of 32768 bytes
 778 | 			rc64 = fwrite(&cha_by_page[page_number][0],(size_t) 32768, (size_t) 1, ptr_mapping_file);
 779 | 			if (rc64 != 1) {
 780 | 				printf("ERROR: failed to write one 32768 Byte record to  %s -- return code %ld\n",filename,rc64);
 781 | 				exit(5);
 782 | 			} else {
 783 | 				printf("SUCCESS: wrote mapping file %s\n",filename);
 784 | 			}
 785 | 		}
 786 | 	}
 787 | 	printf("DUMMY: globalsum %d\n",globalsum);
 788 | 	printf("VERBOSE: L3 Mapping Complete in %ld tries for %d cache lines ratio %f\n",totaltries,32768*PAGES_MAPPED,(double)totaltries/(double)(32768*PAGES_MAPPED));
 789 | 
 790 | #ifndef MYHUGEPAGE_1GB
 791 | 	// TODO!! Fix this so that it is not hard-coded for the 24p case!!
 792 | 	//
 793 | 	// now that the mapping is complete, I can add up the number of lines mapped to each CHA
 794 | 	// be careful to count only the lines that are used, not the full 24MiB
 795 | 	// 3 million elements is ~11.44 2MiB pages, so count all lines in each of the first 11 pages
 796 | 	// If I did the arithmetic correctly, the 3 million elements uses 931328 Bytes of the 12th 2MiB page
 797 | 	// which is 116416 elements or 14552 cache lines.
 798 | 
 799 | 	// first accumulate the first 11 full pages
 800 | 	for (page_number=0; page_number<11; page_number++) {
 801 | 		for (line_number=0; line_number<32768; line_number++) {
 802 | 			lines_by_cha[cha_by_page[page_number][line_number]]++;
 803 | 		}
 804 | 	}
 805 | 	// then accumulate the partial 12th page
 806 | 	for (line_number=0; line_number<14552; line_number++) {
 807 | 		lines_by_cha[cha_by_page[11][line_number]]++;
 808 | 	}
 809 | 	// output
 810 | 	long lines_accounted = 0;
 811 | 	printf("LINES_BY_CHA");
 812 | 	for (i=0; i<NUM_CHA_USED; i++) {
 813 | 		printf(" %ld",lines_by_cha[i]);
 814 | 		lines_accounted += lines_by_cha[i];
 815 | 	}
 816 | 	printf("\n");
 817 | 	printf("ACCCOUNTED FOR %ld lines expected %ld lines\n",lines_accounted,l2_contained_size/8);
 818 | #endif
 819 | 
 820 | // ============== END L3 MAPPING TESTS ==============================
 821 | #endif
 822 | 
 823 | 
 824 | 	// NEW LOOP STRUCTURE -- MCCALPIN
 825 | 	// I want to run the test at various offsets within each of the 1GiB
 826 | 	// pages allocated.
 827 | 	// Start with repeating the test for the beginning of each 1GiB page.
 828 | 	// I can simply add 134,217,728 to the jstart and jend values to 
 829 | 	// move to the next 1GiB page
 830 | 
 831 | 	printf("DEBUG: jstart[0] = %ld\n",jstart[0]);
 832 | 	long current_page;
 833 | 	for (current_page=0; current_page < NUMPAGES; current_page++) {
 834 | 		if (current_page > 0) {
 835 | 			for (i=0; i<CORES_USED; i++) {
 836 | 				jstart[i] += 134217728;
 837 | 				jend[i] += 134217728;
 838 | 			}
 839 | 			printf("DEBUG: jstart[0] = %ld\n",jstart[0]);
 840 | 		}
 841 | 
 842 | 
 843 | 	// For the snoop filter tests, I want to repeatedly read 
 844 | 	// some number of arrays per core with an aggregate footprint
 845 | 	// close to 1MiB per core 
 846 | 	// 24 cores = 24 MiB = 3 Mi elements, so 
 847 | 	// using an array length of 3 million should be just about right 95.3674%
 848 | 
 849 | 	// l2_contained_size = arraylen;			// only use if I want a large memory-contained version
 850 | 	inner_repetitions = 1000;
 851 | 	int stride = 2;		// used in thread binding checks: use 2 for Dell nodes, 1 for Intel nodes
 852 | 
 853 | 	// try to pre-load the working data into the L2 caches before the initial performance counter reads
 854 | 	sum = 0.0;
 855 | #pragma omp parallel for reduction(+:sum)
 856 | 	for (j=jstart[0]; j<jstart[0]+l2_contained_size; j++) sum += array[j];
 857 | 
 858 | 	// While I am at it, I need to warm up the cores using AVX-512 code to get them to full frequency
 859 | 	// This may take up to 100 microseconds, or maybe 400,000 AVX512 instructions per thread.
 860 | 	// This is a pain because I can't trust the compiler to generate AVX512 code at any given time,
 861 | 	// so I have to resort to inline assembly.
 862 | 	tsc_start = rdtsc();
 863 | #pragma omp parallel for 
 864 | 	for (i=0; i<CORES_USED; i++) {
 865 | 		for (j=0; j<10*1000*1000; j++) {
 866 | 			__asm__ __volatile__ (
 867 | 				"vpaddq %%zmm0, %%zmm1, %%zmm2\n\t"
 868 | 				"vpaddq %%zmm1, %%zmm2, %%zmm3\n\t"
 869 | 				"vpaddq %%zmm2, %%zmm3, %%zmm0\n\t"
 870 | 				"vpaddq %%zmm3, %%zmm0, %%zmm1"
 871 | 			: : : "zmm0","zmm1","zmm2","zmm3");
 872 | 		}
 873 | 	}
 874 | 	tsc_end = rdtsc();
 875 | 	printf("DEBUG: WARMUP LOOP took %lu TSC cycles\n",tsc_end - tsc_start);
 876 | 
 877 | 
 878 | // =================== BEGINNING OF PERFORMANCE COUNTER READS BEFORE KERNEL TESTING ==============================
 879 | #ifdef IMC_COUNTS
 880 | 	// read the initial values of the IMC counters
 881 |     for (socket=0; socket<NUM_SOCKETS; socket++) {
 882 |         bus = IMC_BUS_Socket[socket];
 883 |         for (channel=0; channel<NUM_IMC_CHANNELS; channel++) {
 884 |             device = IMC_Device_Channel[channel];
 885 |             function = IMC_Function_Channel[channel];
 886 |             for (counter=0; counter<NUM_IMC_COUNTERS; counter++) {
 887 |                 offset = IMC_PmonCtr_Offset[counter];
 888 |                 index = PCI_cfg_index(bus, device, function, offset);
 889 | 
 890 |                 // read each counter twice to identify rare cases where the low-order bits
 891 |                 // overflow and increment the high-order bits between the two reads.
 892 |                 // Use the second set of values unless (( high_1 != high_0 ) && ( low_1 > low_0))
 893 |                 //   (this indicates that the counter rolled between the 3rd and 4th reads).
 894 |                 low_0 = mmconfig_ptr[index];
 895 |                 high_0 = mmconfig_ptr[index+1];
 896 | 
 897 |                 low_1 = mmconfig_ptr[index];
 898 |                 high_1 = mmconfig_ptr[index+1];
 899 | 
 900 |                 if ( (high_1 != high_0) && (low_1 > low_0) ) {
 901 |                     count = ((uint64_t) high_0) << 32 | (uint64_t) low_0;
 902 |                 } else {
 903 |                     count = ((uint64_t) high_1) << 32 | (uint64_t) low_1;
 904 |                 }
 905 |                 imc_counts[socket][channel][counter][0] = count;
 906 |             }
 907 |         }
 908 |     }
 909 | #if 0
 910 | 	// for debugging only: print initial values of IMC counts
 911 | 	for (socket=0; socket<NUM_SOCKETS; socket++) {
 912 | 		for (channel=0; channel<NUM_IMC_CHANNELS; channel++) {
 913 | 			fprintf(stdout,"%d %d",socket,channel);
 914 | 			for (counter=0; counter<NUM_IMC_COUNTERS; counter++) {
 915 | 				fprintf(stdout," %ld",imc_counts[socket][channel][counter][0]);
 916 | 			}
 917 | 			fprintf(stdout,"\n");
 918 | 		}
 919 | 	}
 920 | #endif
 921 | #endif
 922 | 
 923 | #ifdef CHA_COUNTS
 924 | 	// read the initial values of the CHA mesh counters
 925 | 	for (pkg=0; pkg<2; pkg++) {
 926 | 		for (tile=0; tile<NUM_CHA_USED; tile++) {
 927 | 			for (counter=0; counter<4; counter++) {
 928 | 				msr_num = 0xe00 + 0x10*tile + 0x8 + counter;
 929 | 				pread(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
 930 | 				cha_counts[pkg][tile][counter][0] = msr_val;
 931 | 			}
 932 | 		}
 933 | 	}
 934 | #if 0
 935 | 	// for debugging only: print initial values of CHA counters
 936 | 	for (pkg=0; pkg<2; pkg++) {
 937 | 		for (tile=0; tile<NUM_CHA_USED; tile++) {
 938 | 			for (counter=0; counter<4; counter++) {
 939 | 				printf("Package %d, tile %d, counter %d, value %lu\n",pkg,tile,counter,cha_counts[pkg][tile][counter][0]);
 940 | 			}
 941 | 		}
 942 | 	}
 943 | #endif
 944 | #endif
 945 | 
 946 | 	// ------ read programmable core counters before test loop ------
 947 | 
 948 | #pragma omp parallel for private(counter)
 949 | 	for (i=0; i<CORES_USED; i++) {
 950 | #ifdef CHECK_THREAD_LOCATION
 951 | 		if (get_core_number() != stride*i) {
 952 | 			printf("ERROR: thread %d is in the wrong place %d\n",i,get_core_number());
 953 | 		}
 954 | #endif
 955 | 		for (counter=0; counter<4; counter++) {
 956 | 			core_counters[i][counter][0] = rdpmc(counter);
 957 | 		}
 958 | 	}
 959 | 
 960 | 	tsc_start = rdtsc();
 961 | 	// ===================================== CODE TO TEST BEGINS HERE =======================================================================
 962 | 
 963 | #ifdef SIMPLE_OMP_LOOP
 964 | 	for (k=0; k<inner_repetitions; k++) {
 965 | #pragma omp parallel for, reduction(+:sum)
 966 | 		for (j=jstart[0]; j<jstart[0]+l2_contained_size; j++) {
 967 | 			sum += array[j];
 968 | 		}
 969 | 	}
 970 | #ifdef CHECK_START_STOP
 971 | 	printf("CHECK_START_STOP: SIMPLE_OMP_LOOP: start %ld end %ld vl %ld\n",jstart[0],jstart[0]+l2_contained_size,l2_contained_size);
 972 | #endif
 973 | #else
 974 | #pragma omp parallel for private(j,k,iters,private_sum)
 975 | 	for (i=0; i<CORES_USED; i++) {
 976 | 		iters = 0;
 977 | 		partial_sums[i] = 0.0;
 978 | 		fixed_counters[i][0][0] = rdpmc_instructions();
 979 | 		fixed_counters[i][1][0] = rdpmc_actual_cycles();
 980 | 		fixed_counters[i][2][0] = rdpmc_reference_cycles();
 981 | 		fixed_counters[i][3][0] = rdtsc();
 982 | 		for (k=0; k<inner_repetitions; k++) {
 983 | 			private_sum = ssum(&array[jstart[i]],vl[i]);
 984 | 			partial_sums[i] += private_sum;
 985 | 			iters++;
 986 | 		}
 987 | 		fixed_counters[i][0][1] = rdpmc_instructions();
 988 | 		fixed_counters[i][1][1] = rdpmc_actual_cycles();
 989 | 		fixed_counters[i][2][1] = rdpmc_reference_cycles();
 990 | 		fixed_counters[i][3][1] = rdtsc();
 991 | 		iteration_counts[i] = iters;
 992 | 	}
 993 | #ifdef CHECK_START_STOP
 994 | 	for (i=0; i<CORES_USED; i++) {
 995 | 		printf("CHECK_START_STOP: PER-THREAD-INDICES: thread %d jstart %ld jstop %ld vl %ld\n",i,jstart[i],jend[i],vl[i]);
 996 | 	}
 997 | #endif
 998 | #endif
 999 | 
1000 | // ===================================== END OF CODE UNDER TEST  ========================================================
1001 | 	tsc_end = rdtsc();
1002 | 
1003 | 	// use the partial sums so the optimizer does not remove the actual code under test
1004 | 	for (i=0; i<CORES_USED; i++) {
1005 | 		sum += partial_sums[i];
1006 | 	}
1007 | 
1008 | #pragma omp parallel for private(counter)
1009 | 	for (i=0; i<CORES_USED; i++) {
1010 | #ifdef CHECK_THREAD_LOCATION
1011 | 		if (get_core_number() != stride*i) {
1012 | 			printf("ERROR: thread %d is in the wrong place %d\n",i,get_core_number());
1013 | 		}
1014 | #endif
1015 | 		for (counter=0; counter<4; counter++) {
1016 | 			core_counters[i][counter][1] = rdpmc(counter);
1017 | #ifdef CHECK_SPECIAL_VALUES
1018 | 			if (core_counters[i][counter][1] == SPECIAL_VALUE) {
1019 | 				printf("BADNESS: SPECIAL_VALUE value returned on thread %d counter %d\n",i,counter);
1020 | 			}
1021 | #endif
1022 | #ifdef RETRIES
1023 | 			// if the counter returns zero, read it one more time....
1024 | 			if (core_counters[i][counter][1] == SPECIAL_VALUE) {
1025 | 				core_counters[i][counter][1] = rdpmc(counter);
1026 | #pragma omp atomic update 
1027 | 				retries++;
1028 | 			}
1029 | #endif
1030 | 		}
1031 | 	}
1032 | 
1033 | #ifdef CHECK_SPECIAL_VALUES
1034 | 	for (i=0; i<CORES_USED; i++) {
1035 | 		for (counter=0; counter<4; counter++) {
1036 | 			if (core_counters[i][counter][0] == SPECIAL_VALUE) {
1037 | 				printf("DEBUG: SPECIAL_VALUE found after loop in start count on thread %d counter %d\n",i,counter);
1038 | 				zeros++;
1039 | 			}
1040 | 			if (core_counters[i][counter][1] == SPECIAL_VALUE) {
1041 | 				printf("DEBUG: SPECIAL_VALUE found after loop in end count on thread %d counter %d\n",i,counter);
1042 | 				zeros++;
1043 | 			}
1044 | 		}
1045 | 	}
1046 | #endif
1047 | 
1048 | #ifdef CHA_COUNTS
1049 | 	// read the final values of the CHA mesh counters
1050 | 	for (pkg=0; pkg<2; pkg++) {
1051 | 		for (tile=0; tile<NUM_CHA_USED; tile++) {
1052 | 			for (counter=0; counter<4; counter++) {
1053 | 				msr_num = 0xe00 + 0x10*tile + 0x8 + counter;
1054 | 				pread(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
1055 | 				cha_counts[pkg][tile][counter][1] = msr_val;
1056 | 			}
1057 | 		}
1058 | 	}
1059 | #endif
1060 | 
1061 | #ifdef IMC_COUNTS
1062 | 	for (socket=0; socket<NUM_SOCKETS; socket++) {
1063 | 		bus = IMC_BUS_Socket[socket];
1064 | 		for (channel=0; channel<NUM_IMC_CHANNELS; channel++) {
1065 | 			device = IMC_Device_Channel[channel];
1066 | 			function = IMC_Function_Channel[channel];
1067 | 			for (counter=0; counter<NUM_IMC_COUNTERS; counter++) {
1068 | 				offset = IMC_PmonCtr_Offset[counter];
1069 | 				index = PCI_cfg_index(bus, device, function, offset);
1070 | 
1071 | 				// read each counter twice to identify rare cases where the low-order bits
1072 | 				// overflow and increment the high-order bits between the two reads.
1073 | 				// Use the second set of values unless (( high_1 != high_0 ) && ( low_1 > low_0))
1074 | 				//   (this indicates that the counter rolled between the 3rd and 4th reads).
1075 | 				low_0 = mmconfig_ptr[index];
1076 | 				high_0 = mmconfig_ptr[index+1];
1077 | 
1078 | 				low_1 = mmconfig_ptr[index];
1079 | 				high_1 = mmconfig_ptr[index+1];
1080 | 
1081 | 				if ( (high_1 != high_0) && (low_1 > low_0) ) {
1082 | 					count = ((uint64_t) high_0) << 32 | (uint64_t) low_0;
1083 | 				} else {
1084 | 					count = ((uint64_t) high_1) << 32 | (uint64_t) low_1;
1085 | 				}
1086 | 				imc_counts[socket][channel][counter][1] = count;
1087 | 			}
1088 | 		}
1089 | 	}
1090 | #endif
1091 | // ================================== END OF PERFORMANCE COUNTER READS AFTER TEST  ==============================================
1092 | 
1093 | 	t0 = 0.0;
1094 | 	t1 = (double) (tsc_end - tsc_start) / TSC_GHz / 1.0e9;
1095 | 	printf("Instrumented code required %f seconds to execute\n",t1-t0);
1096 | 	bandwidth = sizeof(double)*(double)l2_contained_size*(double)inner_repetitions / (t1-t0) / 1e9;
1097 | 	printf("Bandwidth %f GB/s\n",bandwidth);
1098 | 	printf("Bandwidth per core %f GB/s\n",bandwidth/(double)CORES_USED);
1099 | 	printf("Approx Bytes/cycle per core %f\n",bandwidth/(double)CORES_USED/2.0);
1100 | 
1101 | 	expected = (double)l2_contained_size * (double)(inner_repetitions) / (double)CORES_USED;
1102 | 	avg_cycles = (double)(tsc_end - tsc_start) / expected;
1103 | 	printf("Average TSC cycles per element %f\n",avg_cycles);
1104 | 
1105 | 	// clear the arrays for the package-level sums
1106 | 	for (pkg=0; pkg<2; pkg++) {
1107 | 		for (counter=0; counter<4; counter++) {			// no point in summing the cycle counts, so exclude counter 4
1108 | 			core_pkg_sums[pkg][counter] = 0;
1109 | 			fixed_pkg_sums[pkg][counter] = 0;
1110 | 			imc_pkg_sums[pkg][counter] = 0;
1111 | 			cha_pkg_sums[pkg][counter] = 0;
1112 | 		}
1113 | 	}
1114 | 
1115 | 	// compute core package sums and optional print
1116 | 	for (i=0; i<CORES_USED; i++) {
1117 | 		for (counter=0; counter<4; counter++) {
1118 | 			delta = corrected_pmc_delta(fixed_counters[i][counter][1],fixed_counters[i][counter][0],fixed_pmc_width);
1119 | 			fixed_pkg_sums[0][counter] += delta;
1120 | 		}
1121 | 		for (counter=0; counter<4; counter++) {
1122 | #ifdef CHECK_SPECIAL_VALUES
1123 | 			if (core_counters[i][counter][0] == SPECIAL_VALUE) {
1124 | 				printf("DEBUG: SPECIAL_VALUE found in post-processing in start count on thread %d counter %d\n",i,counter);
1125 | 			}
1126 | 			if (core_counters[i][counter][1] == SPECIAL_VALUE) {
1127 | 				printf("DEBUG: SPECIAL_VALUE found in post-processing in end count on thread %d counter %d\n",i,counter);
1128 | 			}
1129 | #endif
1130 | 			delta = corrected_pmc_delta(core_counters[i][counter][1],core_counters[i][counter][0],core_pmc_width);
1131 | #ifdef VERBOSE
1132 | 			printf("CORE %d counter %d end %ld start %ld delta %ld\n",i,counter,core_counters[i][counter][1],core_counters[i][counter][0],delta);
1133 | #endif
1134 | 			core_pkg_sums[0][counter] += delta;
1135 | 		}
1136 | 
1137 | 
1138 | 	}
1139 | 
1140 | 	if (dumpall == 1) {
1141 | 		report = 0;
1142 | 		for (i=0; i<CORES_USED; i++) {
1143 | 			for (counter=0; counter<4; counter++) {
1144 | 				delta = corrected_pmc_delta(core_counters[i][counter][1],core_counters[i][counter][0],core_pmc_width);
1145 | 				printf("CORE %d counter %d end %ld start %ld delta %ld\n",i,counter,core_counters[i][counter][1],core_counters[i][counter][0],delta);
1146 | 			}
1147 | 		}
1148 | 	}
1149 | 	report = 1;
1150 | 	dumpall = 0;
1151 | 
1152 | #ifdef CHA_COUNTS
1153 | 	// print out the differences and compute sums of differences
1154 | 	for (pkg=0; pkg<2; pkg++) {
1155 | 		for (tile=0; tile<NUM_CHA_USED; tile++) {
1156 | 			for (counter=0; counter<4; counter++) {
1157 | 				delta = corrected_pmc_delta(cha_counts[pkg][tile][counter][1],cha_counts[pkg][tile][counter][0],uncore_pmc_width);
1158 | #ifdef VERBOSE
1159 | 				printf("CHA pkg %d tile %d counter %d delta %ld\n",pkg,tile,counter,delta);
1160 | #endif
1161 | 				cha_pkg_sums[pkg][counter] += delta;
1162 | 			}
1163 | 		}
1164 | 	}
1165 | #endif
1166 | #ifdef IMC_COUNTS
1167 | 	for (pkg=0; pkg<2; pkg++) {
1168 | 		for (channel=0; channel<NUM_IMC_CHANNELS; channel++) {
1169 | 			for (counter=0; counter<NUM_IMC_COUNTERS; counter++) {
1170 | 				delta = corrected_pmc_delta(imc_counts[pkg][channel][counter][1],imc_counts[pkg][channel][counter][0],uncore_pmc_width);
1171 | #ifdef VERBOSE
1172 | 				printf("IMC pkg %d channel %d counter %d delta %ld\n",pkg,channel,counter,delta);
1173 | #endif
1174 | 				imc_pkg_sums[pkg][counter] += delta;
1175 | 			}
1176 | 		}
1177 | 	}
1178 | #endif
1179 | 
1180 | 	int max_display_pkg = 1;
1181 | 
1182 | 	for (pkg=0; pkg<max_display_pkg; pkg++) {
1183 | 		for (counter=0; counter<4; counter++) {
1184 | 			printf("CORE_PKG_SUMS pkg %d counter %d sum_delta %ld\n",pkg,counter,core_pkg_sums[pkg][counter]);
1185 | 		}
1186 | 	}
1187 | 
1188 | 	for (pkg=0; pkg<max_display_pkg; pkg++) {
1189 | 		for (counter=0; counter<4; counter++) {
1190 | 			printf("FIXED_PKG_SUMS pkg %d counter %d sum_delta %ld\n",pkg,counter,fixed_pkg_sums[pkg][counter]);
1191 | 		}
1192 | 	}
1193 | 
1194 | 	// the fixed-function counters are measured inside the OpenMP loop, so they should not be contaminated by 
1195 | 	// spin-waiting....
1196 | 	// Compute per-core metrics here -- note that the fixed-function counter set is (Instr, CoreCyc, RefCyc, TSC)
1197 | 	// 		Utilization = RefCyc/TSC (fixed2/fixed3)
1198 | 	// 		AvgGHz_unhalted = CoreCyc/RefCyc * 2.1  (fixed1/fixed2 * 2.1)
1199 | 	// 		AvgGHz_wall = CoreCyc/TSC * 2.1 (fixed1/fixed3 * 2.1)
1200 | 	// 		IPC = Instr/CoreCyc (fixed0/fixed1)
1201 | 	long delta_inst, delta_core, delta_ref, delta_tsc;
1202 | 	double utilization, avg_ghz, ipc;
1203 | 
1204 | 	printf("CORE_UTILIZATION ");
1205 | 	for (i=0; i<CORES_USED; i++) {
1206 | 		delta_ref  = corrected_pmc_delta(fixed_counters[i][2][1],fixed_counters[i][2][0],fixed_pmc_width);
1207 | 		delta_tsc  = corrected_pmc_delta(fixed_counters[i][3][1],fixed_counters[i][3][0],fixed_pmc_width);
1208 | 		utilization = (double)delta_ref / (double)delta_tsc;
1209 | 		printf("%6.4f ",utilization);
1210 | 	}
1211 | 	printf("\n");
1212 | 
1213 | 	float TSC_GHz;
1214 | 	TSC_GHz = get_TSC_frequency()/1.0e9;
1215 | 	printf("CORE_GHZ ");
1216 | 	for (i=0; i<CORES_USED; i++) {
1217 | 		delta_core = corrected_pmc_delta(fixed_counters[i][1][1],fixed_counters[i][1][0],fixed_pmc_width);
1218 | 		delta_ref  = corrected_pmc_delta(fixed_counters[i][2][1],fixed_counters[i][2][0],fixed_pmc_width);
1219 | 		avg_ghz = (double)delta_core / (double)delta_ref * TSC_GHz;
1220 | 		printf("%6.4f ",avg_ghz);
1221 | 	}
1222 | 	printf("\n");
1223 | 
1224 | 	printf("CORE_IPC ");
1225 | 	for (i=0; i<CORES_USED; i++) {
1226 | 		delta_inst = corrected_pmc_delta(fixed_counters[i][0][1],fixed_counters[i][0][0],fixed_pmc_width);
1227 | 		delta_core = corrected_pmc_delta(fixed_counters[i][1][1],fixed_counters[i][1][0],fixed_pmc_width);
1228 | 		ipc = (double)delta_inst / (double)delta_core;
1229 | 		printf("%6.4f ",ipc);
1230 | 	}
1231 | 	printf("\n");
1232 | 
1233 | 	printf("THREAD_EXECUTION_TIME ");
1234 | 	for (i=0; i<CORES_USED; i++) {
1235 | 		delta_tsc  = corrected_pmc_delta(fixed_counters[i][3][1],fixed_counters[i][3][0],fixed_pmc_width);
1236 | 		t0 = (double)delta_tsc / (TSC_GHz*1.0e9);
1237 | 		printf("%f ",t0);
1238 | 	}
1239 | 	printf("\n");
1240 | 
1241 | 
1242 | 
1243 | #ifdef CHA_COUNTS
1244 | 	for (pkg=0; pkg<max_display_pkg; pkg++) {
1245 | 		for (counter=0; counter<4; counter++) {
1246 | 			printf("CHA_PKG_SUMS pkg %d counter %d sum_delta %ld\n",pkg,counter,cha_pkg_sums[pkg][counter]);
1247 | 		}
1248 | 	}
1249 | #endif
1250 | #ifdef IMC_COUNTS
1251 | 	for (pkg=0; pkg<max_display_pkg; pkg++) {
1252 | 		for (counter=0; counter<4; counter++) {			// no point in summing the cycle counts, so exclude counter 4
1253 | 			printf("IMC_PKG_SUMS pkg %d counter %d sum_delta %ld\n",pkg,counter,imc_pkg_sums[pkg][counter]);
1254 | 		}
1255 | 	}
1256 | #endif
1257 | 
1258 | 	
1259 | 
1260 | 
1261 | 	// for the Snoop Filter set 
1262 | 	// 	expected = expected number of cache lines loaded from L2
1263 | 	// 	sf_evict_rate = #evictions / expected number of loads
1264 | 	expected = 8.0/64.0* (double)l2_contained_size * (double) inner_repetitions;
1265 | 	sf_evict_rate = (double) cha_pkg_sums[0][0] / expected;
1266 | 	printf("SnoopFilterEvictionRate %f\n",sf_evict_rate);
1267 | 
1268 | 	expected = (double)l2_contained_size * (double) (inner_repetitions+1); // adjusted for pre-load of data
1269 | 	printf("Dummy Sum value is %f, expected value %f\n",sum,expected);
1270 | 
1271 | 	expected = (double)l2_contained_size * (double) inner_repetitions;
1272 | 	printf("Expected number of cache lines loaded from L2 %f\n",expected/8.0);
1273 | 	printf("Number of performance counter wraprounds detected %d\n",nwraps);
1274 | #ifdef RETRIES
1275 | 	printf("Number of core performance counter reads retried %d\n",retries);
1276 | #endif
1277 | 	printf("Number of zero values found in the inner loop %d\n",zeros);
1278 | 	// printf("Expected Number of Loads for AVX2 code %ld\n",arraylen/4);	
1279 | 	// printf("Expected Number of Cache Lines loaded %ld\n",arraylen/8);	
1280 | 
1281 | 	for (i=0; i<CORES_USED; i++) {
1282 | 		if (iteration_counts[i] != inner_repetitions) {
1283 | 			printf("ERROR: thread %d iteration_counts %ld expected %ld\n",i,iteration_counts[i],inner_repetitions);
1284 | 		}
1285 | 	}
1286 | 
1287 | 	// per-core performance counter values
1288 | 	for (counter=0; counter<4; counter++) {
1289 | 		printf("CORE_counter %d ",counter);
1290 | 		for (i=0; i<CORES_USED; i++) {
1291 | 			delta = corrected_pmc_delta(core_counters[i][counter][1],core_counters[i][counter][0],core_pmc_width);
1292 | 			printf("%ld ",delta);
1293 | 		}
1294 | 		printf("\n");
1295 | 	}
1296 | 	// per-CHA performance counter values -- socket 0 only
1297 | 	for (counter=0; counter<4; counter++) {
1298 | 		printf("CHA_counter %d ",counter);
1299 | 		for (i=0; i<NUM_CHA_USED; i++) {
1300 | 			delta = corrected_pmc_delta(cha_counts[0][i][counter][1],cha_counts[0][i][counter][0],uncore_pmc_width);
1301 | 			printf("%ld ",delta);
1302 | 		}
1303 | 		printf("\n");
1304 | 	}
1305 | 
1306 | 	printf("Double-check physical address of first element used in array\n");
1307 | 	pagemapentry = get_pagemap_entry(&array[jstart[0]]);
1308 | 	printf("  array[%ld] va 0x%.16lx pa 0x%.16lx\n",jstart[0],&array[jstart[0]],pagemapentry);
1309 | 	}
1310 | }
1311 | 


--------------------------------------------------------------------------------
/SKX_IMC_BusDeviceFunctionOffset.h:
--------------------------------------------------------------------------------
 1 | // ================ Machine-Dependent Uncore Performance Monitor Locations =================
 2 | // These are the most common bus/device/function locations for the IMC counters on
 3 | // Xeon Platinum 8160 (Skylake, SKX)
 4 | // These are set up to work on the TACC Stampede2 SKX nodes....
 5 | //
 6 | // Note that the PmonCtl offsets are for programmable counters 0-3, plus the fixed counter.
 7 | //     (The fixed counter only needs bit 22 enabled, most other bits are ignored)
 8 | // Note that the PmonCtr offsets are for the bottom 32 bits of a 48 bit counter in a
 9 | //     64-bit field.  The first four offsets are for the programmable counters 0-3,
10 | //     and the final value is for the Fixed-Function (DCLK) counter that should
11 | //     always increment at the DCLK frequency (1/2 the DDR transfer frequency).
12 | // The tables are read-only lookup data, declared static const so that each including source file gets a private copy.
13 | static const int IMC_BUS_Socket[2] = {0x3a, 0xae};                               // PCI bus of the IMC devices, one entry per socket
14 | static const int IMC_Device_Channel[6] = {0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d};  // PCI device number, one entry per IMC channel
15 | static const int IMC_Function_Channel[6] = {0x2, 0x6, 0x2, 0x2, 0x6, 0x2};      // PCI function number, one entry per IMC channel
16 | static const int IMC_PmonCtl_Offset[5] = {0xd8, 0xdc, 0xe0, 0xe4, 0xf0};        // offsets of the counter control registers (counters 0-3, then fixed)
17 | static const int IMC_PmonCtr_Offset[5] = {0xa0, 0xa8, 0xb0, 0xb8, 0xd0};        // offsets of the counter value registers (counters 0-3, then fixed)
18 | 
19 | // ================ End of Machine-Dependent Uncore Performance Monitor Locations =================
20 | 


--------------------------------------------------------------------------------
/SetupCoreCounters.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Report the counter-enable MSRs on core 0, then program core counters 0-3 on all cores.
 3 | # Set WRMSR/RDMSR in the environment to point to the msrtools binaries
 4 | # if they are not in the current directory.
 5 | WRMSR=${WRMSR:-./wrmsr}
 6 | RDMSR=${RDMSR:-./rdmsr}
 7 | command -v "$WRMSR" >/dev/null && command -v "$RDMSR" >/dev/null || { echo "ERROR: cannot execute $WRMSR and/or $RDMSR" >&2; exit 1; }
 8 | echo "Are the performance counters globally enabled? Should be 0x70000000f or 0x7000000ff"
 9 | echo -n "  global perf counter enable MSR on core 0 is set to "
10 | "$RDMSR" -p 0 -c 0x38f
11 | echo "Are the fixed-function counters enabled? Should be 0x333 -- if any values include 0xb, need to disable the NMI watchdog"
12 | echo -n "  fixed counter config MSR on core 0 is set to "
13 | "$RDMSR" -p 0 -c 0x38d
14 | 
15 | 
16 | 
17 | # some useful events (values as written to MSRs 0x186-0x189 below; only four of them are programmed)
18 | MEM_LOAD_RETIRED_L1_HIT=0x004301d1
19 | MEM_LOAD_RETIRED_L2_HIT=0x004302d1
20 | MEM_LOAD_RETIRED_L3_HIT=0x004304d1
21 | MEM_LOAD_RETIRED_L1_MISS=0x004308d1
22 | MEM_LOAD_RETIRED_L2_MISS=0x004310d1
23 | MEM_LOAD_RETIRED_L3_MISS=0x004320d1
24 | MEM_LOAD_RETIRED_FB_HIT=0x004340d1
25 | MEM_INST_RETIRED_ALL_LOADS=0x004381d0
26 | 
27 | L2_RQSTS_MISS=0x00433f24
28 | L1D_REPLACEMENTS=0x00430151
29 | L2_LINES_IN_ALL=0x00431ff1
30 | IDI_MISC_WB_DOWNGRADE=0x004304fe
31 | 
32 | # program these events on all cores (-a), one event-select MSR per counter
33 | echo "Setting up programmable counters on all cores...."
34 | echo "  Counter 0 MEM_INST_RETIRED_ALL_LOADS"
35 | echo "  Counter 1 L1D_REPLACEMENTS"
36 | echo "  Counter 2 L2_RQSTS_MISS"
37 | echo "  Counter 3 L2_LINES_IN_ALL"
38 | 
39 | "$WRMSR" -a 0x186 "$MEM_INST_RETIRED_ALL_LOADS"
40 | "$WRMSR" -a 0x187 "$L1D_REPLACEMENTS"
41 | "$WRMSR" -a 0x188 "$L2_RQSTS_MISS"
42 | "$WRMSR" -a 0x189 "$L2_LINES_IN_ALL"
43 | 


--------------------------------------------------------------------------------
/SnoopFilterMapper.c:
--------------------------------------------------------------------------------
   1 | // John D. McCalpin, mccalpin@tacc.utexas.edu
   2 | static char const rcsid[] = "$Id: SnoopFilterMapper.c,v 1.11 2018/05/17 22:24:58 mccalpin Exp mccalpin $";
   3 | 
   4 | // include files
   5 | #include <stdio.h>				// printf, etc
   6 | #include <stdint.h>				// standard integer types, e.g., uint32_t
   7 | #include <signal.h>				// for signal handler
   8 | #include <stdlib.h>				// exit() and EXIT_FAILURE
   9 | #include <string.h>				// strerror() function converts errno to a text string for printing
  10 | #include <fcntl.h>				// for open()
  11 | #include <errno.h>				// errno support
  12 | #include <assert.h>				// assert() function
  13 | #include <unistd.h>				// sysconf() function, sleep() function
  14 | #include <sys/mman.h>			// support for mmap() function
  15 | #include <linux/mman.h>			// required for 1GiB page support in mmap()
  16 | #include <math.h>				// for pow() function used in RAPL computations
  17 | #include <time.h>
  18 | #include <sys/time.h>			// for gettimeofday
  19 | 
  20 | # define ARRAYSIZE 2147483648L		// 2^31; equals NUMPAGES*MYPAGESIZE in both configurations below (presumably bytes -- TODO confirm at point of use)
  21 | 
  22 | // MYHUGEPAGE_1GB overrides default of 2MiB for hugepages
  23 | #if defined MYHUGEPAGE_1GB
  24 | #define MYPAGESIZE 1073741824UL		// 1 GiB (2^30 bytes)
  25 | #define NUMPAGES 2L			// number of 1GiB pages; sizes page_pointers[] and pageframenumber[]
  26 | #define PAGES_MAPPED 2L			// this is still specifying how many 2MiB pages to map
  27 | #else
  28 | #define MYPAGESIZE 2097152L		// 2 MiB (2^21 bytes)
  29 | #define NUMPAGES 1024L			// number of 2MiB pages; sizes page_pointers[] and pageframenumber[]
  30 | #define PAGES_MAPPED 14L		// number of 2MiB pages covered by cha_by_page[] and paddr_by_page[]
  31 | #endif
  32 | 
  33 | 
  34 | #define SPECIAL_VALUE (-1)		// NOTE(review): sentinel value; what it is compared against is not visible in this section
  35 | 
  36 | // interfaces for va2pa_lib.c
  37 | void print_pagemap_entry(unsigned long long pagemap_entry);
  38 | unsigned long long get_pagemap_entry( void * va );
  39 | 
  40 | int dumpall;			// when set to 1, will cause dump of lots of stuff for debugging
  41 | int report;				// NOTE(review): purpose not visible in this section -- confirm in main()
  42 | int nwraps;				// track number of performance counter wraps
  43 | 
  44 | double *array;					// base pointer of the data array (original note: mmap on 1GiB pages; 2MiB pages are the default unless MYHUGEPAGE_1GB is defined)
  45 | double *page_pointers[NUMPAGES];		// one pointer for each page allocated
  46 | uint64_t pageframenumber[NUMPAGES];	// one PFN entry for each page allocated
  47 | 
  48 | // constant value defines
  49 | # define NUM_SOCKETS 2				// first dimension of the per-socket arrays below
  50 | # define NUM_IMC_CHANNELS 6			// includes channels on all IMCs in a socket
  51 | # define NUM_IMC_COUNTERS 5			// 0-3 are the 4 programmable counters, 4 is the fixed-function DCLK counter
  52 | # define NUM_CHA_BOXES 28			// CHA boxes (tiles) per socket; sizes the tile index of cha_counts[]
  53 | # define NUM_CHA_USED 28			// sizes lines_by_cha[]
  54 | # define NUM_CHA_COUNTERS 4			// counters per CHA box; sizes cha_counts[], cha_perfevtsel[], cha_pkg_sums[]
  55 | 
  56 | long imc_counts[NUM_SOCKETS][NUM_IMC_CHANNELS][NUM_IMC_COUNTERS][2];	// including the fixed-function (DCLK) counter as the final entry
  57 | long imc_pkg_sums[NUM_SOCKETS][NUM_IMC_COUNTERS];						// sum across channels for each chip
  58 | char imc_event_name[NUM_SOCKETS][NUM_IMC_CHANNELS][NUM_IMC_COUNTERS][32];		// reserve 32 characters for the IMC event names for each socket, channel, counter
  59 | uint32_t imc_perfevtsel[NUM_IMC_COUNTERS];			// expected control settings for the counters
  60 | uint32_t imc_vid_did[3];							// PCIe configuration space vendor and device IDs for the IMC blocks 
  61 | long cha_counts[NUM_SOCKETS][NUM_CHA_BOXES][NUM_CHA_COUNTERS][2];		// 2 sockets, 28 tiles per socket, 4 counters per tile, 2 times (before and after)
  62 | uint32_t cha_perfevtsel[NUM_CHA_COUNTERS];			// NOTE(review): presumably the expected CHA counter control settings, by analogy with imc_perfevtsel -- confirm
  63 | long cha_pkg_sums[NUM_SOCKETS][NUM_CHA_COUNTERS];		// per-socket, per-counter sums (presumably summed across CHA tiles -- confirm)
  64 | 
  65 | #define MAXCORES 112				// first dimension of core_counters[] and fixed_counters[]
  66 | #define CORES_USED 24				// NOTE(review): presumably the number of cores/threads actually exercised -- confirm in main()
  67 | // New feature -- core counters.
  68 | // upgrade to include counters for all cores 
  69 | long core_counters[MAXCORES][4][2];					// per core: 4 counters, before and after (sized for MAXCORES; the original note said 24 cores & 24 threads on one socket)
  70 | long fixed_counters[MAXCORES][4][2];				// per core: 4 fixed-function core counters (Instr, CoreCyc, RefCyc, TSC), before and after (sized for MAXCORES)
  71 | long core_pkg_sums[NUM_SOCKETS][4];					// four core counters
  72 | long fixed_pkg_sums[NUM_SOCKETS][4];				// four fixed-function counters per core (Instr, CoreCyc, RefCyc, TSC)
  73 | 
  74 | int8_t cha_by_page[PAGES_MAPPED][32768];				// L3 numbers for each of the 32,768 cache lines in each of the first PAGES_MAPPED 2MiB pages
  75 | uint64_t paddr_by_page[PAGES_MAPPED];					// physical addresses of the base of each of the first PAGES_MAPPED 2MiB pages used
  76 | long lines_by_cha[NUM_CHA_USED];			// bulk count of lines assigned to each CHA
  77 | 
  78 | #ifdef DEBUG
  79 | FILE *log_file;					// log file for debugging -- should not be needed in production
  80 | #endif
  81 | unsigned int *mmconfig_ptr;         // must be pointer to 32-bit int so compiler will generate 32-bit loads and stores
  82 | 
  83 | struct timeval tp;		// seconds and microseconds from gettimeofday
  84 | struct timezone tzp;	// required, but not used here.
  85 | 
  86 | double ssum(double *a, long vl);		// defined outside this file (presumably in ssum.c -- confirm)
  87 | 
  88 | double mysecond()
  89 | {
  90 |         struct timeval tp;
  91 |         struct timezone tzp;
  92 |         int i;
  93 | 
  94 |         i = gettimeofday(&tp,&tzp);
  95 |         return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
  96 | }
  97 | 
  98 | # ifndef MIN
  99 | # define MIN(x,y) ((x)<(y)?(x):(y))
 100 | # endif
 101 | # ifndef MAX
 102 | # define MAX(x,y) ((x)>(y)?(x):(y))
 103 | # endif
 104 | 
 105 | 
 106 | #include "low_overhead_timers.c"
 107 | 
 108 | 
 109 | #include "SKX_IMC_BusDeviceFunctionOffset.h"
 110 | #include "MSR_defs.h"
 111 | // ===========================================================================================================================================================================
 112 | // Convert PCI(bus:device.function,offset) to uint32_t array index
 113 | uint32_t PCI_cfg_index(unsigned int Bus, unsigned int Device, unsigned int Function, unsigned int Offset)
 114 | {
 115 |     uint32_t byteaddress;
 116 |     uint32_t index;
 117 |     assert (Device >= 0);
 118 |     assert (Function >= 0);
 119 |     assert (Offset >= 0);
 120 |     assert (Device < (1<<5));
 121 |     assert (Function < (1<<3));
 122 |     assert (Offset < (1<<12));
 123 |     byteaddress = (Bus<<20) | (Device<<15) | (Function<<12) | Offset;
 124 |     index = byteaddress / 4;
 125 |     return ( index );
 126 | }
 127 | 
 128 | // ===========================================================================================================================================================================
 129 | int main(int argc, char *argv[])
 130 | {
 131 | 	// local declarations
 132 | 	// int cpuid_return[4];
 133 | 	int i;
 134 | 	int retries;
 135 | 	int zeros;
 136 | 	int rc;
 137 | 	int core_pmc_width, fixed_pmc_width;			// these will be looked up using CPUID to use in overflow/wraparound correction
 138 | 	int uncore_pmc_width=48;						// all the uncore stuff is model-dependent, but most are 48 bits
 139 | 	ssize_t rc64;
 140 | 	char description[100];
 141 | 	size_t len;
 142 | 	long arraylen;
 143 | 	long l2_contained_size, inner_repetitions;
 144 | 	unsigned long pagemapentry;
 145 | 	unsigned long paddr, basephysaddr;
 146 | 	unsigned long pagenum, basepagenum;
 147 | 	uint32_t bus, device, function, offset, ctl_offset, ctr_offset, value, index;
 148 | 	uint32_t socket, imc, channel, counter, controller;
 149 | 	long count,delta;
 150 | 	long j,k,page_number,page_base_index,line_number;
 151 | 	long jstart[CORES_USED], jend[CORES_USED], mycore, vl[CORES_USED];
 152 | 	uint32_t low_0, high_0, low_1, high_1;
 153 | 	char filename[100];
 154 | 	int pkg, tile;
 155 | 	int nr_cpus;
 156 | 	uint64_t msr_val, msr_num;
 157 | 	int mem_fd;
 158 | 	int msr_fd[2];				// one for each socket
 159 | 	int proc_in_pkg[2];			// one Logical Processor number for each socket
 160 | 	uid_t my_uid;
 161 | 	gid_t my_gid;
 162 | 	double sum,expected;
 163 | 	double t0, t1;
 164 | 	double avg_cycles;
 165 | 	unsigned long tsc_start, tsc_end;
 166 | 	float TSC_GHz;
 167 | 	double sf_evict_rate;
 168 | 	double bandwidth;
 169 |     unsigned long mmconfig_base=0x80000000;		// DOUBLE-CHECK THIS ON NEW SYSTEMS!!!!!   grep MMCONFIG /proc/iomem | awk -F- '{print $1}'
 170 |     unsigned long mmconfig_size=0x10000000;
 171 | 	double private_sum,partial_sums[CORES_USED];
 172 | 	long iters,iteration_counts[CORES_USED];
 173 | 	long BaseOffset;
 174 | 
 175 | 	TSC_GHz = get_TSC_frequency()/1.0e9;
 176 | 	core_pmc_width = get_core_counter_width();
 177 | 	fixed_pmc_width = get_fixed_counter_width();
 178 | 
 179 | 	BaseOffset = 0;
 180 | #ifdef RANDOMOFFSETS
 181 | 	if (argc != 2) {
 182 | 		printf("Must Provide a Random Offset cache line offset value (an integer between 0 and 2^24-375000 (16,402,216))\n");
 183 | 		exit(1);
 184 | 	} else {
 185 | 		BaseOffset = atol(argv[1]);
 186 | 		printf("Random Cache Line Offset is %ld\n",BaseOffset);
 187 | 		BaseOffset = BaseOffset*8;
 188 | 		printf("Starting index for summation is %ld\n",BaseOffset);
 189 | 	}
 190 | #endif
 191 | 
 192 | 	retries = 0;
 193 | 	zeros = 0;
 194 | 	report = 1;
 195 | 	dumpall = 0;
 196 | 	nwraps = 0;
 197 | 	l2_contained_size = 125000 * CORES_USED;		// about 95% of the L2 space in the cores used
 198 | 	for (i=0; i<CORES_USED; i++) {
 199 | 		iters = 0;
 200 | 		jstart[i] = BaseOffset + i*l2_contained_size/CORES_USED;
 201 | 		jend[i] = jstart[i] + l2_contained_size/CORES_USED;
 202 | 		vl[i] = jend[i]-jstart[i];
 203 | 		printf("thread %d jstart %ld jend %ld vl %ld\n",i,jstart[i],jend[i],vl[i]);
 204 | 
 205 | 		partial_sums[i] = 0.0;
 206 | 		iteration_counts[i] = 0;
 207 | 		for (counter=0; counter<4; counter++) {
 208 | 			core_counters[i][counter][0] = SPECIAL_VALUE;
 209 | 			core_counters[i][counter][1] = SPECIAL_VALUE;
 210 | 			fixed_counters[i][counter][0] = SPECIAL_VALUE;
 211 | 			fixed_counters[i][counter][1] = SPECIAL_VALUE;
 212 | 		}
 213 | 	}
 214 | 	// initialize the array that will hold the L3 numbers for each cache line for each of the first PAGES_MAPPED 2MiB pages
 215 | 	for (i=0; i<PAGES_MAPPED; i++) {
 216 | 		for (line_number=0; line_number<32768; line_number++) {
 217 | 			cha_by_page[i][line_number] = -1; 	// special value -- if set properly, all values should be in the range of 0..23
 218 | 		}
 219 | 	}
 220 | 
  221 | 	// allocate the working array on huge pages -- either 1GiB or 2MiB
 222 | 	len = NUMPAGES * MYPAGESIZE;
 223 | #if defined MYHUGEPAGE_1GB
 224 | 	array = (double*) mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB | MAP_HUGE_1GB, -1, 0 );
 225 | #elif defined MYHUGEPAGE_THP
 226 | 	//array = (double*) mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0 );
 227 | 	rc = posix_memalign((void **)&array, (size_t) 2097152, (size_t) len);
 228 | 	if (rc != 0) {
 229 | 		printf("ERROR: posix_memalign call failed with error code %d\n",rc);
 230 | 		exit(3);
 231 | 	}
 232 | #else
 233 | 	array = (double*) mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB, -1, 0 );
 234 | #endif
 235 | 	if (array == (void *)(-1)) {
 236 |         perror("ERROR: mmap of array a failed! ");
 237 |         exit(1);
 238 |     }
 239 | 	// initialize working array
 240 | 	arraylen = NUMPAGES * MYPAGESIZE/sizeof(double);
 241 | #pragma omp parallel for
 242 | 	for (j=0; j<arraylen; j++) {
 243 | 		array[j] = 1.0;
 244 | 	}
 245 | 	// initialize page_pointers to point to the beginning of each page in the array
 246 | 	// then get and print physical addresses for each
 247 | #ifdef VERBOSE
 248 | 	printf(" Page    ArrayIndex            VirtAddr        PagemapEntry         PFN           PhysAddr\n");
 249 | #endif
 250 | 	for (j=0; j<NUMPAGES; j++) {
 251 | 		k = j*MYPAGESIZE/sizeof(double);
 252 | 		page_pointers[j] = &array[k];
 253 | 		pagemapentry = get_pagemap_entry(&array[k]);
 254 | 		pageframenumber[j] = (pagemapentry & (unsigned long) 0x007FFFFFFFFFFFFF);
 255 | #ifdef VERBOSE
 256 | 		printf(" %.5ld   %.10ld  %#18lx  %#18lx  %#18lx  %#18lx\n",j,k,&array[k],pagemapentry,pageframenumber[j],(pageframenumber[j]<<12));
 257 | #endif
 258 | 	}
 259 | 	printf("PAGE_ADDRESSES ");
 260 | 	for (j=0; j<PAGES_MAPPED; j++) {
 261 | 		basephysaddr = pageframenumber[j] << 12;
 262 | 		paddr_by_page[j] = basephysaddr;
 263 | 		printf("0x%.12lx ",paddr_by_page[j]);
 264 | 	}
 265 | 	printf("\n");
 266 | 
 267 | 
 268 | 	// initialize arrays for counter data
 269 | 	for (socket=0; socket<NUM_SOCKETS; socket++) {
 270 | 		for (channel=0; channel<NUM_IMC_CHANNELS; channel++) {
 271 | 			for (counter=0; counter<NUM_IMC_COUNTERS; counter++) {
 272 | 				imc_counts[socket][channel][counter][0] = 0;
 273 | 				imc_counts[socket][channel][counter][1] = 0;
 274 | 			}
 275 | 		}
 276 | 		for (tile=0; tile<NUM_CHA_USED; tile++) {
 277 | 			lines_by_cha[tile] = 0;
 278 | 			for (counter=0; counter<4; counter++) {
 279 | 				cha_counts[socket][tile][counter][0] = 0;
 280 | 				cha_counts[socket][tile][counter][1] = 0;
 281 | 			}
 282 | 		}
 283 | 	}
 284 | 
 285 | 	// get the host name, assume that it is of the TACC standard form, and use this as part
 286 | 	// of the log file name....  Standard form is "c263-109.stampede2.tacc.utexas.edu", so
 287 | 	// truncating at the first "." is done by writing \0 to character #8.
 288 | 	len = 100;	
 289 | 	rc = gethostname(description, len);
 290 | 	if (rc != 0) {
 291 | 		fprintf(stderr,"ERROR when trying to get hostname\n");
 292 | 		exit(-1);
 293 | 	}
 294 | 	description[8] = 0;		// assume hostname of the form c263-109.stampede2.tacc.utexas.edu -- truncate after first period
 295 | 
 296 | 	my_uid = getuid();
 297 | 	my_gid = getgid();
 298 | 
 299 | #ifdef DEBUG
 300 | 	sprintf(filename,"log.%s.perf_counters",description);
 301 | 	// sprintf(filename,"log.perf_counters");
 302 | 	log_file = fopen(filename,"w+");
 303 | 	if (log_file == 0) {
 304 | 		fprintf(stderr,"ERROR %s when trying to open log file %s\n",strerror(errno),filename);
 305 | 		exit(-1);
 306 | 	}
 307 | 
 308 | 	fprintf(log_file,"DEBUG: my uid is %d, my gid is %d\n",my_uid,my_gid);
 309 | 
 310 | 	rc = chown(filename,my_uid,my_gid);
 311 | 	if (rc == 0) {
 312 | 		fprintf(log_file,"DEBUG: Successfully changed ownership of log file to %d %d\n",my_uid,my_gid);
 313 | 	} else {
 314 | 		fprintf(stderr,"ERROR: Attempt to change ownership of log file failed -- bailing out\n");
 315 | 		exit(-1);
 316 | 	}
 317 | #endif
 318 | 
 319 | 	//========================================================================================================================
 320 | 	// initial checks
 321 | 	// 		is this a supported core?  (CPUID Family/Model)
 322 | 	//      Every processor that I am going to see will be Family 0x06 (no ExtFamily needed).
  323 | 	//      The DisplayModel field is (ExtModel<<4)+Model: 0x3F for Xeon E5 v3 (HSW) systems, 0x55 for the Skylake Xeon (SKX) systems expected by the check below
 324 | 	int leaf = 1;
 325 | 	int subleaf = 0;
 326 | 	uint32_t eax, ebx, ecx, edx;
 327 | 	__asm__ __volatile__ ("cpuid" : \
 328 | 		  "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "a" (leaf), "c" (subleaf));
 329 | 
 330 | 	// Alternate form: 
 331 | 	// 		The compiler cpuid intrinsics are not documented by Intel -- they use the Microsoft format
 332 | 	// 			described at https://msdn.microsoft.com/en-us/library/hskdteyh.aspx
 333 | 	// 			__cpuid(array to hold eax,ebx,ecx,edx outputs, initial eax value)
 334 | 	// 			__cpuidex(array to hold eax,ebx,ecx,edx outputs, initial eax value, initial ecx value)
 335 | 	//      CPUID function 0x01 returns the model info in eax.
 336 | 	//      		27:20 ExtFamily	-- expect 0x00
 337 | 	//      		19:16 ExtModel	-- expect 0x3 for HSW, 0x5 for SKX
 338 | 	//      		11:8  Family	-- expect 0x6
 339 | 	//      		7:4   Model		-- expect 0xf for HSW, 0x5 for SKX
 340 | 	// __cpuid(&cpuid_return[0], 1);
 341 | 	// uint32_t ModelInfo = cpuid_return[0] & 0x0fff0ff0;	// mask out the reserved and "stepping" fields, leaving only the based and extended Family/Model fields
 342 | 
 343 | 	uint32_t ModelInfo = eax & 0x0fff0ff0;	// mask out the reserved and "stepping" fields, leaving only the based and extended Family/Model fields
 344 | 	if (ModelInfo != 0x00050650) {				// expected values for Skylake Xeon
 345 | 		fprintf(stderr,"ERROR -- this does not appear to be the correct processor type!!!\n");
 346 | 		fprintf(stderr,"ERROR -- Expected CPUID(0x01) Family/Model bits = 0x%x, but found 0x%x\n",0x00050650,ModelInfo);
 347 | 		exit(1);
 348 | 	}
 349 | 
 350 | #ifdef IMC_COUNTS
 351 | 	// ===================================================================================================================
 352 | 	// ------------------ REQUIRES ROOT PERMISSIONS ------------------
 353 | 	// open /dev/mem for PCI device access and mmap() a pointer to the beginning
 354 | 	// of the 256 MiB PCI Configuration Space.
 355 | 	// 		check VID/DID for uncore bus:device:function combinations
 356 | 	//   Note that using /dev/mem for PCI configuration space access is required for some devices on KNL.
 357 | 	//   It is not required on other systems, but it is not particularly inconvenient either.
 358 | 	sprintf(filename,"/dev/mem");
 359 | #ifdef DEBUG
 360 | 	fprintf(log_file,"opening %s\n",filename);
 361 | #endif
 362 | 	mem_fd = open(filename, O_RDWR);
 363 | 	if (mem_fd == -1) {
 364 | 		fprintf(stderr,"ERROR %s when trying to open %s\n",strerror(errno),filename);
 365 | 		exit(-1);
 366 | 	}
 367 | 	int map_prot = PROT_READ | PROT_WRITE;
 368 | 	mmconfig_ptr = mmap(NULL, mmconfig_size, map_prot, MAP_SHARED, mem_fd, mmconfig_base);
 369 |     if (mmconfig_ptr == MAP_FAILED) {
 370 |         fprintf(stderr,"cannot mmap base of PCI configuration space from /dev/mem: address %lx\n", mmconfig_base);
 371 |         exit(2);
 372 | #ifdef DEBUG
 373 |     } else {
 374 | 		fprintf(log_file,"Successful mmap of base of PCI configuration space from /dev/mem at address %lx\n", mmconfig_base);
 375 | #endif
 376 | 	}
 377 |     close(mem_fd);      // OK to close file after mmap() -- the mapping persists until unmap() or program exit
 378 | 
 379 | 	// New simple test that does not need to know the uncore bus numbers here...
 380 | 	// Skylake bus 0, Function 5, offset 0 -- Sky Lake-E MM/Vt-d Configuration Registers
 381 | 	//
 382 | 	// simple test -- should return "20248086" on Skylake Xeon EP -- DID 0x2024, VID 0x8086
 383 | 	bus = 0x00;
 384 | 	device = 0x5;
 385 | 	function = 0x0;
 386 | 	offset = 0x0;
 387 | 	index = PCI_cfg_index(bus, device, function, offset);
 388 |     value = mmconfig_ptr[index];
 389 | 	if (value != 0x20248086) {
 390 | 		fprintf(stderr,"ERROR: Bus %x device %x function %x offset %x expected %x, found %x\n",bus,device,function,offset,0x20248086,value);
 391 | 		exit(3);
 392 | #ifdef DEBUG
 393 | 	} else {
 394 | 		fprintf(log_file,"DEBUG: Well done! Bus %x device %x function %x offset %x returns expected value of %x\n",bus,device,function,offset,value);
 395 | #endif
 396 | 	}
 397 | #endif
 398 | 
 399 | #ifdef CHA_COUNTS
 400 | 	// ===================================================================================================================
 401 | 	// open the MSR driver using one core in socket 0 and one core in socket 1
 402 | 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 403 |     proc_in_pkg[0] = 0;                 // logical processor 0 is in socket 0 in all TACC systems
 404 |     proc_in_pkg[1] = nr_cpus-1;         // logical processor N-1 is in socket 1 in all TACC 2-socket systems
 405 | 	for (pkg=0; pkg<2; pkg++) {
 406 | 		sprintf(filename,"/dev/cpu/%d/msr",proc_in_pkg[pkg]);
 407 | 		msr_fd[pkg] = open(filename, O_RDWR);
 408 | 		if (msr_fd[pkg] == -1) {
 409 | 			fprintf(stderr,"ERROR %s when trying to open %s\n",strerror(errno),filename);
 410 | 			exit(-1);
 411 | 		}
 412 | 	}
 413 | 	for (pkg=0; pkg<2; pkg++) {
 414 | 		pread(msr_fd[pkg],&msr_val,sizeof(msr_val),IA32_TIME_STAMP_COUNTER);
 415 | 		fprintf(stdout,"DEBUG: TSC on core %d socket %d is %ld\n",proc_in_pkg[pkg],pkg,msr_val);
 416 | 	}
 417 | 
 418 | 	pread(msr_fd[0],&msr_val,sizeof(msr_val),0x186);
 419 | 	printf("Core PerfEvtSel0 0x%lx\n",msr_val);
 420 | 	pread(msr_fd[0],&msr_val,sizeof(msr_val),0x187);
 421 | 	printf("Core PerfEvtSel1 0x%lx\n",msr_val);
 422 | 	pread(msr_fd[0],&msr_val,sizeof(msr_val),0x188);
 423 | 	printf("Core PerfEvtSel2 0x%lx\n",msr_val);
 424 | 	pread(msr_fd[0],&msr_val,sizeof(msr_val),0x189);
 425 | 	printf("Core PerfEvtSel3 0x%lx\n",msr_val);
 426 | 
 427 | 
 428 | 	// Program the CHA mesh counters
 429 | 	//   Each CHA has a block of 16 MSRs reserved, of which 12 are used
 430 | 	//   The base for each CHA is 0xE00 + 0x10*CHA
 431 | 	//   Within each block:
 432 | 	//   	Unit Control is at offset 0x00
 433 | 	//   	CTL0, 1, 2, 3 are at offsets 0x01, 0x02, 0x03, 0x04
 434 | 	//   	CTR0, 1, 2, 3 are at offsets 0x08, 0x09, 0x0a, 0x0b
 435 | 	//   For the moment I think I can ignore the filter registers at offsets 0x05 and 0x06
 436 | 	//     and the status register at offset 0x07
 437 | 	//   The control register needs bit 22 set to enabled, then bits 15:8 as Umask and 7:0 as EventSelect
 438 | 	//   Mesh Events:
 439 | 	//   	HORZ_RING_BL_IN_USE = 0xab
 440 | 	//   		LEFT_EVEN = 0x01
 441 | 	//   		LEFT_ODD = 0x02
 442 | 	//   		RIGHT_EVEN = 0x04
 443 | 	//   		RIGHT_ODD = 0x08
 444 | 	//   	VERT_RING_BL_IN_USE = 0xaa
 445 | 	//   		UP_EVEN = 0x01
 446 | 	//   		UP_ODD = 0x02
 447 | 	//   		DN_EVEN = 0x04
 448 | 	//   		DN_ODD = 0x08
 449 | 	//   For starters, I will combine even and odd and create 4 events
 450 | 	//   	0x004003ab	HORZ_RING_BL_IN_USE.LEFT
 451 | 	//   	0x00400cab	HORZ_RING_BL_IN_USE.RIGHT
 452 | 	//   	0x004003aa	VERT_RING_BL_IN_USE.UP
 453 | 	//   	0x00400caa	VERT_RING_BL_IN_USE.DN
 454 | 
 455 | 	// first set to try....
 456 | 	cha_perfevtsel[0] = 0x004003ab;		// HORZ_RING_BL_IN_USE.LEFT
 457 | 	cha_perfevtsel[1] = 0x00400cab;		// HORZ_RING_BL_IN_USE.RIGHT
 458 | 	cha_perfevtsel[2] = 0x004003aa;		// VERT_RING_BL_IN_USE.UP
 459 | 	cha_perfevtsel[3] = 0x00400caa;		// VERT_RING_BL_IN_USE.DN
 460 | 
 461 | 	// second set to try....
 462 | //	cha_perfevtsel[0] = 0x004001ab;		// HORZ_RING_BL_IN_USE.LEFT_EVEN
 463 | //	cha_perfevtsel[1] = 0x004002ab;		// HORZ_RING_BL_IN_USE.LEFT_ODD
 464 | //	cha_perfevtsel[2] = 0x004004ab;		// HORZ_RING_BL_IN_USE.RIGHT_EVEN
 465 | //	cha_perfevtsel[3] = 0x004008ab;		// HORZ_RING_BL_IN_USE.RIGHT_ODD
 466 | 
 467 | 	// Snoop Filter Eviction counters
 468 | 	cha_perfevtsel[0] = 0x0040073d;		// SF_EVICTION S,E,M states
 469 | 	cha_perfevtsel[1] = 0x00400334;		// LLC_LOOKUP.DATA_READ	<-- requires CHA_FILTER0[26:17]
 470 | 	cha_perfevtsel[2] = 0x00400534;		// LLC_LOOKUP.DATA_WRITE (WB from L2) <-- requires CHA_FILTER0[26:17]
 471 | 	cha_perfevtsel[3] = 0x0040af37;		// LLC_VICTIMS.TOTAL (MESF) (does not count clean victims)
 472 | 	uint64_t cha_filter0 = 0x01e20000;		// set bits 24,23,22,21,17 FMESI -- all LLC lookups, no SF lookups
 473 | 
 474 | 	printf("CHA PerfEvtSel0 0x%lx\n",cha_perfevtsel[0]);
 475 | 	printf("CHA PerfEvtSel1 0x%lx\n",cha_perfevtsel[1]);
 476 | 	printf("CHA PerfEvtSel2 0x%lx\n",cha_perfevtsel[2]);
 477 | 	printf("CHA PerfEvtSel3 0x%lx\n",cha_perfevtsel[3]);
 478 | 	printf("CHA FILTER0 0x%lx\n",cha_filter0);
 479 | 
 480 | #ifdef VERBOSE
 481 | 	printf("VERBOSE: programming CHA counters\n");
 482 | #endif
 483 | 
 484 | 	for (pkg=0; pkg<2; pkg++) {
 485 | 		for (tile=0; tile<NUM_CHA_USED; tile++) {
 486 | 			msr_num = 0xe00 + 0x10*tile;		// box control register -- set enable bit
 487 | 			msr_val = 0x00400000;
 488 | 			pwrite(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
 489 | 			msr_num = 0xe00 + 0x10*tile + 1;	// ctl0
 490 | 			msr_val = cha_perfevtsel[0];
 491 | 			pwrite(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
 492 | 			msr_num = 0xe00 + 0x10*tile + 2;	// ctl1
 493 | 			msr_val = cha_perfevtsel[1];
 494 | 			pwrite(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
 495 | 			msr_num = 0xe00 + 0x10*tile + 3;	// ctl2
 496 | 			msr_val = cha_perfevtsel[2];
 497 | 			pwrite(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
 498 | 			msr_num = 0xe00 + 0x10*tile + 4;	// ctl3
 499 | 			msr_val = cha_perfevtsel[3];
 500 | 			pwrite(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
 501 | 			msr_num = 0xe00 + 0x10*tile + 5;	// filter0
 502 | 			msr_val = cha_filter0;				// bits 24:21,17 FMESI -- all LLC lookups, not not SF lookups
 503 | 			pwrite(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
 504 | 		}
 505 | 	}
 506 | #ifdef VERBOSE
 507 | 	printf("VERBOSE: finished programming CHA counters\n");
 508 | #endif
 509 | #endif
 510 | 
 511 | #ifdef IMC_COUNTS
 512 | 	// ===================================================================================================================
 513 | 	// Read the current programming of the IMC counters and look for the standard values (in this order)
 514 | 	//     CAS_COUNT.READS		Event 0x04, Umask 0x03
 515 | 	//     CAS_COUNT.WRITES		Event 0x04, Umask 0x0C
 516 | 	//     ACT.ALL				Event 0x01, Umask 0x0B
 517 | 	//     PRE_COUNT.MISS		Event 0x02, Umask 0x01
 518 | 	//     DCLK
 519 | 
 520 | #ifdef VERBOSE
 521 | 	printf("Preparing to program IMC counters\n");
 522 | #endif
 523 | 	// expected values of IMC performance counter event select control registers
 524 | 	imc_perfevtsel[0] = 0x00400304;		// CAS_COUNT.READS
 525 | 	imc_perfevtsel[1] = 0x00400C04;		// CAS_COUNT.WRITES
 526 | 	imc_perfevtsel[2] = 0x00400B01;		// ACT_COUNT.ALL
 527 | 	imc_perfevtsel[3] = 0x00400102;		// PRE_COUNT.MISS
 528 | 	imc_perfevtsel[4] = 0x00400000;		// DCLK
 529 | 
 530 | 	imc_vid_did[0] = 0x20428086;		// all channel 0 devices are 2042
 531 | 	imc_vid_did[1] = 0x20468086;		// all channel 1 devices are 2046
 532 | 	imc_vid_did[2] = 0x204a8086;		// all channel 2 devices are 204a
 533 | 
 534 | 	printf("IMC PerfEvtSel0 0x%lx\n",imc_perfevtsel[0]);
 535 | 	printf("IMC PerfEvtSel1 0x%lx\n",imc_perfevtsel[1]);
 536 | 	printf("IMC PerfEvtSel2 0x%lx\n",imc_perfevtsel[2]);
 537 | 	printf("IMC PerfEvtSel3 0x%lx\n",imc_perfevtsel[3]);
 538 | 	printf("IMC PerfEvtSel4 0x%lx\n",imc_perfevtsel[4]);
 539 | 
 540 | 	// print the full wall-clock time in seconds and microseconds
 541 | 	// assume both components of tp struct are longs.
 542 | 	fprintf(stdout,"# %s\n", rcsid);
 543 |     i = gettimeofday(&tp,&tzp);
 544 | 	fprintf(stdout,"%ld %ld\n", tp.tv_sec,tp.tv_usec);
 545 | 
 546 | 	for (socket=0; socket<NUM_SOCKETS; socket++) {
 547 | 		bus = IMC_BUS_Socket[socket];
 548 | #ifdef VERBOSE
 549 | 		printf("VERBOSE: socket %d bus %d\n",socket,bus);
 550 | #endif
 551 | 		for (channel=0; channel<NUM_IMC_CHANNELS; channel++) {
 552 | 			device = IMC_Device_Channel[channel];
 553 | 			function = IMC_Function_Channel[channel];
 554 | #ifdef VERBOSE
 555 | 			printf("VERBOSE: channel %d device %d function %d\n",channel, device, function);
 556 | #endif
 557 | 			// check to make sure this is the correct device
 558 | 			offset = 0x0;
 559 | 			index = PCI_cfg_index(bus, device, function, offset);
 560 | 			value = mmconfig_ptr[index];
 561 | 			if ( value != imc_vid_did[channel%3]) {
 562 | 				fprintf(stderr,"WARNING!!!! socket %d, channel %d has vid_did %x but should be %x\n",socket,channel,value,imc_vid_did[channel%3]);
 563 | 			}
 564 | 			for (counter=0; counter<NUM_IMC_COUNTERS; counter++) {
 565 | 				// check to see if this unit is programmed correctly and reprogram if needed
 566 | 				offset = IMC_PmonCtl_Offset[counter];
 567 | 				index = PCI_cfg_index(bus, device, function, offset);
 568 | 				value = mmconfig_ptr[index];
 569 | 				if ( value != imc_perfevtsel[counter]) {
 570 | 					fprintf(stderr,"WARNING!!!! socket %d, channel %d has perfevtsel %x but should be %x -- reprogramming\n",socket,channel,value,imc_perfevtsel[counter]);
 571 | 					mmconfig_ptr[index] = imc_perfevtsel[counter];
 572 | 				}
 573 | 
 574 | 			}
 575 | 		}
 576 | 	}
 577 | #endif
 578 | 
 579 | // ========= END OF PERFORMANCE COUNTER SETUP ========================================================================
 580 | 
 581 | #ifdef MAP_L3
 582 | // ============== BEGIN L3 MAPPING TESTS ==============================
 583 | // For each of the PAGES_MAPPED 2MiB pages:
 584 | //   1. Use "access()" to see if the mapping file already exists.
 585 | //		If exists:
 586 | //   		2. Use "stat()" to make sure the file is the correct size
 587 | //   		   If right size:
 588 | //   		   	3. Read the contents into the 32768-element int8_t array of L3 numbers.
 589 | //   		   Else (wrong size):
 590 | //   		   	4. Abort and tell the user to fix it manually.
 591 | //   	Else (not exists):
 592 | //   		4. Call the mapping function to re-compute the map
 593 | //   		5. Create mapping file
 594 | //   		6. Save data in mapping file
 595 | //   		7. Close output file
 596 | 
 597 | 	FILE *ptr_mapping_file;
 598 | 	int needs_mapping;
 599 | 	int good, good_old, good_new, pass1, pass2, pass3, found, numtries;
 600 | 	int min_count, max_count, sum_count, old_cha;
 601 | 	double avg_count, goodness1, goodness2, goodness3;
 602 | 	int globalsum = 0;
 603 | 	long totaltries = 0;
 604 | 	int NFLUSHES = 1000;
 605 | 	for (page_number=0; page_number<PAGES_MAPPED; page_number++) {
 606 | 		needs_mapping=0;
 607 | 		sprintf(filename,"PADDR_0x%.12lx.map",paddr_by_page[page_number]);
 608 | 		i = access(filename, F_OK);
 609 | 		if (i == -1) {								// file does not exist
 610 | 			printf("DEBUG: Mapping file %s does not exist -- will create file after mapping cache lines\n",filename);
 611 | 			needs_mapping = 1;
 612 | 		} else {									// file exists
 613 | 			i = access(filename, R_OK);
 614 | 			if (i == -1) {							// file exists without read permissions
 615 | 				printf("ERROR: Mapping file %s exists, but without read permission\n",filename);
 616 | 				exit(1);
 617 | 			} else {								// file exists with read permissions
 618 | 				ptr_mapping_file = fopen(filename,"r");
 619 | 				if (!ptr_mapping_file) {
 620 | 					printf("ERROR: Failed to open Mapping File %s, should not happen\n",filename);
 621 | 					exit(2);
 622 | 				}
 623 | 				k = fread(&cha_by_page[page_number][0],(size_t) 32768,(size_t) 1,ptr_mapping_file);
 624 | 				if (k != 1) {					// incorrect read length
 625 | 					printf("ERROR: Read from Mapping File %s, returned the wrong record count %ld expected 1\n",filename,k);
 626 | 					exit(3);
 627 | 				} else {							// correct read length
 628 | 					printf("DEBUG: Mapping File read for %s succeeded -- skipping mapping for this page\n",filename);
 629 | 					needs_mapping = 0;
 630 | 				}
 631 | 			}
 632 | 		}
 633 | 		if (needs_mapping == 1) {
 634 | 			// code imported from SystemMirrors/Hikari/MemSuite/InterventionLatency/L3_mapping.c
 635 | #ifdef VERBOSE
 636 | 			printf("DEBUG: here I need to perform the mapping for paddr 0x%.12lx, and then save the file\n",paddr_by_page[page_number]);
 637 | #endif
 638 | 			page_base_index = page_number*262144;		// index of element at beginning of current 2MiB page
 639 | 			for (line_number=0; line_number<32768; line_number++) {
 640 | 				good = 0;
 641 | 				good_old = 0;
 642 | 				good_new = 0;
 643 | 				numtries = 0;
 644 | #ifdef VERBOSE
 645 | 				if (line_number%64 == 0) {
 646 | 					pagemapentry = get_pagemap_entry(&array[page_base_index+line_number*8]);
 647 | 					printf("DEBUG: page_base_index %ld line_number %ld index %ld pagemapentry 0x%lx\n",page_base_index,line_number,page_base_index+line_number*8,pagemapentry);
 648 | 				}
 649 | #endif
 650 | 				do  {               // -------------- Inner Repeat Loop until results pass "goodness" tests --------------
 651 | 					numtries++;
 652 | 					if (numtries > 100) {
 653 | 						printf("ERROR: No good results for line %d after %d tries\n",line_number,numtries);
 654 | 						exit(101);
 655 | 					}
 656 | 					totaltries++;
 657 | 				// 1. read L3 counters before starting test
 658 | 				for (tile=0; tile<NUM_CHA_USED; tile++) {
 659 | 					msr_num = 0xe00 + 0x10*tile + 0x8 + 1;				// counter 1 is the LLC_LOOKUPS.READ event
 660 | 					pread(msr_fd[0],&msr_val,sizeof(msr_val),msr_num);
 661 | 					cha_counts[0][tile][1][0] = msr_val;					//  use the array I have already declared for cha counts
 662 | 					// printf("DEBUG: page %ld line %ld msr_num 0x%x msr_val %ld cha_counter1 %lu\n",
 663 | 					//		page_number,line_number,msr_num,msr_val,cha_counts[0][tile][1][0]);
 664 | 				}
 665 | 
 666 | 				// 2. Access the line many times
 667 | 				sum = 0;
 668 | 				for (i=0; i<NFLUSHES; i++) {
 669 | 					sum += array[page_base_index+line_number*8];
 670 | 					_mm_mfence();
 671 | 					_mm_clflush(&array[page_base_index+line_number*8]);
 672 | 					_mm_mfence();
 673 | 				}
 674 | 				globalsum += sum;
 675 | 
 676 | 				// 3. read L3 counters after loads are done
 677 | 				for (tile=0; tile<NUM_CHA_USED; tile++) {
 678 | 					msr_num = 0xe00 + 0x10*tile + 0x8 + 1;				// counter 1 is the LLC_LOOKUPS.READ event
 679 | 					pread(msr_fd[0],&msr_val,sizeof(msr_val),msr_num);
 680 | 					cha_counts[0][tile][1][1] = msr_val;					//  use the array I have already declared for cha counts
 681 | 				}
 682 | 
 683 | #ifdef VERBOSE
 684 | 				for (tile=0; tile<NUM_CHA_USED; tile++) {
 685 | 					printf("DEBUG: page %ld line %ld cha_counter1_after %lu cha_counter1 before %lu delta %lu\n",
 686 | 							page_number,line_number,cha_counts[0][tile][1][1],cha_counts[0][tile][1][0],cha_counts[0][tile][1][1]-cha_counts[0][tile][1][0]);
 687 | 				}
 688 | #endif
 689 | 
 690 | 				//   CHA counter 1 set to LLC_LOOKUP.READ
 691 | 				//
 692 | 				//  4. Determine which L3 slice owns the cache line and
 693 | 				//  5. Save the CHA number in the cha_by_page[page][line] array
 694 | 
  695 | 				// first do rough quantitative checks of the "goodness" of the data
 696 | 				//		goodness1 = max/NFLUSHES (pass if >95%)
 697 | 				// 		goodness2 = min/NFLUSHES (pass if <20%)
 698 | 				//		goodness3 = avg/NFLUSHES (pass if <40%)
 699 | 				max_count = 0;
 700 | 				min_count = 1<<30;
 701 | 				sum_count = 0;
 702 | 				for (tile=0; tile<NUM_CHA_USED; tile++) {
 703 | 					max_count = MAX(max_count, cha_counts[0][tile][1][1]-cha_counts[0][tile][1][0]);
 704 | 					min_count = MIN(min_count, cha_counts[0][tile][1][1]-cha_counts[0][tile][1][0]);
 705 | 					sum_count += cha_counts[0][tile][1][1]-cha_counts[0][tile][1][0];
 706 | 				}
 707 | 				avg_count = (double)(sum_count - max_count) / (double)(NUM_CHA_USED);
 708 | 				goodness1 = (double) max_count / (double) NFLUSHES;
 709 | 				goodness2 = (double) min_count / (double) NFLUSHES;
 710 | 				goodness3 =          avg_count / (double) NFLUSHES;
 711 | 				// compare the goodness parameters with manually chosen limits & combine into a single pass (good=1) or fail (good=0)
 712 | 				pass1 = 0;
 713 | 				pass2 = 0;
 714 | 				pass3 = 0;
 715 | 				if ( goodness1 > 0.95 ) pass1 = 1;
 716 | 				if ( goodness2 < 0.20 ) pass2 = 1;
 717 | 				if ( goodness3 < 0.40 ) pass3 = 1;
 718 | 				good_new = pass1 * pass2 * pass3;
 719 | #ifdef VERBOSE
 720 | 				printf("GOODNESS: line_number %ld max_count %d min_count %d sum_count %d avg_count %f goodness1 %f goodness2 %f goodness3 %f pass123 %d %d %d\n",
 721 | 								  line_number, max_count, min_count, sum_count, avg_count, goodness1, goodness2, goodness3, pass1, pass2, pass3);
 722 | 				if (good_new == 0) printf("DEBUG: one or more of the sanity checks failed for line=%ld: %d %d %d goodness values %f %f %f\n",
 723 | 					line_number,pass1,pass2,pass3,goodness1,goodness2,goodness3);
 724 | #endif
 725 | 
 726 | 				// test to see if more than one CHA reports > 0.95*NFLUSHES events
 727 | 				found = 0;
 728 | 				old_cha = -1;
 729 | 				int min_counts = (NFLUSHES*19)/20;
 730 | 				for (tile=0; tile<NUM_CHA_USED; tile++) {
 731 | 					if (cha_counts[0][tile][1][1]-cha_counts[0][tile][1][0] >= min_counts) {
 732 | 						old_cha = cha_by_page[page_number][line_number];
 733 | 						cha_by_page[page_number][line_number] = tile;
 734 | 						found++;
 735 | #ifdef VERBOSE
 736 | 						if (found > 1) {
 737 | 							printf("WARNING: Multiple (%d) CHAs found using counter 1 for cache line %ld, index %ld: old_cha %d new_cha %d\n",found,line_number,page_base_index+line_number*8,old_cha,cha_by_page[page_number][line_number]);
 738 | 						}
 739 | #endif
 740 | 					}
 741 | 				}
 742 | 				if (found == 0) {
 743 | 					good_old = 0;
 744 | #ifdef VERBOSE
 745 | 					printf("WARNING: no CHA entry has been found for line %ld!\n",line_number);
 746 | 					printf("DEBUG dump for no CHA found\n");
 747 | 					for (tile=0; tile<NUM_CHA_USED; tile++) {
 748 | 						printf("CHA %d LLC_LOOKUP.READ          delta %ld\n",tile,(cha_counts[0][tile][1][1]-cha_counts[0][tile][1][0]));
 749 | 					}
 750 | #endif
 751 | 				} else if (found == 1) {
 752 | 					good_old = 1;
 753 | 				} else {
 754 | 					good_old = 0;
 755 | #ifdef VERBOSE
 756 | 					printf("DEBUG dump for multiple CHAs found\n");
 757 | 					for (tile=0; tile<NUM_CHA_USED; tile++) {
 758 | 						printf("CHA %d LLC_LOOKUP.READ          delta %ld\n",tile,(cha_counts[0][tile][1][1]-cha_counts[0][tile][1][0]));
 759 | 					}
 760 | #endif
 761 | 				}
 762 | 				good = good_new * good_old;         // trigger a repeat if either the old or new tests failed
 763 | 				}
 764 | 				while (good == 0);
 765 | #if 0
 766 | 				// 6. save the cache line number in the appropriate the cbo_indices[cbo][#lines] array
 767 | 				// 7. increment the corresponding cbo_num_lines[cbo] array entry
 768 | 				this_cbo = cha_by_page[page_number][line_number];
 769 | 				if (this_cbo == -1) {
 770 | 					printf("ERROR: cha_by_page[%ld][%ld] has not been set!\n",page_number,line_number);
 771 | 					exit(80);
 772 | 				}
 773 | 				cbo_indices[this_cbo][cbo_num_lines[this_cbo]] = line_number;
 774 | 				cbo_num_lines[this_cbo]++;
 775 | #endif
 776 | 			}
 777 | 			// I have not overwritten the filename, but I will rebuild it here just in case I add something stupid in between....
 778 | 			sprintf(filename,"PADDR_0x%.12lx.map",paddr_by_page[page_number]);
 779 | 			ptr_mapping_file = fopen(filename,"w");
 780 | 			if (!ptr_mapping_file) {
 781 | 				printf("ERROR: Failed to open Mapping File %s for writing -- aborting\n",filename);
 782 | 				exit(4);
 783 | 			}
 784 | 			// first try -- write one record of 32768 bytes
 785 | 			rc64 = fwrite(&cha_by_page[page_number][0],(size_t) 32768, (size_t) 1, ptr_mapping_file);
 786 | 			if (rc64 != 1) {
 787 | 				printf("ERROR: failed to write one 32768 Byte record to  %s -- return code %ld\n",filename,rc64);
 788 | 				exit(5);
 789 | 			} else {
 790 | 				printf("SUCCESS: wrote mapping file %s\n",filename);
 791 | 			}
 792 | 		}
 793 | 	}
 794 | 	printf("DUMMY: globalsum %d\n",globalsum);
 795 | 	printf("VERBOSE: L3 Mapping Complete in %ld tries for %d cache lines ratio %f\n",totaltries,32768*PAGES_MAPPED,(double)totaltries/(double)(32768*PAGES_MAPPED));
 796 | 
 797 | #ifndef MYHUGEPAGE_1GB
 798 | 	// TODO!!  Fix this so that it is not hard-coded for the 24-core case!!!!
 799 | 	//
 800 | 	// now that the mapping is complete, I can add up the number of lines mapped to each CHA
 801 | 	// be careful to count only the lines that are used, not the full 24MiB
 802 | 	// 3 million elements is ~11.44 2MiB pages, so count all lines in each of the first 11 pages
 803 | 	// If I did the arithmetic correctly, the 3 million elements uses 931328 Bytes of the 12th 2MiB page
 804 | 	// which is 116416 elements or 14552 cache lines.
 805 | 
 806 | 	// first accumulate the first 11 full pages
 807 | 	for (page_number=0; page_number<11; page_number++) {
 808 | 		for (line_number=0; line_number<32768; line_number++) {
 809 | 			lines_by_cha[cha_by_page[page_number][line_number]]++;
 810 | 		}
 811 | 	}
 812 | 	// then accumulate the partial 12th page
 813 | 	for (line_number=0; line_number<14552; line_number++) {
 814 | 		lines_by_cha[cha_by_page[11][line_number]]++;
 815 | 	}
 816 | 	// output
 817 | 	long lines_accounted = 0;
 818 | 	printf("LINES_BY_CHA");
 819 | 	for (i=0; i<NUM_CHA_USED; i++) {
 820 | 		printf(" %ld",lines_by_cha[i]);
 821 | 		lines_accounted += lines_by_cha[i];
 822 | 	}
 823 | 	printf("\n");
 824 | 	printf("ACCCOUNTED FOR %ld lines expected %ld lines\n",lines_accounted,l2_contained_size/8);
 825 | #endif
 826 | // ============== END L3 MAPPING TESTS ==============================
 827 | #endif
 828 | 
 829 | 
 830 | 	// For the snoop filter tests, I want to repeatedly read 
 831 | 	// some number of arrays per core with an aggregate footprint
 832 | 	// close to 1MiB per core 
 833 | 	// 24 cores = 24 MiB = 3 Mi elements, so 
 834 | 	// using an array length of 3 million should be just about right 95.3674%
 835 | 
 836 | 	// l2_contained_size = arraylen;			// only use if I want a large memory-contained version
 837 | 	inner_repetitions = 1000;
 838 | 	int stride = 1;		// used in thread binding checks: use 2 for Dell nodes, 1 for Intel nodes
 839 | 
 840 | 	// try to pre-load the working data into the L2 caches before the initial performance counter reads
 841 | 	sum = 0.0;
 842 | #pragma omp parallel for reduction(+:sum)
 843 | 	for (i=0; i<l2_contained_size; i++) sum += array[i];
 844 | 
 845 | 	// While I am at it, I need to warm up the cores using AVX-512 code to get them to full frequency
 846 | 	// This may take up to 100 microseconds, or maybe 400,000 AVX512 instructions per thread.
 847 | 	// This is a pain because I can't trust the compiler to generate AVX512 code at any given time,
 848 | 	// so I have to resort to inline assembly.
 849 | 	tsc_start = rdtsc();
 850 | #pragma omp parallel for 
 851 | 	for (i=0; i<CORES_USED; i++) {
 852 | 		for (j=0; j<10*1000*1000; j++) {
 853 | 			__asm__ __volatile__ (
 854 | 				"vpaddq %%zmm0, %%zmm1, %%zmm2\n\t"
 855 | 				"vpaddq %%zmm1, %%zmm2, %%zmm3\n\t"
 856 | 				"vpaddq %%zmm2, %%zmm3, %%zmm0\n\t"
 857 | 				"vpaddq %%zmm3, %%zmm0, %%zmm1"
 858 | 			: : : "zmm0","zmm1","zmm2","zmm3");
 859 | 		}
 860 | 	}
 861 | 	tsc_end = rdtsc();
 862 | 	printf("DEBUG: WARMUP LOOP took %lu TSC cycles\n",tsc_end - tsc_start);
 863 | 
 864 | 
 865 | // =================== BEGINNING OF PERFORMANCE COUNTER READS BEFORE KERNEL TESTING ==============================
 866 | #ifdef IMC_COUNTS
 867 | 	// ------------ read the initial values of the IMC counters ------------
 868 |     for (socket=0; socket<NUM_SOCKETS; socket++) {
 869 |         bus = IMC_BUS_Socket[socket];
 870 |         for (channel=0; channel<NUM_IMC_CHANNELS; channel++) {
 871 |             device = IMC_Device_Channel[channel];
 872 |             function = IMC_Function_Channel[channel];
 873 |             for (counter=0; counter<NUM_IMC_COUNTERS; counter++) {
 874 |                 offset = IMC_PmonCtr_Offset[counter];
 875 |                 index = PCI_cfg_index(bus, device, function, offset);
 876 | 
 877 |                 // read each counter twice to identify rare cases where the low-order bits
 878 |                 // overflow and increment the high-order bits between the two reads.
 879 |                 // Use the second set of values unless (( high_1 != high_0 ) && ( low_1 > low_0))
 880 |                 //   (this indicates that the counter rolled between the 3rd and 4th reads).
 881 |                 low_0 = mmconfig_ptr[index];
 882 |                 high_0 = mmconfig_ptr[index+1];
 883 | 
 884 |                 low_1 = mmconfig_ptr[index];
 885 |                 high_1 = mmconfig_ptr[index+1];
 886 | 
 887 |                 if ( (high_1 != high_0) && (low_1 > low_0) ) {
 888 |                     count = ((uint64_t) high_0) << 32 | (uint64_t) low_0;
 889 |                 } else {
 890 |                     count = ((uint64_t) high_1) << 32 | (uint64_t) low_1;
 891 |                 }
 892 |                 imc_counts[socket][channel][counter][0] = count;
 893 |             }
 894 |         }
 895 |     }
 896 | #if 0
 897 | 	// for debugging only: print initial values of IMC counts
 898 | 	for (socket=0; socket<NUM_SOCKETS; socket++) {
 899 | 		for (channel=0; channel<NUM_IMC_CHANNELS; channel++) {
 900 | 			fprintf(stdout,"%d %d",socket,channel);
 901 | 			for (counter=0; counter<NUM_IMC_COUNTERS; counter++) {
 902 | 				fprintf(stdout," %ld",imc_counts[socket][channel][counter][0]);
 903 | 			}
 904 | 			fprintf(stdout,"\n");
 905 | 		}
 906 | 	}
 907 | #endif
 908 | #endif
 909 | 
 910 | #ifdef CHA_COUNTS
 911 | 	// ------------ read the initial values of the CHA mesh counters ------------
 912 | 	for (pkg=0; pkg<2; pkg++) {
 913 | 		for (tile=0; tile<NUM_CHA_USED; tile++) {
 914 | 			for (counter=0; counter<4; counter++) {
 915 | 				msr_num = 0xe00 + 0x10*tile + 0x8 + counter;
 916 | 				pread(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
 917 | 				cha_counts[pkg][tile][counter][0] = msr_val;
 918 | 			}
 919 | 		}
 920 | 	}
 921 | #if 0
 922 | 	// for debugging only: print initial values of CHA counters
 923 | 	for (pkg=0; pkg<2; pkg++) {
 924 | 		for (tile=0; tile<NUM_CHA_USED; tile++) {
 925 | 			for (counter=0; counter<4; counter++) {
 926 | 				printf("Package %d, tile %d, counter %d, value %lu\n",pkg,tile,counter,cha_counts[pkg][tile][counter][0]);
 927 | 			}
 928 | 		}
 929 | 	}
 930 | #endif
 931 | #endif
 932 | 
 933 | 	// ------------ read the initial values of the programmable core counters ------------
 934 | 
 935 | #pragma omp parallel for private(counter)
 936 | 	for (i=0; i<CORES_USED; i++) {
 937 | #ifdef CHECK_THREAD_LOCATION
 938 | 		if (get_core_number() != stride*i) {
 939 | 			printf("ERROR: thread %d is in the wrong place %d\n",i,get_core_number());
 940 | 		}
 941 | #endif
 942 | 		for (counter=0; counter<4; counter++) {
 943 | 			core_counters[i][counter][0] = rdpmc(counter);
 944 | 		}
 945 | 	}
 946 | 
 947 | 	tsc_start = rdtsc();
 948 | 	// ================= CODE UNDER TEST BEGINS HERE ====================
 949 | 
 950 | #ifdef SIMPLE_OMP_LOOP
 951 | 	sum = 0.0;
 952 | 	for (k=0; k<inner_repetitions; k++) {
 953 | #pragma omp parallel for, reduction(+:sum)
 954 | 		for (j=0; j<l2_contained_size; j++) {
 955 | 			sum += array[j];
 956 | 		}
 957 | 	}
 958 | #else
 959 | #pragma omp parallel for private(j,k,iters,private_sum)
 960 | 	for (i=0; i<CORES_USED; i++) {
 961 | 		iters = 0;
 962 | 		fixed_counters[i][0][0] = rdpmc_instructions();
 963 | 		fixed_counters[i][1][0] = rdpmc_actual_cycles();
 964 | 		fixed_counters[i][2][0] = rdpmc_reference_cycles();
 965 | 		fixed_counters[i][3][0] = rdtsc();
 966 | 		for (k=0; k<inner_repetitions; k++) {
 967 | 			private_sum = ssum(&array[jstart[i]],vl[i]);
 968 | 			partial_sums[i] += private_sum;
 969 | 			iters++;
 970 | 		}
 971 | 		fixed_counters[i][0][1] = rdpmc_instructions();
 972 | 		fixed_counters[i][1][1] = rdpmc_actual_cycles();
 973 | 		fixed_counters[i][2][1] = rdpmc_reference_cycles();
 974 | 		fixed_counters[i][3][1] = rdtsc();
 975 | 		iteration_counts[i] = iters;
 976 | 	}
 977 | #endif
 978 | 	// ================ CODE UNDER TEST ENDS HERE ====================
 979 | 	tsc_end = rdtsc();
 980 | 
 981 | 	// use the partial sums so the optimizer will not eliminate code
 982 | 	for (i=0; i<CORES_USED; i++) {
 983 | 		sum += partial_sums[i];
 984 | 	}
 985 | 
 986 | 	// -------------------- read the final values of the Programmable Core counters ------------------------ 
 987 | #pragma omp parallel for private(counter)
 988 | 	for (i=0; i<CORES_USED; i++) {
 989 | #ifdef CHECK_THREAD_LOCATION
 990 | 		if (get_core_number() != stride*i) {
 991 | 			printf("ERROR: thread %d is in the wrong place %d\n",i,get_core_number());
 992 | 		}
 993 | #endif
 994 | 		for (counter=0; counter<4; counter++) {
 995 | 			core_counters[i][counter][1] = rdpmc(counter);
 996 | #ifdef RETRIES
 997 | 			// if the counter returns zero, read it one more time....
 998 | 			if (core_counters[i][counter][1] == SPECIAL_VALUE) {
 999 | 				core_counters[i][counter][1] = rdpmc(counter);
1000 | #pragma omp atomic update 
1001 | 				retries++;
1002 | 			}
1003 | #endif
1004 | 		}
1005 | 	}
1006 | 
1007 | #ifdef CHECK_SPECIAL_VALUES
1008 | 	for (i=0; i<CORES_USED; i++) {
1009 | 		for (counter=0; counter<4; counter++) {
1010 | 			if (core_counters[i][counter][0] == SPECIAL_VALUE) {
1011 | 				printf("DEBUG: SPECIAL_VALUE found after loop in start count on thread %d counter %d\n",i,counter);
1012 | 				zeros++;
1013 | 			}
1014 | 			if (core_counters[i][counter][1] == SPECIAL_VALUE) {
1015 | 				printf("DEBUG: SPECIAL_VALUE found after loop in end count on thread %d counter %d\n",i,counter);
1016 | 				zeros++;
1017 | 			}
1018 | 		}
1019 | 	}
1020 | #endif
1021 | 
1022 | #ifdef CHA_COUNTS
1023 | 	// ------------------- read the final values of the CHA mesh counters ----------------
1024 | 	for (pkg=0; pkg<2; pkg++) {
1025 | 		for (tile=0; tile<NUM_CHA_USED; tile++) {
1026 | 			for (counter=0; counter<4; counter++) {
1027 | 				msr_num = 0xe00 + 0x10*tile + 0x8 + counter;
1028 | 				pread(msr_fd[pkg],&msr_val,sizeof(msr_val),msr_num);
1029 | 				cha_counts[pkg][tile][counter][1] = msr_val;
1030 | 			}
1031 | 		}
1032 | 	}
1033 | #endif
1034 | 
1035 | #ifdef IMC_COUNTS
1036 | 	// ------------------ read the final values of the IMC counters -----------------
1037 | 	for (socket=0; socket<NUM_SOCKETS; socket++) {
1038 | 		bus = IMC_BUS_Socket[socket];
1039 | 		for (channel=0; channel<NUM_IMC_CHANNELS; channel++) {
1040 | 			device = IMC_Device_Channel[channel];
1041 | 			function = IMC_Function_Channel[channel];
1042 | 			for (counter=0; counter<NUM_IMC_COUNTERS; counter++) {
1043 | 				offset = IMC_PmonCtr_Offset[counter];
1044 | 				index = PCI_cfg_index(bus, device, function, offset);
1045 | 
1046 | 				// read each counter twice to identify rare cases where the low-order bits
1047 | 				// overflow and increment the high-order bits between the two reads.
1048 | 				// Use the second set of values unless (( high_1 != high_0 ) && ( low_1 > low_0))
1049 | 				//   (this indicates that the counter rolled between the 3rd and 4th reads).
1050 | 				low_0 = mmconfig_ptr[index];
1051 | 				high_0 = mmconfig_ptr[index+1];
1052 | 
1053 | 				low_1 = mmconfig_ptr[index];
1054 | 				high_1 = mmconfig_ptr[index+1];
1055 | 
1056 | 				if ( (high_1 != high_0) && (low_1 > low_0) ) {
1057 | 					count = ((uint64_t) high_0) << 32 | (uint64_t) low_0;
1058 | 				} else {
1059 | 					count = ((uint64_t) high_1) << 32 | (uint64_t) low_1;
1060 | 				}
1061 | 				imc_counts[socket][channel][counter][1] = count;
1062 | 			}
1063 | 		}
1064 | 	}
1065 | #endif
1066 | // ================================== END OF PERFORMANCE COUNTER READS AFTER TEST  ==============================================
1067 | 
1068 | 	t0 = 0.0;
1069 | 	t1 = (double) (tsc_end - tsc_start) / TSC_GHz / 1.0e9;
1070 | 	printf("Instrumented code required %f seconds to execute\n",t1-t0);
1071 | 	bandwidth = sizeof(double)*(double)l2_contained_size*(double)inner_repetitions / (t1-t0) / 1e9;
1072 | 	printf("Bandwidth %f GB/s\n",bandwidth);
1073 | 	printf("Bandwidth per core %f GB/s\n",bandwidth/(double)CORES_USED);
1074 | 	printf("Approx Bytes/cycle per core %f\n",bandwidth/(double)CORES_USED/2.0);
1075 | 
1076 | 	expected = (double)l2_contained_size * (double)(inner_repetitions) / (double)CORES_USED;
1077 | 	avg_cycles = (double)(tsc_end - tsc_start) / expected;
1078 | 	printf("Average TSC cycles per element %f\n",avg_cycles);
1079 | 
1080 | 	// clear the arrays for the package-level sums
1081 | 	for (pkg=0; pkg<2; pkg++) {
1082 | 		for (counter=0; counter<4; counter++) {			// no point in summing the cycle counts, so exclude counter 4
1083 | 			core_pkg_sums[pkg][counter] = 0;
1084 | 			fixed_pkg_sums[pkg][counter] = 0;
1085 | 			imc_pkg_sums[pkg][counter] = 0;
1086 | 			cha_pkg_sums[pkg][counter] = 0;
1087 | 		}
1088 | 	}
1089 | 
1090 | 	// compute core package sums and optional print
1091 | 	for (i=0; i<CORES_USED; i++) {
1092 | 		for (counter=0; counter<4; counter++) {
1093 | 			delta = corrected_pmc_delta(fixed_counters[i][counter][1],fixed_counters[i][counter][0],fixed_pmc_width);
1094 | 			fixed_pkg_sums[0][counter] += delta;
1095 | 		}
1096 | 		for (counter=0; counter<4; counter++) {
1097 | #ifdef CHECK_SPECIAL_VALUES
1098 | 			if (core_counters[i][counter][0] == SPECIAL_VALUE) {
1099 | 				printf("DEBUG: SPECIAL_VALUE found in post-processing in start count on thread %d counter %d\n",i,counter);
1100 | 			}
1101 | 			if (core_counters[i][counter][1] == SPECIAL_VALUE) {
1102 | 				printf("DEBUG: SPECIAL_VALUE found in post-processing in end count on thread %d counter %d\n",i,counter);
1103 | 			}
1104 | #endif
1105 | 			delta = corrected_pmc_delta(core_counters[i][counter][1],core_counters[i][counter][0],core_pmc_width);
1106 | #ifdef VERBOSE
1107 | 			printf("CORE %d counter %d end %ld start %ld delta %ld\n",i,counter,core_counters[i][counter][1],core_counters[i][counter][0],delta);
1108 | #endif
1109 | 			core_pkg_sums[0][counter] += delta;
1110 | 		}
1111 | 
1112 | 
1113 | 	}
1114 | 
1115 | 	if (dumpall == 1) {
1116 | 		report = 0;
1117 | 		for (i=0; i<CORES_USED; i++) {
1118 | 			for (counter=0; counter<4; counter++) {
1119 | 				delta = corrected_pmc_delta(core_counters[i][counter][1],core_counters[i][counter][0],core_pmc_width);
1120 | 				printf("CORE %d counter %d end %ld start %ld delta %ld\n",i,counter,core_counters[i][counter][1],core_counters[i][counter][0],delta);
1121 | 			}
1122 | 		}
1123 | 	}
1124 | 	report = 1;
1125 | 	dumpall = 0;
1126 | 
1127 | #ifdef CHA_COUNTS
1128 | 	// print out the differences and compute sums of differences
1129 | 	for (pkg=0; pkg<2; pkg++) {
1130 | 		for (tile=0; tile<NUM_CHA_USED; tile++) {
1131 | 			for (counter=0; counter<4; counter++) {
1132 | 				delta = corrected_pmc_delta(cha_counts[pkg][tile][counter][1],cha_counts[pkg][tile][counter][0],uncore_pmc_width);
1133 | #ifdef VERBOSE
1134 | 				printf("CHA pkg %d tile %d counter %d delta %ld\n",pkg,tile,counter,delta);
1135 | #endif
1136 | 				cha_pkg_sums[pkg][counter] += delta;
1137 | 			}
1138 | 		}
1139 | 	}
1140 | #endif
1141 | #ifdef IMC_COUNTS
1142 | 	for (pkg=0; pkg<2; pkg++) {
1143 | 		for (channel=0; channel<NUM_IMC_CHANNELS; channel++) {
1144 | 			for (counter=0; counter<NUM_IMC_COUNTERS; counter++) {
1145 | 				delta = corrected_pmc_delta(imc_counts[pkg][channel][counter][1],imc_counts[pkg][channel][counter][0],uncore_pmc_width);
1146 | #ifdef VERBOSE
1147 | 				printf("IMC pkg %d channel %d counter %d delta %ld\n",pkg,channel,counter,delta);
1148 | #endif
1149 | 				imc_pkg_sums[pkg][counter] += delta;
1150 | 			}
1151 | 		}
1152 | 	}
1153 | #endif
1154 | 
1155 | 	int max_display_pkg = 1;
1156 | 
1157 | 	printf("Expected AVX512 arithmetic instructions (Event 0xC7, Umask 0x40) %ld\n",l2_contained_size*inner_repetitions/8);
1158 | 	for (pkg=0; pkg<max_display_pkg; pkg++) {
1159 | 		for (counter=0; counter<4; counter++) {
1160 | 			printf("CORE_PKG_SUMS pkg %d counter %d sum_delta %ld\n",pkg,counter,core_pkg_sums[pkg][counter]);
1161 | 		}
1162 | 	}
1163 | 
1164 | 	for (pkg=0; pkg<max_display_pkg; pkg++) {
1165 | 		for (counter=0; counter<4; counter++) {
1166 | 			printf("FIXED_PKG_SUMS pkg %d counter %d sum_delta %ld\n",pkg,counter,fixed_pkg_sums[pkg][counter]);
1167 | 		}
1168 | 	}
1169 | 
1170 | 	// the fixed-function counters are measured inside the OpenMP loop, so they should not be contaminated by 
1171 | 	// spin-waiting....
1172 | 	// Compute per-core metrics here -- note that the fixed-function counter set is (Instr, CoreCyc, RefCyc, TSC)
1173 | 	// 		Utilization = RefCyc/TSC (fixed2/fixed3)
1174 | 	// 		AvgGHz_unhalted = CoreCyc/RefCyc * 2.1  (fixed1/fixed2 * 2.1)
1175 | 	// 		AvgGHz_wall = CoreCyc/TSC * 2.1 (fixed1/fixed3 * 2.1)
1176 | 	// 		IPC = Instr/CoreCyc (fixed0/fixed1)
1177 | 	long delta_inst, delta_core, delta_ref, delta_tsc;
1178 | 	double utilization, avg_ghz, ipc;
1179 | 
1180 | 	printf("CORE_UTILIZATION ");
1181 | 	for (i=0; i<CORES_USED; i++) {
1182 | 		delta_ref  = corrected_pmc_delta(fixed_counters[i][2][1],fixed_counters[i][2][0],fixed_pmc_width);
1183 | 		delta_tsc  = corrected_pmc_delta(fixed_counters[i][3][1],fixed_counters[i][3][0],fixed_pmc_width);
1184 | 		utilization = (double)delta_ref / (double)delta_tsc;
1185 | 		printf("%6.4f ",utilization);
1186 | 	}
1187 | 	printf("\n");
1188 | 
1189 | 	printf("CORE_GHZ ");
1190 | 	for (i=0; i<CORES_USED; i++) {
1191 | 		delta_core = corrected_pmc_delta(fixed_counters[i][1][1],fixed_counters[i][1][0],fixed_pmc_width);
1192 | 		delta_ref  = corrected_pmc_delta(fixed_counters[i][2][1],fixed_counters[i][2][0],fixed_pmc_width);
1193 | 		avg_ghz = (double)delta_core / (double)delta_ref * TSC_GHz;
1194 | 		printf("%6.4f ",avg_ghz);
1195 | 	}
1196 | 	printf("\n");
1197 | 
1198 | 	printf("CORE_IPC ");
1199 | 	for (i=0; i<CORES_USED; i++) {
1200 | 		delta_inst = corrected_pmc_delta(fixed_counters[i][0][1],fixed_counters[i][0][0],fixed_pmc_width);
1201 | 		delta_core = corrected_pmc_delta(fixed_counters[i][1][1],fixed_counters[i][1][0],fixed_pmc_width);
1202 | 		ipc = (double)delta_inst / (double)delta_core;
1203 | 		printf("%6.4f ",ipc);
1204 | 	}
1205 | 	printf("\n");
1206 | 
1207 | 	printf("THREAD_EXECUTION_TIME ");
1208 | 	for (i=0; i<CORES_USED; i++) {
1209 | 		delta_tsc  = corrected_pmc_delta(fixed_counters[i][3][1],fixed_counters[i][3][0],fixed_pmc_width);
1210 | 		t0 = (double)delta_tsc / (TSC_GHz*1.0e9);
1211 | 		printf("%f ",t0);
1212 | 	}
1213 | 	printf("\n");
1214 | 
1215 | 
1216 | 
1217 | #ifdef CHA_COUNTS
1218 | 	for (pkg=0; pkg<max_display_pkg; pkg++) {
1219 | 		for (counter=0; counter<4; counter++) {
1220 | 			printf("CHA_PKG_SUMS pkg %d counter %d sum_delta %ld\n",pkg,counter,cha_pkg_sums[pkg][counter]);
1221 | 		}
1222 | 	}
1223 | #endif
1224 | #ifdef IMC_COUNTS
1225 | 	for (pkg=0; pkg<max_display_pkg; pkg++) {
1226 | 		for (counter=0; counter<4; counter++) {			// no point in summing the cycle counts, so exclude counter 4
1227 | 			printf("IMC_PKG_SUMS pkg %d counter %d sum_delta %ld\n",pkg,counter,imc_pkg_sums[pkg][counter]);
1228 | 		}
1229 | 	}
1230 | #endif
1231 | 
1232 | 	
1233 | 
1234 | 
1235 | 	// for the Snoop Filter set 
1236 | 	// 	expected = expected number of cache lines loaded from L2
1237 | 	// 	sf_evict_rate = #evictions / expected number of loads
1238 | 	expected = 8.0/64.0* (double)l2_contained_size * (double) inner_repetitions;
1239 | 	sf_evict_rate = (double) cha_pkg_sums[0][0] / expected;
1240 | 	printf("SnoopFilterEvictionRate %f\n",sf_evict_rate);
1241 | 
1242 | 	expected = (double)l2_contained_size * (double) (inner_repetitions+1); // adjusted for pre-load of data
1243 | 	printf("Dummy Sum value is %f, expected value %f\n",sum,expected);
1244 | 
1245 | 	expected = (double)l2_contained_size * (double) inner_repetitions;
1246 | 	printf("Expected number of cache lines loaded from L2 %f\n",expected/8.0);
1247 | 	printf("Number of performance counter wraprounds detected %d\n",nwraps);
1248 | #ifdef RETRIES
1249 | 	printf("Number of core performance counter reads retried %d\n",retries);
1250 | #endif
1251 | 	printf("Number of zero values found in the inner loop %d\n",zeros);
1252 | 	// printf("Expected Number of Loads for AVX2 code %ld\n",arraylen/4);	
1253 | 	// printf("Expected Number of Cache Lines loaded %ld\n",arraylen/8);	
1254 | 
1255 | 	for (i=0; i<CORES_USED; i++) {
1256 | 		if (iteration_counts[i] != inner_repetitions) {
1257 | 			printf("ERROR: thread %d iteration_counts %ld expected %ld\n",i,iteration_counts[i],inner_repetitions);
1258 | 		}
1259 | 	}
1260 | 
1261 | 	// per-core performance counter values
1262 | 	for (counter=0; counter<4; counter++) {
1263 | 		printf("CORE_counter %d ",counter);
1264 | 		for (i=0; i<CORES_USED; i++) {
1265 | 			delta = corrected_pmc_delta(core_counters[i][counter][1],core_counters[i][counter][0],core_pmc_width);
1266 | 			printf("%ld ",delta);
1267 | 		}
1268 | 		printf("\n");
1269 | 	}
1270 | 	// per-CHA performance counter values -- socket 0 only
1271 | 	for (counter=0; counter<4; counter++) {
1272 | 		printf("CHA_counter %d ",counter);
1273 | 		for (i=0; i<NUM_CHA_USED; i++) {
1274 | 			delta = corrected_pmc_delta(cha_counts[0][i][counter][1],cha_counts[0][i][counter][0],uncore_pmc_width);
1275 | 			printf("%ld ",delta);
1276 | 		}
1277 | 		printf("\n");
1278 | 	}
1279 | 
1280 | 	printf("Double-check physical address of base of array\n");
1281 | 	pagemapentry = get_pagemap_entry(&array[0]);
1282 | 	printf("  array[0] va 0x%.16lx pagemapentry 0x%.16lx\n",&array[0],pagemapentry);
1283 | }
1284 | 


--------------------------------------------------------------------------------
/low_overhead_timers.c:
--------------------------------------------------------------------------------
  1 | // Some very low-overhead timer/counter interfaces:
  2 | //
  3 | // rdtsc() returns the number of "nominal" processor cycles since the system booted in a 64-bit unsigned integer.
  4 | //       For all recent Intel processors, this counter increments at a fixed rate, independent of the actual
  5 | //       core clock speed or the energy-saving mode.
  6 | // rdtscp() is the same as rdtsc except that it is partially ordered -- it will not execute until all prior
  7 | //       instructions in program order have executed.  (See also full_rdtscp)
  8 | // full_rdtscp() returns the number of "nominal" processor cycles in a 64-bit unsigned integer and also 
  9 | //       modifies its two integer arguments to show the processor socket and processor core that were in use
 10 | //       when the call was made.  (Note: the various cores in a chip usually have very similar values for 
 11 | //       the TSC, but they are allowed to vary by processor.  This function guarantees that you know exactly
 12 | //       which processor the TSC reading came from.)
 13 | // get_core_number() uses the RDTSCP instruction, but returns only the core number in an integer variable.
 14 | // get_socket_number() uses the RDTSCP instruction, but returns only the socket number in an integer variable.
 15 | // rdpmc_instructions() uses a "fixed-function" performance counter to return the count of retired instructions on
 16 | //       the current core in the low-order 48 bits of an unsigned 64-bit integer.
 17 | // rdpmc_actual_cycles() uses a "fixed-function" performance counter to return the count of actual CPU core cycles
 18 | //       executed by the current core.  Core cycles are not accumulated while the processor is in the "HALT" state,
 19 | //       which is used when the operating system has no task(s) to run on a processor core.
 20 | // rdpmc_reference_cycles() uses a "fixed-function" performance counter to return the count of "reference" (or "nominal")
 21 | //       CPU core cycles executed by the current core.  This counts at the same rate as the TSC, but does not count
 22 | //       when the core is in the "HALT" state.  If a timed section of code shows a larger change in TSC than in
 23 | //       rdpmc_reference_cycles, the processor probably spent some time in a HALT state.
 24 | // rdpmc() reads the programmable core performance counter number specified in the input argument.
 25 | //		 No error or bounds checking is performed.
 26 | //
 27 | // get_TSC_frequency() parses the Brand Identification string from the CPUID instruction to get the "nominal"
 28 | //       frequency of the processor, which is also the invariant TSC frequency, and returned as a float value in Hz.
 29 | //       This can then be used to convert TSC cycles to seconds.
 30 | //
 31 | 
 32 | extern inline __attribute__((always_inline)) unsigned long rdtsc()
 33 | {
 34 |    unsigned long a, d;
 35 | 
 36 |    __asm__ volatile("rdtsc" : "=a" (a), "=d" (d));
 37 | 
 38 |    return (a | (d << 32));
 39 | }
 40 | 
 41 | 
 42 | extern inline __attribute__((always_inline)) unsigned long rdtscp()
 43 | {
 44 |    unsigned long a, d, c;
 45 | 
 46 |    __asm__ volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c));
 47 | 
 48 |    return (a | (d << 32));
 49 | }
 50 | 
 51 | extern inline __attribute__((always_inline)) unsigned long full_rdtscp(int *chip, int *core)
 52 | {
 53 |    unsigned long a, d, c;
 54 | 
 55 |    __asm__ volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c));
 56 | 	*chip = (c & 0xFFF000UL)>>12;
 57 | 	*core = c & 0xFFFUL;
 58 | 
 59 |    return (a | (d << 32));
 60 | }
 61 | 
 62 | 
 63 | extern inline __attribute__((always_inline)) int get_core_number()
 64 | {
 65 |    unsigned long a, d, c;
 66 | 
 67 |    __asm__ volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c));
 68 | 
 69 |    return ( c & 0xFFFUL );
 70 | }
 71 | 
 72 | extern inline __attribute__((always_inline)) int get_socket_number()
 73 | {
 74 |    unsigned long a, d, c;
 75 | 
 76 |    __asm__ volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c));
 77 | 
 78 |    return ( (c & 0xF000UL)>>12 );
 79 | }
 80 | 
 81 | 
 82 | extern inline __attribute__((always_inline)) unsigned long rdpmc_instructions()
 83 | {
 84 |    unsigned long a, d, c;
 85 | 
 86 |    c = (1UL<<30);
 87 |    __asm__ volatile("rdpmc" : "=a" (a), "=d" (d) : "c" (c));
 88 | 
 89 |    return (a | (d << 32));
 90 | }
 91 | 
 92 | extern inline __attribute__((always_inline)) unsigned long rdpmc_actual_cycles()
 93 | {
 94 |    unsigned long a, d, c;
 95 | 
 96 |    c = (1UL<<30)+1;
 97 |    __asm__ volatile("rdpmc" : "=a" (a), "=d" (d) : "c" (c));
 98 | 
 99 |    return (a | (d << 32));
100 | }
101 | 
// Execute RDPMC with ECX = (1<<30)+2 and return EDX:EAX as one value.
// The file header describes this selector as the fixed-function counter of
// reference ("nominal") cycles, which does not advance while the core is halted.
extern inline __attribute__((always_inline)) unsigned long rdpmc_reference_cycles()
{
   unsigned long lo, hi;
   unsigned long selector = (1UL << 30) | 2UL;

   __asm__ volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (selector));

   return ((hi << 32) | lo);
}
111 | 
// Execute RDPMC with ECX set to the caller-supplied counter selector `c`
// and return EDX:EAX as one value.  The selector is passed through as-is:
// no error or bounds checking is performed.
extern inline __attribute__((always_inline)) unsigned long rdpmc(int c)
{
        unsigned long lo, hi;

        __asm__ volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (c));

        return ((hi << 32) | lo);
}
120 | 
// Width (in bits) of the programmable core performance counters.
// The width varies by processor and is reported in bits 23:16 of EAX after
// executing CPUID with EAX = 0x0a and ECX (subleaf) = 0x0.
// Returns that 8-bit field as an int.
int get_core_counter_width()
{
	unsigned int eax, ebx, ecx, edx;
	const unsigned int leaf = 0x0000000a;
	const unsigned int subleaf = 0x0;

	// all four registers are listed as outputs because CPUID overwrites them
	__asm__ __volatile__ ("cpuid"
	  : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
	  : "a" (leaf), "c" (subleaf));

	return((eax >> 16) & 0xff);
}
138 | 
// Width (in bits) of the fixed-function performance counters.
// The width varies by processor and is reported in bits 12:5 of EDX after
// executing CPUID with EAX = 0x0a and ECX (subleaf) = 0x0.
// Returns that 8-bit field as an int.
int get_fixed_counter_width()
{
	unsigned int eax, ebx, ecx, edx;
	const unsigned int leaf = 0x0000000a;
	const unsigned int subleaf = 0x0;

	// all four registers are listed as outputs because CPUID overwrites them
	__asm__ __volatile__ ("cpuid"
	  : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
	  : "a" (leaf), "c" (subleaf));

	return((edx >> 5) & 0xff);
}
156 | 
157 | // assume that these functions will automatically do the right thing if they are
158 | // included more than once....
159 | #include <stdio.h>
160 | #include <stdlib.h>
161 | 
// Compute (end - start) for a performance counter whose width is only known
// at run time, compensating for a single rollover of the counter.
//
//   end, start : raw counter readings (end taken after start)
//   pmc_width  : counter width in bits, 1..64; obtain it with
//                get_core_counter_width() or get_fixed_counter_width()
//
// Returns the corrected difference.  If pmc_width is outside 1..64, a message
// is written to stderr and the all-ones value 0xffffffffffffffff is returned.
unsigned long corrected_pmc_delta(unsigned long end, unsigned long start, int pmc_width)
{
	const unsigned long failure = ~0UL;

	// reject widths that cannot describe a counter held in an unsigned long
	if ((pmc_width < 1) || (pmc_width > 64)) {
		fprintf(stderr,"ERROR: corrected_pmc_delta() called with illegal performance counter width %d\n",pmc_width);
		return failure;
	}

	// Two cases need no explicit correction:
	//  - a full-width (64-bit) counter: unsigned subtraction wraps modulo 2^64,
	//    so (end - start) is already right even when end < start;
	//  - a narrower counter that has not rolled over (end >= start).
	if ((pmc_width == 64) || (end >= start)) {
		return (end - start);
	}

	// narrower counter that rolled over: add back one full counter period
	return ((end + (1UL<<pmc_width)) - start);
}
193 | 
// Execute CPUID for the given leaf/subleaf and store EAX, EBX, ECX, EDX
// in regs[0..3] (private helper for get_TSC_frequency).
static void tsc_brand_cpuid(unsigned int leaf, unsigned int subleaf, unsigned int regs[4])
{
	__asm__ __volatile__ ("cpuid"
	  : "=a" (regs[0]), "=b" (regs[1]), "=c" (regs[2]), "=d" (regs[3])
	  : "a" (leaf), "c" (subleaf));
}

// Ugly hack to get the nominal frequency from the CPUID Brand String
// on Intel processors.
// The 48-byte brand string is assembled from CPUID leaves 0x80000002 through
// 0x80000004, then scanned backwards for the text "GHz"; the blank-delimited
// token that ends in "GHz" is parsed as a floating-point number.
// Only works for products that use "GHz" as the frequency designator,
// not "MHz" or "THz".
// Return value is the frequency in Hz (so divide by 1e9 if GHz is desired),
// or -1.0 if the brand string leaves are not reported as supported or no
// parsable "<number>GHz" token is found.
float get_TSC_frequency()
{
	enum { BRAND_LEN = 48 };
	unsigned int regs[4];
	char brand[BRAND_LEN + 1];		// one extra byte so the string is always NUL-terminated
	unsigned int leaf;
	int pos, r, b, z, start;
	float freq_GHz;

	// EAX of leaf 0x80000000 is the highest supported extended leaf; do not
	// interpret the brand string leaves unless all three are available.
	tsc_brand_cpuid(0x80000000, 0, regs);
	if (regs[0] < 0x80000004) {
		return(-1.0);
	}

	// Each leaf supplies 16 characters in EAX, EBX, ECX, EDX (in that order),
	// least-significant byte first within each register.
	pos = 0;
	for (leaf=0x80000002; leaf<=0x80000004; leaf++) {
		tsc_brand_cpuid(leaf, 0, regs);
		for (r=0; r<4; r++) {
			for (b=0; b<4; b++) {
				brand[pos++] = (char) ((regs[r] >> (8*b)) & 0xff);
			}
		}
	}
	brand[BRAND_LEN] = '\0';

	// Scan backwards for "GHz".  z is the index of the 'z', so z >= 2 keeps
	// the 'H' and 'G' lookups inside the buffer.
	for (z=BRAND_LEN-1; z>=2; z--) {
		if ((brand[z] != 'z') || (brand[z-1] != 'H') || (brand[z-2] != 'G')) {
			continue;
		}
		// Walk back from the 'G' to the first character after the preceding
		// blank, stopping at the start of the buffer if there is no blank.
		start = z-2;
		while ((start > 0) && (brand[start-1] != ' ')) {
			start--;
		}
		// sscanf stops at the first character that cannot be part of the
		// number (the 'G'); only accept the result if a value was converted.
		if (sscanf(&brand[start],"%f",&freq_GHz) == 1) {
			return (float) (1.0e9*freq_GHz);
		}
	}
	return(-1.0);
}
280 | 
281 | 


--------------------------------------------------------------------------------
/low_overhead_timers.h:
--------------------------------------------------------------------------------
 1 | // Some timers
 2 | //
 3 | // rdtsc() returns the number of "nominal" processor cycles since the system booted in a 64-bit unsigned integer.
 4 | //       For all recent Intel processors, this counter increments at a fixed rate, independent of the actual
 5 | //       core clock speed or the energy-saving mode.
 6 | // rdtscp() is the same as rdtsc except that it is partially ordered -- it will not execute until all prior
 7 | //       instructions in program order have executed.  (See also full_rdtscp)
 8 | // full_rdtscp() returns the number of "nominal" processor cycles in a 64-bit unsigned integer and also 
 9 | //       modifies its two integer arguments to show the processor socket and processor core that were in use
10 | //       when the call was made.  (Note: the various cores in a chip usually have very similar values for 
11 | //       the TSC, but they are allowed to vary by processor.  This function guarantees that you know exactly
12 | //       which processor the TSC reading came from.)
13 | // get_core_number() uses the RDTSCP instruction, but returns only the core number in an integer variable.
14 | // get_socket_number() uses the RDTSCP instruction, but returns only the socket number in an integer variable.
15 | // rdpmc_instructions() uses a "fixed-function" performance counter to return the count of retired instructions on
16 | //       the current core in the low-order 48 bits of an unsigned 64-bit integer.
17 | // rdpmc_actual_cycles() uses a "fixed-function" performance counter to return the count of actual CPU core cycles
18 | //       executed by the current core.  Core cycles are not accumulated while the processor is in the "HALT" state,
19 | //       which is used when the operating system has no task(s) to run on a processor core.
20 | // rdpmc_reference_cycles() uses a "fixed-function" performance counter to return the count of "reference" (or "nominal")
21 | //       CPU core cycles executed by the current core.  This counts at the same rate as the TSC, but does not count
22 | //       when the core is in the "HALT" state.  If a timed section of code shows a larger change in TSC than in
23 | //       rdpmc_reference_cycles, the processor probably spent some time in a HALT state.
24 | // rdpmc() reads the programmable core performance counter number specified in the input argument.
25 | //		 No error or bounds checking is performed.
26 | //
27 | // get_TSC_frequency() parses the Brand Identification string from the CPUID instruction to get the "nominal"
28 | //       frequency of the processor, which is also the invariant TSC frequency, and returned as a float value in Hz.
29 | //       This can then be used to convert TSC cycles to seconds.
30 | //
31 | 
#ifndef LOW_OVERHEAD_TIMERS_H
#define LOW_OVERHEAD_TIMERS_H

unsigned long rdtsc();
unsigned long rdtscp();
unsigned long full_rdtscp(int *chip, int *core);
int get_core_number();
int get_socket_number();
unsigned long rdpmc_instructions();
unsigned long rdpmc_actual_cycles();
unsigned long rdpmc_reference_cycles();
unsigned long rdpmc(int c);
// get_core_counter_width() -- presumably the width of the programmable core
//       counters, by analogy with get_fixed_counter_width(); confirm against
//       the implementation.
int get_core_counter_width();
// get_fixed_counter_width() returns the counter width field that CPUID leaf 0x0a
//       (subleaf 0) reports in EDX bits 12:5.
int get_fixed_counter_width();
// corrected_pmc_delta() returns (end - start) corrected for one rollover of a counter
//       that is pmc_width bits wide (1..64); an illegal width returns 0xffffffffffffffff.
unsigned long corrected_pmc_delta(unsigned long end, unsigned long start, int pmc_width);
float get_TSC_frequency();

#endif // LOW_OVERHEAD_TIMERS_H
45 | 


--------------------------------------------------------------------------------
/run_ensemble.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # If the root user is going to run the SnoopFilterMapper or SF_test_offsets binaries,
 4 | # the LD_LIBRARY_PATH may need to be defined to point to the OpenMP runtime
 5 | # library.  
 6 | # If the binaries are tagged as "setuid root", the Makefile should be modified
 7 | # to include an "rpath" option pointing to the OpenMP runtime library.
 8 | 
 9 | NUMTRIALS=100
10 | MAXTRIAL=$(( $NUMTRIALS - 1 ))
11 | 
12 | # Pin the threads to sequential cores on socket 0 and do not
13 | # let them switch thread contexts (if HyperThreading is enabled).
14 | # Similar to adding "granularity=fine" to the KMP_AFFINITY variable
15 | export KMP_HW_SUBSET=1s,24c,1t
16 | export KMP_AFFINITY=compact
17 | export OMP_NUM_THREADS=24
18 | 
19 | for TRIAL in `seq 0 $MAXTRIAL`
20 | do
21 | 	LABEL=`printf %.3d $TRIAL`
22 | 	time ./SnoopFilterMapper > log.2MiB.$LABEL
23 | done
24 | 


--------------------------------------------------------------------------------
/ssum.c:
--------------------------------------------------------------------------------
 1 | double ssum (double *a, long vl)
 2 | {
 3 | 	long i;
 4 | 	double sum;
 5 | 
 6 | 	sum = 0.0;
 7 | 	for (i=0; i<vl; i++) {
 8 | 		sum += a[i];
 9 | 	}
10 | 	return (sum);
11 | }
12 | 


--------------------------------------------------------------------------------
/va2pa_lib.c:
--------------------------------------------------------------------------------
  1 | #define _GNU_SOURCE
  2 | 
  3 | #define _XOPEN_SOURCE 500	// required by pread
  4 | #include <stdio.h>			// required by printf
  5 | #include <unistd.h>			// required by pread, close, getpid
  6 | #include <sys/types.h>		// required by open, getpid
  7 | #include <sys/stat.h>		// required by open
  8 | #include <fcntl.h>			// required by open
  9 | 
 10 | // declarations that calling routines will need
 11 | void print_pagemap_entry(unsigned long long pagemap_entry);
 12 | unsigned long long get_pagemap_entry( void * va );
 13 | 
 14 | // -----------------------------------------------------------------------------------------
 15 | // Function to take any pointer and look up the entry in /proc/$pid/pagemap
 16 | //   No error handling -- caller should check errno on a 0 return value.
 17 | //   Does not attempt to interpret bits -- returns them all in an unsigned long long
 18 | //   Returns 0 if the page is not currently mapped, or if an error occurs.
 19 | //   Note that the page shift bits are wrong in 2.6.32 kernels (and 2.6.34 on MIC).
 20 | //     I have not been able to figger out if these bits are correct in any Linux version.
 21 | //
 22 | // John D. McCalpin, mccalpin@tacc.utexas.edu
 23 | // Revised to 2013-04-18
 24 | 
 25 | unsigned long long get_pagemap_entry( void * va )
 26 | {
 27 | 	ssize_t ret;
 28 | 	off_t myoffset;
 29 | 
 30 | 	pid_t mypid;
 31 | 	char filename[32];		// needs 15 characters for the "/proc/" and "/pagemap", plus enough for the PID
 32 | 	unsigned long long result;
 33 | 	static int pagemap_fd;
 34 | 	static int initialized=0;
 35 | 
 36 | 	// on first call: get process pid, open /proc/$pid/pagemap, and save the file descriptor for subsequent calls
 37 | 	if (initialized == 0) {
 38 | 		mypid = getpid();
 39 | 		sprintf(filename,"/proc/%d/pagemap",mypid);
 40 | 		pagemap_fd = open(filename, O_RDONLY);
 41 | 		if (pagemap_fd == 0) {
 42 | 			return(0UL);		// user must check errno if a zero value is returned
 43 | 		}
 44 | 		initialized = 1;
 45 | 	}
 46 | 
 47 | 	myoffset = ((long long) va >> 12) << 3;	 // required to cast void pointer before using it
 48 | 
 49 | 	ret = pread(pagemap_fd, &result, 8, myoffset);
 50 | 	if (ret != 8) {
 51 | 		return (0UL);			// user must check errno if a zero value is returned
 52 | 	}
 53 | 	return(result);
 54 | }
 55 | // -----------------------------------------------------------------------------------------
 56 | 
 57 | 
 58 | // -----------------------------------------------------------------------------------------
 59 | // McCalpin's function to print the PFN (Page Frame Number) from entries returned by get_pagemap_entry()
 60 | //   Warnings are printed if
 61 | //     a. the page is not present (bit 63 != 1), or
 62 | //     b. the page is swapped (bit 62 == 1)
 63 | //   Note that the page shift bits are wrong in 2.6.25 through 2.6.32 kernels, fixed in 2.6.33
 64 | //     Therefore this only prints the page frame number (bits 0..55)
 65 | //   The PFN value will not make sense if the page is swapped (bit 62 set)
 66 | //   The value should be all 0's if the page is unmapped, or if get_pagemap_entry() returned an error.
 67 | //
 68 | // John D. McCalpin, mccalpin@tacc.utexas.edu
 69 | // Revised to 2013-04-18
 70 | 
 71 | #define BIT_IS_SET(x, n)   (((x) & (1UL<<(n)))?1:0)	// more convenient 0/1 result
 72 | 
 73 | void print_pagemap_entry(unsigned long long pagemap_entry)
 74 | {
 75 | 	int logpagesize;
 76 | 	int pagesize;
 77 | 	unsigned long framenumber;
 78 | 	unsigned long tmp;
 79 | 
 80 | 	tmp = BIT_IS_SET(pagemap_entry, 63);		// could also use ( (pagemap_entry >> 63) != 1 ) as the test
 81 | 	if (tmp == 0) {
 82 | 		printf("WARNING in print_pagemap_entry: page is not present.  Result = %.16lx\n",pagemap_entry);
 83 | 	}
 84 | 	tmp = BIT_IS_SET(pagemap_entry, 62);
 85 | 	if (tmp != 0) {
 86 | 		printf("WARNING in print_pagemap_entry: page is swapped.  Result = %.16lx\n",pagemap_entry);
 87 | 	}
 88 | 
 89 | 	framenumber = ( (pagemap_entry<<9) >> 9);	// clear upper 9 bits -- only works for unsigned types
 90 | 
 91 | #ifdef OLDKERNEL
 92 | 	// Page size bits are broken in 2.6.32 kernels (Stampede) and 2.6.34 kernels (MIC)
 93 | 	printf("print_pagemap_entry: argument = 0x%.16lx, framenumber = 0x%.16lx\n",pagemap_entry,framenumber);
 94 | #else
 95 | 	// Dunno if this is fixed in newer kernels -- clearly broken in 2.6.32 and 2.6.34 (MIC)
 96 | 	logpagesize = ( (pagemap_entry<<3) >> 58);	// clear bits 61-63, then shift (original) bit 55 down to 0;
 97 | 	pagesize = 1 << logpagesize;
 98 | 	printf("print_pagemap_entry: logpagesize = %d, pagesize = %d, framenumber = 0x%.16lx\n",logpagesize,pagesize,framenumber);
 99 | #endif
100 | }
101 | 
102 | 


--------------------------------------------------------------------------------