├── .gitignore ├── INSTALL ├── LICENSE ├── Makefile ├── README ├── README.md ├── include ├── atomic_ops.h ├── barrier.h ├── ccbench.h ├── common.h └── pfd.h ├── scripts ├── events_all ├── run_niagara.sh ├── run_opteron.sh ├── run_tilera.sh ├── run_with_confidence.sh └── run_xeon.sh └── src ├── .gitignore ├── barrier.c ├── ccbench.c └── pfd.c /.gitignore: -------------------------------------------------------------------------------- 1 | *.a 2 | *.o 3 | *~ 4 | /#*compilation*# 5 | /#INSTALL# 6 | /#README# 7 | /#d# 8 | /#free.sh# 9 | /#moesi.c# 10 | /#pfd.h# 11 | /#run_opteron.sh# 12 | /*compilation* 13 | /*eshell* 14 | /.#INSTALL 15 | /.#README 16 | /.#README.md 17 | /.#d 18 | /.#moesi.c 19 | /.#moesi.h 20 | /.#pfd.h 21 | /.#run_opteron.sh 22 | /.tmp 23 | /any.sh 24 | /cat_proc.sh 25 | /ccbench 26 | /ccbench.S 27 | /conf.run.tmp 28 | /core 29 | /d 30 | /dd 31 | /free.sh 32 | /help 33 | /moesi 34 | /moesi_old 35 | /moesi_sosp 36 | /moesi_th.c 37 | /msr 38 | /ps_a.sh 39 | /run 40 | /run_with_confidence.sh 41 | /s 42 | /ss 43 | /sss 44 | /xeon_cores 45 | /xeon_dist.out 46 | cscope.* 47 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | Steps to install ccbench: 2 | ------------------------ 3 | 4 | 1. Fix the Makefile (not always necessary) 5 | 6 | The Makefile sets some parameters based on which is the host you are running on. 7 | The parameters are: 8 | * PLATFORM : the platform name used to set platform specific parameters in the code 9 | * CC : the compiler to be used 10 | * CFLAGS : the compilation flags 11 | * LDFLAGS : the libraries to link with 12 | * VER_FLAGS : version flags, such as the platform name 13 | 14 | If a configuration is not specified, the DEFAULT configuration is used (it should work for most x86 platforms). 15 | 16 | 2. 
Compile for the target platform 17 | 18 | In the base folder of the project execute: 19 | make 20 | 21 | 3. ./ccbench -h 22 | 23 | You will get all the details you need in order to use the application. 24 | 25 | 26 | Tested platforms: 27 | ----------------- 28 | 29 | ccbench has been tested on the following platforms: 30 | * UMA and NUMA x86_64 31 | * SPARC (UltraSPARC T2, UltraSPARC T4-4) 32 | * Tilera (Tile-GX36, TILEPro64) 33 | 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Vasileios Trigonakis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SRC = src 2 | INCLUDE = include 3 | 4 | CFLAGS = -O3 -Wall 5 | LDFLAGS = -lm -lrt 6 | VER_FLAGS = -D_GNU_SOURCE 7 | 8 | ifeq ($(VERSION),DEBUG) 9 | CFLAGS = -O0 -ggdb -Wall -g -fno-inline 10 | endif 11 | 12 | UNAME := $(shell uname -n) 13 | 14 | ifeq ($(UNAME), lpd48core) 15 | PLATFORM = OPTERON 16 | CC = gcc 17 | PLATFORM_NUMA = 1 18 | endif 19 | 20 | ifeq ($(UNAME), diassrv8) 21 | PLATFORM = XEON 22 | CC = gcc 23 | PLATFORM_NUMA = 1 24 | endif 25 | 26 | ifeq ($(UNAME), maglite) 27 | PLATFORM = NIAGARA 28 | CC = /opt/csw/bin/gcc 29 | CFLAGS += -m64 -mcpu=v9 -mtune=v9 30 | endif 31 | 32 | ifeq ($(UNAME), parsasrv1.epfl.ch) 33 | PLATFORM = TILERA 34 | CC = tile-gcc 35 | LDFLAGS += -ltmc 36 | endif 37 | 38 | ifeq ($(UNAME), diascld19) 39 | PLATFORM = XEON2 40 | CC = gcc 41 | endif 42 | 43 | ifeq ($(UNAME), diascld9) 44 | PLATFORM = OPTERON2 45 | CC = gcc 46 | endif 47 | 48 | ifeq ($(PLATFORM), ) 49 | PLATFORM = DEFAULT 50 | CC = gcc 51 | endif 52 | 53 | VER_FLAGS += -D$(PLATFORM) 54 | 55 | ifeq ($(PLATFORM_NUMA),1) #give PLATFORM_NUMA=1 for NUMA 56 | LDFLAGS += -lnuma 57 | VER_FLAGS += -DPLATFORM_NUMA 58 | endif 59 | 60 | default: ccbench 61 | 62 | all: ccbench 63 | 64 | ccbench: ccbench.o $(SRC)/pfd.c $(SRC)/barrier.c $(INCLUDE)/common.h $(INCLUDE)/ccbench.h $(INCLUDE)/pfd.h $(INCLUDE)/barrier.h barrier.o pfd.o 65 | $(CC) $(VER_FLAGS) -o ccbench ccbench.o pfd.o barrier.o $(CFLAGS) $(LDFLAGS) -I./$(INCLUDE) 66 | 67 | ccbench.o: $(SRC)/ccbench.c $(INCLUDE)/ccbench.h 68 | $(CC) $(VER_FLAGS) -c $(SRC)/ccbench.c $(CFLAGS) -I./$(INCLUDE) 69 | 70 | pfd.o: $(SRC)/pfd.c $(INCLUDE)/pfd.h 71 | $(CC) $(VER_FLAGS) -c $(SRC)/pfd.c $(CFLAGS) -I./$(INCLUDE) 72 | 73 | barrier.o: $(SRC)/barrier.c $(INCLUDE)/barrier.h 74 | $(CC) $(VER_FLAGS) -c $(SRC)/barrier.c $(CFLAGS) -I./$(INCLUDE) 75 | 76 | clean: 77 
| rm -f *.o ccbench 78 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | ccbench is a tool for measuring the cache-coherence latencies of a processor, i.e., the latencies of loads, stores, compare-and-swap (CAS), fetch-and-increment (FAI), test-and-set (TAS), and swap (SWAP). The latencies that ccbench measures can be used to understand and predict the behavior of sharing and synchronization on the underlying hardware platform. 2 | 3 | * Website : http://lpd.epfl.ch/site/ccbench 4 | * Author : Vasileios Trigonakis 5 | * Related Publications: ccbench is a part of the SSYNC synchronization suite 6 | (http://lpd.epfl.ch/site/ssync), developed for: 7 | Everything You Always Wanted to Know about Synchronization but Were Afraid to Ask, 8 | Tudor David, Rachid Guerraoui, Vasileios Trigonakis (alphabetical order), 9 | SOSP '13 - Proceeding of the 24th ACM Symposium on Operating Systems Principles 10 | 11 | 12 | Installation: 13 | ------------- 14 | 15 | Please refer to the INSTALL file. 16 | 17 | 18 | Using ccbench: 19 | -------------- 20 | 21 | Execute: 22 | ./ccbench -h 23 | to get the parameters and the supported events of ccbench 24 | 25 | 26 | Details: 27 | -------- 28 | ccbench brings a single cache line L in the desired MESI state and position in the processor and then 29 | performs that target operation on L. In more details, ccbench takes the following steps: 30 | 1. It uses one (or more) cores to bring L in the desired state and position, 31 | e.g., in a Modified state in the local caches of core 0 in node 0. 32 | 2. It then uses another core in order to perform the target operation, e.g., load from a 33 | modified state that is on the local caches of a core that is on the same node. 34 | 35 | 36 | 37 | Limitations: 38 | ------------ 39 | 40 | Measuring latencies at this low level is not easy. 
Most of the events work as intended on all platforms. 41 | However, there are some subtle details that one should be aware of in order to "successfully" use 42 | ccbench: 43 | * The memory fences to be used are related to the memory consistency model of the underlying 44 | platform. For instance, on an AMD Opteron Magny-Cours we can measure both loads and stores 45 | without using any fences (ccbench -e0). Contrarily, on an Intel Xeon Westmere-EX, we can 46 | measure the loads with a load fence, but a store needs a full fence (so, ccbench -e8). 47 | * The stride parameter is used to try to fool the hardware prefetchers. This is also a 48 | hardware dependent parameter. 49 | * There are certain cases where you might need to compile ccbench with -O0 flag instead of 50 | the default -O3 to be able to get the results. Known cases: 51 | * on the Tile-GX36, you probably need to compile with -O0 to get sensible number 52 | for the atomic ops 53 | * on UltraSPARC T2, you probably need to compile with -O0 for all operations 54 | except the atomic ops 55 | 56 | 57 | Interpreting the results: 58 | ------------------------- 59 | 60 | The comments prefixed with "#######" explain the results. 61 | 62 | 63 | ####### settings: 64 | 65 | test: LOAD_FROM_MODIFIED / #cores: 2 / #repetitions: 1000 / stride: 4096 (256 kiB) / fence: load/full 66 | core1: 1 / core2: 2 67 | 68 | ####### warnings regarding the profiler correction. If the calculation fails for 10 times (i.e, the 69 | ####### correction calculation does not have a low std deviation, the correction is set manually 70 | ####### to a give (in src/pfd.c) platform-specific value. If the default value is not set, the 71 | ####### avg corrections are still used (this works ok in my experience) 72 | 73 | * warning: avg pfd correction is 20.2 with std deviation: 16.3%. Recalculating. 
74 | * warning: setting pfd correction manually 75 | -- pfd correction: 20 (std deviation: 22.2%) 76 | * warning: avg pfd correction is 20.3 with std deviation: 17.0%. Recalculating. 77 | * warning: setting pfd correction manually 78 | -- pfd correction: 20 (std deviation: 22.2%) 79 | 80 | ####### results 81 | 82 | [00] *** Core 0 ********************************************************************************** 83 | 84 | ---- statistics: 85 | 86 | ####### global avg and deviations 87 | 88 | [00] avg : 111.5 abs dev : 2.5 std dev : 4.5 num : 1000 89 | [00] min : 32.0 (element: 779) max : 136.0 (element: 415) 90 | 91 | ####### clustering of values around the global avg. This used as an easy way to remove the outliers 92 | ####### columns: % around the avg / num of sample / % of the total num of sample / avg of the cluster / 93 | ####### absolute deviation of the cluster / standard deviation of the cluster 94 | 95 | [00] 0-10% : 987 ( 98.7% | avg: 111.5 | abs dev: 2.3 | std dev: 3.0 = 2.7%) 96 | [00] 10-25% : 11 ( 1.1% | avg: 126.2 | abs dev: 3.5 | std dev: 4.2 = 3.3%) 97 | [00] 25-50% : 1 ( 0.1% | avg: 65.0 | abs dev: 0.0 | std dev: 0.0 = 0.0%) 98 | [00] 50-75% : 1 ( 0.1% | avg: 32.0 | abs dev: 0.0 | std dev: 0.0 = 0.0%) 99 | [00] 75-100% : 0 ( 0.0% | avg: -nan | abs dev: -nan | std dev: -nan = -nan%) 100 | 101 | [01] *** Core 1 ********************************************************************************** 102 | 103 | ---- statistics: 104 | [01] avg : 112.3 abs dev : 2.5 std dev : 5.4 num : 1000 105 | [01] min : 10.0 (element: 902) max : 133.0 (element: 404) 106 | [01] 0-10% : 989 ( 98.9% | avg: 112.4 | abs dev: 2.2 | std dev: 2.9 = 2.6%) 107 | [01] 10-25% : 9 ( 0.9% | avg: 126.0 | abs dev: 1.8 | std dev: 2.7 = 2.1%) 108 | [01] 25-50% : 0 ( 0.0% | avg: -nan | abs dev: -nan | std dev: -nan = -nan%) 109 | [01] 50-75% : 0 ( 0.0% | avg: -nan | abs dev: -nan | std dev: -nan = -nan%) 110 | [01] 75-100% : 2 ( 0.2% | avg: 13.5 | abs dev: 3.5 | std dev: 3.5 = 25.9%) 111 
| 112 | ####### The meaning of the results 113 | 114 | [00] ** Results from Core 0 : store to owned mine (if owned state supported, else exclusive) 115 | [00] ** Results from Core 1 : load from modified (makes it owned, if owned state supported) 116 | 117 | ####### The final value in the cache line that was used / the sum of all loads on this core 118 | ####### These values can be used for ensuring the correctness of some test (e.g., FAI) 119 | 120 | [00] value of cl is 0 / sum is 0 121 | [01] value of cl is 0 / sum is 0 122 | 123 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ccbench 2 | ======= 3 | 4 | ccbench is a tool for measuring the cache-coherence latencies of a processor, i.e., the latencies of `loads`, `stores`, `compare-and-swap (CAS)`, `fetch-and-increment (FAI)`, `test-and-set (TAS)`, and `swap (SWAP)`. The latencies that ccbench measures can be used to understand and predict the behavior of sharing and synchronization on the underlying hardware platform. 5 | 6 | * Website : http://lpd.epfl.ch/site/ccbench 7 | * Author : Vasileios Trigonakis 8 | * Related Publications: ccbench is a part of the SSYNC synchronization suite 9 | (http://lpd.epfl.ch/site/ssync): 10 | Everything You Always Wanted to Know about Synchronization but Were Afraid to Ask, 11 | Tudor David, Rachid Guerraoui, Vasileios Trigonakis (alphabetical order), 12 | SOSP '13 - Proceeding of the 24th ACM Symposium on Operating Systems Principles 13 | 14 | 15 | Installation: 16 | ------------- 17 | 18 | Please refer to the `INSTALL` file. 
19 | 20 | 21 | Using ccbench: 22 | -------------- 23 | 24 | Execute: 25 | `./ccbench -h` 26 | to get the parameters and the supported events of ccbench 27 | 28 | 29 | Details: 30 | -------- 31 | ccbench brings a single cache line L in the desired MESI state and position in the processor and then 32 | performs that target operation on L. In more details, ccbench takes the following steps: 33 | 1 It uses one (or more) cores to bring L in the desired state and position, 34 | e.g., in a Modified state in the local caches of core 0 in node 0. 35 | 2 It then uses another core in order to perform the target operation, e.g., load from a 36 | modified state that is on the local caches of a core that is on the same node. 37 | 38 | 39 | 40 | Limitations: 41 | ------------ 42 | 43 | Measuring latencies at this low level is not easy. Most of the events work as intended on all platforms. 44 | However, there are some subtle details that one should be aware of in order to "successfully" use 45 | ccbench: 46 | * The memory fences to be used are related to the memory consistency model of the underlying platform. For instance, on an `AMD Opteron Magny-Cours` we can measure both `loads` and `stores` without using any fences (`ccbench -e0`). Contrarily, on an `Intel Xeon Westmere-EX`, we can measure the loads with a `load fence`, but a store needs a full fence (so, `ccbench -e8`). 47 | * The stride parameter is used to try to fool the hardware prefetchers. This is also a hardware dependent parameter. 48 | * There are certain cases where you might need to compile ccbench with `-O0` flag instead of the default `-O3` to be able to get the results. 
Known cases: 49 | * on the Tile-GX36, you probably need to compile with `-O0` to get sensible number for the atomic ops 50 | * on UltraSPARC T2, you probably need to compile with `-O0` for all operations 51 | except the atomic ops 52 | 53 | 54 | Interpreting the results: 55 | ------------------------- 56 | 57 | The comments prefixed with "#>>" explain the results. 58 | 59 |
 60 | #>> settings:
 61 | test: LOAD_FROM_MODIFIED / #cores: 2 / #reps: 1000 / stride: 4096 / fence: load/full
 62 | core1:   1 / core2:   2
 63 | 
 64 | #>> warnings regarding the profiler correction. If the calculation fails for 10 times 
 65 | #>> (i.e., the correction calculation does not have a low std deviation), the correction 
 66 | #>> is manually set to a given (in src/pfd.c) platform-specific value. If the default 
 67 | #>> value is not set, the avg corrections are still used. 
 68 | #>> (This approach works OK in my experience.)
 69 | 
 70 | * warning: avg pfd correction is 20.2 with std deviation: 16.3%. Recalculating.
 71 | * warning: setting pfd correction manually
 72 |  -- pfd correction: 20 (std deviation: 22.2%)
 73 | * warning: avg pfd correction is 20.3 with std deviation: 17.0%. Recalculating.
 74 | * warning: setting pfd correction manually
 75 |  -- pfd correction: 20 (std deviation: 22.2%)
 76 | 
 77 | #>> results
 78 | 
 79 | [00]  *** Core  0 ***************************************************************
 80 | 
 81 |  ---- statistics:
 82 | 
 83 | #>> global avg and deviations
 84 | 
 85 | [00]  avg : 111.5      abs dev : 2.5        std dev : 4.5        num     : 1000
 86 | [00]  min : 32.0       (element:    779)    max     : 136.0      (element:    415)
 87 | 
 88 | #>> clustering of values around the global avg. This approach is used as an easy way 
 89 | #>> of removing outlier measurements. The columns represent:
 90 | #>> % group / num of samples / % of the total num of samples / avg of the cluster /
 91 | #>> absolute deviation of the cluster / standard deviation of the cluster
 92 | 
 93 | [00]   0-10% : 987 ( 98.7% | avg: 111.5 | abs dev:  2.3 | std dev:  3.0 =   2.7%)
 94 | [00]  10-25% : 11  (  1.1% | avg: 126.2 | abs dev:  3.5 | std dev:  4.2 =   3.3%)
 95 | [00]  25-50% : 1   (  0.1% | avg:  65.0 | abs dev:  0.0 | std dev:  0.0 =   0.0%)
 96 | [00]  50-75% : 1   (  0.1% | avg:  32.0 | abs dev:  0.0 | std dev:  0.0 =   0.0%)
 97 | [00] 75-100% : 0   (  0.0% | avg:  -nan | abs dev: -nan | std dev: -nan =  -nan%)
 98 | 
 99 | [01]  *** Core  1 ***************************************************************
100 | 
101 |  ---- statistics:
102 | [01]     avg : 112.3 abs dev : 2.5        std dev : 5.4        num     : 1000
103 | [01]     min : 10.0  (element:    902)    max     : 133.0      (element:    404)
104 | [01]   0-10% : 989 ( 98.9% | avg: 112.4 | abs dev:  2.2 | std dev:  2.9 =   2.6%)
105 | [01]  10-25% : 9   (  0.9% | avg: 126.0 | abs dev:  1.8 | std dev:  2.7 =   2.1%)
106 | [01]  25-50% : 0   (  0.0% | avg:  -nan | abs dev: -nan | std dev: -nan =  -nan%)
107 | [01]  50-75% : 0   (  0.0% | avg:  -nan | abs dev: -nan | std dev: -nan =  -nan%)
108 | [01] 75-100% : 2   (  0.2% | avg:  13.5 | abs dev:  3.5 | std dev:  3.5 =  25.9%)
109 | 
110 | #>> The meaning of the results
111 | 
112 | [00] Results Core 0 : store to owned mine (if owned state supported, else exclusive)
113 | [00] Results Core 1 : load from modified (makes it owned, if owned state supported)
114 | 
115 | #>> The final val in the cache line that was used / the sum of all loads on this core
116 | #>> These values can be used for ensuring the correctness of some test (e.g., FAI)
117 | 
118 | [00]  value of cl is 0    / sum is 0
119 | [01]  value of cl is 0    / sum is 0
120 | 
121 | -------------------------------------------------------------------------------- /include/atomic_ops.h: -------------------------------------------------------------------------------- 1 | /* 2 | * File: atomic_ops.h 3 | * Author: Tudor David 4 | * Description: cross-platform interface to atomic operations 5 | * atomic_ops is part of SSYNC 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Tudor David 10 | * 11 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | * 28 | */ 29 | 30 | #ifndef _ATOMIC_OPS_H_INCLUDED_ 31 | #define _ATOMIC_OPS_H_INCLUDED_ 32 | 33 | #include 34 | 35 | #ifdef __sparc__ 36 | /* 37 | * sparc code 38 | */ 39 | 40 | # include 41 | 42 | //test-and-set uint8_t 43 | static inline uint8_t tas_uint8(volatile uint8_t *addr) { 44 | uint8_t oldval; 45 | __asm__ __volatile__("ldstub %1,%0" 46 | : "=r"(oldval), "=m"(*addr) 47 | : "m"(*addr) : "memory"); 48 | return oldval; 49 | } 50 | 51 | //Compare-and-swap 52 | # define CAS_PTR(a,b,c) atomic_cas_ptr(a,b,c) 53 | # define CAS_U8(a,b,c) atomic_cas_8(a,b,c) 54 | # define CAS_U16(a,b,c) atomic_cas_16(a,b,c) 55 | # define CAS_U32(a,b,c) atomic_cas_32(a,b,c) 56 | # define CAS_U64(a,b,c) atomic_cas_64(a,b,c) 57 | //Swap 58 | # define SWAP_PTR(a,b) atomic_swap_ptr(a,b) 59 | # define SWAP_U8(a,b) atomic_swap_8(a,b) 60 | # define SWAP_U16(a,b) atomic_swap_16(a,b) 61 | # define SWAP_U32(a,b) atomic_swap_32(a,b) 62 | # define SWAP_U64(a,b) atomic_swap_64(a,b) 63 | //Fetch-and-increment 64 | # define FAI_U8(a) (atomic_inc_8_nv(a)-1) 65 | # define FAI_U16(a) (atomic_inc_16_nv(a)-1) 66 | # define FAI_U32(a) (atomic_inc_32_nv(a)-1) 67 | # define FAI_U64(a) (atomic_inc_64_nv(a)-1) 68 | //Fetch-and-decrement 69 | # define FAD_U8(a) (atomic_dec_8_nv(a,)+1) 70 | # define FAD_U16(a) (atomic_dec_16_nv(a)+1) 71 | # define FAD_U32(a) (atomic_dec_32_nv(a)+1) 72 | # define FAD_U64(a) (atomic_dec_64_nv(a)+1) 73 | //Increment-and-fetch 74 | # define IAF_U8(a) atomic_inc_8_nv(a) 75 | # define IAF_U16(a) atomic_inc_16_nv(a) 76 | # define IAF_U32(a) atomic_inc_32_nv(a) 77 | # define IAF_U64(a) atomic_inc_64_nv(a) 78 | //Decrement-and-fetch 79 | # define DAF_U8(a) atomic_dec_8_nv(a) 80 | # define DAF_U16(a) atomic_dec_16_nv(a) 81 | # define DAF_U32(a) atomic_dec_32_nv(a) 82 | # define DAF_U64(a) atomic_dec_64_nv(a) 83 | //Test-and-set 84 | # define TAS_U8(a) tas_uint8(a) 85 | //Memory barrier 86 | # define MEM_BARRIER asm volatile("membar #LoadLoad | #LoadStore | #StoreLoad | 
#StoreStore"); 87 | # define _mm_lfence() asm volatile("membar #LoadLoad | #LoadStore"); 88 | # define _mm_sfence() asm volatile("membar #StoreLoad | #StoreStore"); 89 | # define _mm_mfence() asm volatile("membar #LoadLoad | #LoadStore | #StoreLoad | #StoreStore"); 90 | 91 | # define _mm_clflush(x) asm volatile("nop"); 92 | //end of sparc code 93 | #elif defined(__tile__) 94 | /* 95 | * Tilera code 96 | */ 97 | # include 98 | # include 99 | //atomic operations interface 100 | //Compare-and-swap 101 | # define CAS_PTR(a,b,c) arch_atomic_val_compare_and_exchange(a,b,c) 102 | # define CAS_U8(a,b,c) arch_atomic_val_compare_and_exchange(a,b,c) 103 | # define CAS_U16(a,b,c) arch_atomic_val_compare_and_exchange(a,b,c) 104 | # define CAS_U32(a,b,c) arch_atomic_val_compare_and_exchange(a,b,c) 105 | # define CAS_U64(a,b,c) arch_atomic_val_compare_and_exchange(a,b,c) 106 | //Swap 107 | # define SWAP_PTR(a,b) arch_atomic_exchange(a,b) 108 | # define SWAP_U8(a,b) arch_atomic_exchange(a,b) 109 | # define SWAP_U16(a,b) arch_atomic_exchange(a,b) 110 | # define SWAP_U32(a,b) arch_atomic_exchange(a,b) 111 | # define SWAP_U64(a,b) arch_atomic_exchange(a,b) 112 | //Fetch-and-increment 113 | # define FAI_U8(a) arch_atomic_increment(a) 114 | # define FAI_U16(a) arch_atomic_increment(a) 115 | # define FAI_U32(a) arch_atomic_increment(a) 116 | # define FAI_U64(a) arch_atomic_increment(a) 117 | //Fetch-and-decrement 118 | # define FAD_U8(a) arch_atomic_decrement(a) 119 | # define FAD_U16(a) arch_atomic_decrement(a) 120 | # define FAD_U32(a) arch_atomic_decrement(a) 121 | # define FAD_U64(a) arch_atomic_decrement(a) 122 | //Increment-and-fetch 123 | # define IAF_U8(a) (arch_atomic_increment(a)+1) 124 | # define IAF_U16(a) (arch_atomic_increment(a)+1) 125 | # define IAF_U32(a) (arch_atomic_increment(a)+1) 126 | # define IAF_U64(a) (arch_atomic_increment(a)+1) 127 | //Decrement-and-fetch 128 | # define DAF_U8(a) (arch_atomic_decrement(a)-1) 129 | # define DAF_U16(a) 
(arch_atomic_decrement(a)-1) 130 | # define DAF_U32(a) (arch_atomic_decrement(a)-1) 131 | # define DAF_U64(a) (arch_atomic_decrement(a)-1) 132 | //Test-and-set 133 | # define TAS_U8(a) arch_atomic_val_compare_and_exchange(a,0,0xff) 134 | //Memory barrier 135 | # define MEM_BARRIER arch_atomic_full_barrier() 136 | 137 | # define _mm_lfence() arch_atomic_read_barrier() 138 | # define _mm_sfence() arch_atomic_write_barrier() 139 | # define _mm_mfence() arch_atomic_full_barrier() 140 | 141 | # define _mm_clflush(x) tmc_mem_finv_no_fence((const void*) x, 64); 142 | 143 | //Relax CPU 144 | //define PAUSE cycle_relax() 145 | 146 | //end of tilera code 147 | #else 148 | 149 | /* 150 | * x86 code 151 | */ 152 | 153 | # if defined(__SSE__) 154 | # include 155 | # else 156 | # define _mm_lfence() asm volatile ("lfence" : :) 157 | # define _mm_sfence() asm volatile ("sfence" : :) 158 | # define _mm_mfence() asm volatile ("mfence" : :) 159 | # define _mm_pause() asm volatile ("rep; nop" : : ) 160 | # define _mm_clflush(__A) asm volatile("clflush %0" : "+m" (*(volatile char*)__A)) 161 | # endif 162 | 163 | //Swap pointers 164 | static inline void* swap_pointer(volatile void* ptr, void *x) { 165 | # ifdef __i386__ 166 | __asm__ __volatile__("xchgl %0,%1" 167 | :"=r" ((unsigned) x) 168 | :"m" (*(volatile unsigned *)ptr), "0" (x) 169 | :"memory"); 170 | 171 | return x; 172 | # elif defined(__x86_64__) 173 | __asm__ __volatile__("xchgq %0,%1" 174 | :"=r" ((unsigned long long) x) 175 | :"m" (*(volatile long long *)ptr), "0" ((unsigned long long) x) 176 | :"memory"); 177 | 178 | return x; 179 | # endif 180 | } 181 | 182 | //Swap uint64_t 183 | static inline uint64_t swap_uint64(volatile uint64_t* target, uint64_t x) { 184 | __asm__ __volatile__("xchgq %0,%1" 185 | :"=r" ((uint64_t) x) 186 | :"m" (*(volatile uint64_t *)target), "0" ((uint64_t) x) 187 | :"memory"); 188 | 189 | return x; 190 | } 191 | 192 | //Swap uint32_t 193 | static inline uint32_t swap_uint32(volatile uint32_t* 
target, uint32_t x) { 194 | __asm__ __volatile__("xchgl %0,%1" 195 | :"=r" ((uint32_t) x) 196 | :"m" (*(volatile uint32_t *)target), "0" ((uint32_t) x) 197 | :"memory"); 198 | 199 | return x; 200 | } 201 | 202 | //Swap uint16_t 203 | static inline uint16_t swap_uint16(volatile uint16_t* target, uint16_t x) { 204 | __asm__ __volatile__("xchgw %0,%1" 205 | :"=r" ((uint16_t) x) 206 | :"m" (*(volatile uint16_t *)target), "0" ((uint16_t) x) 207 | :"memory"); 208 | 209 | return x; 210 | } 211 | 212 | //Swap uint8_t 213 | static inline uint8_t swap_uint8(volatile uint8_t* target, uint8_t x) { 214 | __asm__ __volatile__("xchgb %0,%1" 215 | :"=r" ((uint8_t) x) 216 | :"m" (*(volatile uint8_t *)target), "0" ((uint8_t) x) 217 | :"memory"); 218 | 219 | return x; 220 | } 221 | 222 | //test-and-set uint8_t 223 | static inline uint8_t tas_uint8(volatile uint8_t *addr) { 224 | uint8_t oldval; 225 | __asm__ __volatile__("xchgb %0,%1" 226 | : "=q"(oldval), "=m"(*addr) 227 | : "0"((unsigned char) 0xff), "m"(*addr) : "memory"); 228 | return (uint8_t) oldval; 229 | } 230 | 231 | //atomic operations interface 232 | //Compare-and-swap 233 | # define CAS_PTR(a,b,c) __sync_val_compare_and_swap(a,b,c) 234 | # define CAS_U8(a,b,c) __sync_val_compare_and_swap(a,b,c) 235 | # define CAS_U16(a,b,c) __sync_val_compare_and_swap(a,b,c) 236 | # define CAS_U32(a,b,c) __sync_val_compare_and_swap(a,b,c) 237 | # define CAS_U64(a,b,c) __sync_val_compare_and_swap(a,b,c) 238 | //Swap 239 | # define SWAP_PTR(a,b) swap_pointer(a,b) 240 | # define SWAP_U8(a,b) swap_uint8(a,b) 241 | # define SWAP_U16(a,b) swap_uint16(a,b) 242 | # define SWAP_U32(a,b) swap_uint32(a,b) 243 | # define SWAP_U64(a,b) swap_uint64(a,b) 244 | //Fetch-and-increment 245 | # define FAI_U8(a) __sync_fetch_and_add(a,1) 246 | # define FAI_U16(a) __sync_fetch_and_add(a,1) 247 | # define FAI_U32(a) __sync_fetch_and_add(a,1) 248 | # define FAI_U64(a) __sync_fetch_and_add(a,1) 249 | //Fetch-and-decrement 250 | # define FAD_U8(a) 
__sync_fetch_and_sub(a,1) 251 | # define FAD_U16(a) __sync_fetch_and_sub(a,1) 252 | # define FAD_U32(a) __sync_fetch_and_sub(a,1) 253 | # define FAD_U64(a) __sync_fetch_and_sub(a,1) 254 | //Increment-and-fetch 255 | # define IAF_U8(a) __sync_add_and_fetch(a,1) 256 | # define IAF_U16(a) __sync_add_and_fetch(a,1) 257 | # define IAF_U32(a) __sync_add_and_fetch(a,1) 258 | # define IAF_U64(a) __sync_add_and_fetch(a,1) 259 | //Decrement-and-fetch 260 | # define DAF_U8(a) __sync_sub_and_fetch(a,1) 261 | # define DAF_U16(a) __sync_sub_and_fetch(a,1) 262 | # define DAF_U32(a) __sync_sub_and_fetch(a,1) 263 | # define DAF_U64(a) __sync_sub_and_fetch(a,1) 264 | //Test-and-set 265 | # define TAS_U8(a) tas_uint8(a) 266 | //Memory barrier 267 | # define MEM_BARRIER __sync_synchronize() 268 | //Relax CPU 269 | //#define PAUSE _mm_pause() 270 | 271 | /*End of x86 code*/ 272 | #endif 273 | 274 | 275 | #endif 276 | 277 | 278 | 279 | -------------------------------------------------------------------------------- /include/barrier.h: -------------------------------------------------------------------------------- 1 | /* 2 | * File: barrier.h 3 | * Author: Vasileios Trigonakis 4 | * Description: barrier structures 5 | * barrier.h is part of ccbench 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Vasileios Trigonakis 10 | * 11 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 
20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | * 28 | */ 29 | 30 | #ifndef BARRIER_H 31 | #define BARRIER_H 32 | 33 | #include "common.h" 34 | #include "atomic_ops.h" 35 | #ifdef __sparc__ 36 | # include 37 | # include 38 | # include 39 | #endif /* __sparc */ 40 | 41 | #define NUM_BARRIERS 16 42 | #define BARRIER_MEM_FILE "/barrier_mem" 43 | 44 | #ifndef ALIGNED 45 | # if __GNUC__ && !SCC 46 | # define ALIGNED(N) __attribute__ ((aligned (N))) 47 | # else 48 | # define ALIGNED(N) 49 | # endif 50 | #endif 51 | 52 | /*barrier type*/ 53 | typedef ALIGNED(64) struct barrier 54 | { 55 | uint64_t num_participants; 56 | volatile uint64_t num_crossing1; 57 | volatile uint64_t num_crossing2; 58 | volatile uint64_t num_crossing3; 59 | int (*color)(int); /*or color function: if return 0 -> no , 1 -> participant. 
Priority on this */ 60 | } barrier_t; 61 | 62 | 63 | void barriers_init(const uint32_t num_procs); 64 | void barrier_init(const uint32_t barrier_num, const uint64_t participants, int (*color)(int), const uint32_t); 65 | void barrier_wait(const uint32_t barrier_num, const uint32_t id, const uint32_t total_cores); 66 | void barriers_term(); 67 | 68 | #ifdef __sparc__ 69 | # define PAUSE() asm volatile("rd %%ccr, %%g0\n\t" \ 70 | ::: "memory") 71 | #elif defined(__tile__) 72 | #define PAUSE() cycle_relax() 73 | #else 74 | #define PAUSE() _mm_pause() 75 | #endif 76 | 77 | #endif /* BARRIER_H */ 78 | -------------------------------------------------------------------------------- /include/ccbench.h: -------------------------------------------------------------------------------- 1 | /* 2 | * File: ccbench.h 3 | * Author: Vasileios Trigonakis 4 | * Description: definition of ccbench events and help functions 5 | * ccbench.h is part of ccbench 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Vasileios Trigonakis 10 | * 11 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | * 28 | */ 29 | 30 | #ifndef _H_CCBENCH_ 31 | #define _H_CCBENCH_ 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | 53 | #if defined(__amd64__) 54 | # include 55 | #elif defined(__tile__) 56 | # include 57 | # include 58 | # include 59 | extern cpu_set_t cpus; 60 | #endif 61 | 62 | #if defined(PLATFORM_NUMA) 63 | # include 64 | #endif /* PLATFORM_NUMA */ 65 | 66 | #include "common.h" 67 | #include "pfd.h" 68 | #include "barrier.h" 69 | 70 | typedef struct cache_line 71 | { 72 | volatile uint32_t word[16]; 73 | } cache_line_t; 74 | 75 | #define CACHE_LINE_NUM 1024*1024 /* power of 2 pls */ 76 | #define CACHE_LINE_STRIDE_2 2047 77 | 78 | # define LLU unsigned long long int 79 | 80 | extern volatile cache_line_t* cache_line_open(); 81 | extern void cache_line_close(const uint32_t id, const char* name); 82 | 83 | typedef enum 84 | { 85 | STORE_ON_MODIFIED, 86 | STORE_ON_MODIFIED_NO_SYNC, 87 | STORE_ON_EXCLUSIVE, 88 | STORE_ON_SHARED, 89 | STORE_ON_OWNED_MINE, 90 | STORE_ON_OWNED, 91 | STORE_ON_INVALID, 92 | LOAD_FROM_MODIFIED, 93 | LOAD_FROM_EXCLUSIVE, 94 | LOAD_FROM_SHARED, 95 | LOAD_FROM_OWNED, 96 | LOAD_FROM_INVALID, 97 | CAS, 98 | FAI, 99 | TAS, 100 | SWAP, 101 | CAS_ON_MODIFIED, 102 | FAI_ON_MODIFIED, 103 | TAS_ON_MODIFIED, 104 | SWAP_ON_MODIFIED, 105 | CAS_ON_SHARED, 106 | FAI_ON_SHARED, 107 | TAS_ON_SHARED, 108 | SWAP_ON_SHARED, 109 | CAS_CONCURRENT, 110 | FAI_ON_INVALID, 111 | LOAD_FROM_L1, 112 | LOAD_FROM_MEM_SIZE, 113 | LFENCE, 114 | SFENCE, 115 | MFENCE, 116 
| PROFILER, 117 | PAUSE, 118 | NOP, 119 | NUM_EVENTS, /* placeholder for printing the num of events */ 120 | } moesi_type_t; 121 | 122 | const char* moesi_type_des[] = 123 | { 124 | "STORE_ON_MODIFIED", 125 | "STORE_ON_MODIFIED_NO_SYNC", 126 | "STORE_ON_EXCLUSIVE", 127 | "STORE_ON_SHARED", 128 | "STORE_ON_OWNED_MINE", 129 | "STORE_ON_OWNED", 130 | "STORE_ON_INVALID", 131 | "LOAD_FROM_MODIFIED", 132 | "LOAD_FROM_EXCLUSIVE", 133 | "LOAD_FROM_SHARED", 134 | "LOAD_FROM_OWNED", 135 | "LOAD_FROM_INVALID", 136 | "CAS", 137 | "FAI", 138 | "TAS", 139 | "SWAP", 140 | "CAS_ON_MODIFIED", 141 | "FAI_ON_MODIFIED", 142 | "TAS_ON_MODIFIED", 143 | "SWAP_ON_MODIFIED", 144 | "CAS_ON_SHARED", 145 | "FAI_ON_SHARED", 146 | "TAS_ON_SHARED", 147 | "SWAP_ON_SHARED", 148 | "CAS_CONCURRENT", 149 | "FAI_ON_INVALID", 150 | "LOAD_FROM_L1", 151 | "LOAD_FROM_MEM_SIZE", 152 | "LFENCE", 153 | "SFENCE", 154 | "MFENCE", 155 | "PROFILER", 156 | "PAUSE", 157 | "NOP", 158 | }; 159 | 160 | 161 | #define DEFAULT_CORES 2 162 | #define DEFAULT_REPS 10000 163 | #define DEFAULT_TEST 0 164 | #define DEFAULT_CORE1 0 165 | #define DEFAULT_CORE2 1 166 | #define DEFAULT_CORE3 2 167 | #define DEFAULT_CORE_OTHERS 0 168 | #define DEFAULT_FLUSH 0 169 | #define DEFAULT_VERBOSE 0 170 | #define DEFAULT_PRINT 100 171 | #define DEFAULT_STRIDE (CACHE_LINE_STRIDE_2 + 1) 172 | #define DEFAULT_FENCE 0 173 | #define DEFAULT_LFENCE 0 174 | #define DEFAULT_SFENCE 0 175 | #define DEFAULT_AO_SUCCESS 0 176 | 177 | 178 | #define CACHE_LINE_MEM_FILE "/cache_line" 179 | 180 | #define B0 _mm_mfence(); barrier_wait(0, ID, test_cores); _mm_mfence(); 181 | #define B1 _mm_mfence(); barrier_wait(2, ID, test_cores); _mm_mfence(); 182 | #define B2 _mm_mfence(); barrier_wait(3, ID, test_cores); _mm_mfence(); 183 | #define B3 _mm_mfence(); barrier_wait(4, ID, test_cores); _mm_mfence(); 184 | #define B4 _mm_mfence(); barrier_wait(5, ID, test_cores); _mm_mfence(); 185 | #define B5 _mm_mfence(); barrier_wait(6, ID, test_cores); _mm_mfence(); 186 | 
#define B6 _mm_mfence(); barrier_wait(7, ID, test_cores); _mm_mfence(); 187 | #define B7 _mm_mfence(); barrier_wait(8, ID, test_cores); _mm_mfence(); 188 | #define B8 _mm_mfence(); barrier_wait(9, ID, test_cores); _mm_mfence(); 189 | #define B9 _mm_mfence(); barrier_wait(10, ID, test_cores); _mm_mfence(); 190 | #define B10 _mm_mfence(); barrier_wait(11, ID, test_cores); _mm_mfence(); 191 | #define B11 _mm_mfence(); barrier_wait(12, ID, test_cores); _mm_mfence(); 192 | #define B12 _mm_mfence(); barrier_wait(13, ID, test_cores); _mm_mfence(); 193 | #define B13 _mm_mfence(); barrier_wait(14, ID, test_cores); _mm_mfence(); 194 | #define B14 _mm_mfence(); barrier_wait(15, ID, test_cores); _mm_mfence(); 195 | 196 | #define XSTR(s) STR(s) 197 | #define STR(s) #s 198 | 199 | #ifndef ALIGNED 200 | # if __GNUC__ && !SCC 201 | # define ALIGNED(N) __attribute__ ((aligned (N))) 202 | # else 203 | # define ALIGNED(N) 204 | # endif 205 | #endif 206 | 207 | inline void 208 | set_cpu(int cpu) 209 | { 210 | #if defined(__sparc__) 211 | processor_bind(P_LWPID,P_MYID, cpu, NULL); 212 | #elif defined(__tile__) 213 | if (tmc_cpus_set_my_cpu(tmc_cpus_find_nth_cpu(&cpus, cpu)) < 0) 214 | { 215 | tmc_task_die("Failure in 'tmc_cpus_set_my_cpu()'."); 216 | } 217 | 218 | if (cpu != tmc_cpus_get_my_cpu()) 219 | { 220 | PRINT("******* i am not CPU %d", tmc_cpus_get_my_cpu()); 221 | } 222 | 223 | #else 224 | cpu_set_t mask; 225 | CPU_ZERO(&mask); 226 | CPU_SET(cpu, &mask); 227 | if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) != 0) { 228 | printf("Problem with setting processor affinity: %s\n", 229 | strerror(errno)); 230 | exit(3); 231 | } 232 | #endif 233 | 234 | #ifdef OPTERON 235 | uint32_t numa_node = cpu/6; 236 | numa_set_preferred(numa_node); 237 | #elif defined(XEON) 238 | uint32_t numa_node = 0; 239 | if (cpu == 0) 240 | { 241 | numa_node = 4; 242 | } 243 | else if (cpu <= 40) 244 | { 245 | numa_node = (cpu - 1) / 10; 246 | } 247 | else 248 | { 249 | numa_node = cpu / 10; 250 | } 251 
| numa_set_preferred(numa_node); 252 | #elif defined(PLATFORM_NUMA) 253 | printf("* You need to define how cores correspond to mem nodes in ccbench.h\n"); 254 | #endif 255 | 256 | } 257 | 258 | inline void 259 | wait_cycles(volatile uint64_t cycles) 260 | { 261 | /* cycles >>= 1; */ 262 | for (cycles; cycles > 0; cycles--) 263 | { 264 | asm volatile ("nop"); 265 | } 266 | } 267 | 268 | /* getticks needs to have a correction because the call itself takes a */ 269 | /* significant number of cycles and skewes the measurement */ 270 | static inline ticks getticks_correction_calc() 271 | { 272 | #define GETTICKS_CALC_REPS 1000000 273 | ticks t_dur = 0; 274 | uint32_t i; 275 | for (i = 0; i < GETTICKS_CALC_REPS; i++) 276 | { 277 | ticks t_start = getticks(); 278 | ticks t_end = getticks(); 279 | t_dur += t_end - t_start; 280 | } 281 | // printf("corr in float %f\n", (t_dur / (double) GETTICKS_CALC_REPS)); 282 | ticks getticks_correction = (ticks)(t_dur / (double) GETTICKS_CALC_REPS); 283 | return getticks_correction; 284 | } 285 | 286 | #define IN_ORDER(id, num_cores) \ 287 | { \ 288 | B0; \ 289 | uint32_t c; \ 290 | for (c = 0; c < num_cores; c++) \ 291 | { \ 292 | if (id == c) \ 293 | { 294 | 295 | #define IN_ORDER_END \ 296 | } \ 297 | B0; \ 298 | } \ 299 | } 300 | 301 | 302 | static inline unsigned long* 303 | seed_rand() 304 | { 305 | unsigned long* seeds; 306 | seeds = (unsigned long*) malloc(3 * sizeof(unsigned long)); 307 | seeds[0] = getticks() % 123456789; 308 | seeds[1] = getticks() % 362436069; 309 | seeds[2] = getticks() % 521288629; 310 | return seeds; 311 | } 312 | 313 | extern unsigned long* seeds; 314 | //Marsaglia's xorshf generator //period 2^96-1 315 | static inline unsigned long 316 | xorshf96(unsigned long* x, unsigned long* y, unsigned long* z) 317 | { 318 | unsigned long t; 319 | (*x) ^= (*x) << 16; 320 | (*x) ^= (*x) >> 5; 321 | (*x) ^= (*x) << 1; 322 | 323 | t = *x; 324 | (*x) = *y; 325 | (*y) = *z; 326 | (*z) = t ^ (*x) ^ (*y); 327 | 328 | 
return *z; 329 | } 330 | #define clrand() (xorshf96(seeds, seeds + 1, seeds + 2) & (test_stride - 1)) 331 | #define sirand(range) ((xorshf96(seeds, seeds + 1, seeds + 2) % range) + 64) 332 | #define my_random(a, b, c) xorshf96(a, b, c) 333 | 334 | static inline uint32_t pow2roundup (uint32_t x) 335 | { 336 | if (x==0) return 1; 337 | --x; 338 | x |= x >> 1; 339 | x |= x >> 2; 340 | x |= x >> 4; 341 | x |= x >> 8; 342 | x |= x >> 16; 343 | return x+1; 344 | } 345 | #endif /* _H_CCBENCH_ */ 346 | -------------------------------------------------------------------------------- /include/common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * File: common.h 3 | * Author: Vasileios Trigonakis 4 | * Description: helper macros 5 | * common.h is part of ccbench 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Vasileios Trigonakis 10 | * 11 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | * 28 | */ 29 | 30 | #ifndef _COMMON_H_ 31 | #define _COMMON_H_ 32 | 33 | #include 34 | #include 35 | 36 | #define XSTR(s) STR(s) 37 | #define STR(s) #s 38 | 39 | #define P(args...) printf("[%02d] ", ID); printf(args); printf("\n"); fflush(stdout) 40 | #define PRINT P 41 | 42 | extern uint8_t ID; 43 | #endif 44 | -------------------------------------------------------------------------------- /include/pfd.h: -------------------------------------------------------------------------------- 1 | /* 2 | * File: pfd.h 3 | * Author: Vasileios Trigonakis 4 | * Description: pfd interface, structures, and helper functions 5 | * pfd.h is part of ccbench 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Vasileios Trigonakis 10 | * 11 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | * 28 | */ 29 | 30 | #ifndef _PFD_H_ 31 | #define _PFD_H_ 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include "common.h" 38 | 39 | 40 | typedef uint64_t ticks; 41 | 42 | #if defined(__i386__) 43 | static inline ticks 44 | getticks(void) 45 | { 46 | ticks ret; 47 | 48 | __asm__ __volatile__("rdtsc" : "=A" (ret)); 49 | return ret; 50 | } 51 | #elif defined(__x86_64__) 52 | static inline ticks 53 | getticks(void) 54 | { 55 | unsigned hi, lo; 56 | __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); 57 | return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); 58 | } 59 | #elif defined(__sparc__) 60 | static inline ticks 61 | getticks() 62 | { 63 | ticks ret; 64 | __asm__ __volatile__ ("rd %%tick, %0" : "=r" (ret) : "0" (ret)); 65 | return ret; 66 | } 67 | #elif defined(__tile__) 68 | #include 69 | static inline ticks getticks() 70 | { 71 | return get_cycle_count(); 72 | } 73 | #endif 74 | 75 | 76 | #define DO_TIMINGS 77 | 78 | #if !defined(PREFETCHW) 79 | # if defined(__x86_64__) | defined(__i386__) 80 | # define PREFETCHW(x) asm volatile("prefetchw %0" :: "m" (*(unsigned long *)x)) /* write */ 81 | # elif defined(__sparc__) 82 | # define PREFETCHW(x) __builtin_prefetch((const void*) x, 1, 3) 83 | # elif defined(__tile__) 84 | # define PREFETCHW(x) tmc_mem_prefetch (x, 64) 85 | # else 86 | # warning "You need to define PREFETCHW(x) for your architecture" 87 | # endif 88 | #endif 89 | 90 | typedef struct abs_deviation 91 | { 92 | uint64_t num_vals; 93 | double avg; 94 | double avg_10p; 95 | double avg_25p; 96 | double avg_50p; 97 | double avg_75p; 98 | double avg_rst; 99 | double abs_dev_10p; 100 | double abs_dev_25p; 101 | double abs_dev_50p; 102 | double 
abs_dev_75p; 103 | double abs_dev_rst; 104 | double abs_dev; 105 | double std_dev_10p; 106 | double std_dev_25p; 107 | double std_dev_50p; 108 | double std_dev_75p; 109 | double std_dev_rst; 110 | double std_dev; 111 | double min_val; 112 | uint64_t min_val_idx; 113 | double max_val; 114 | uint64_t max_val_idx; 115 | uint32_t num_dev_10p; 116 | uint32_t num_dev_25p; 117 | uint32_t num_dev_50p; 118 | uint32_t num_dev_75p; 119 | uint32_t num_dev_rst; 120 | } abs_deviation_t; 121 | 122 | 123 | #define PFD_NUM_STORES 2 124 | #define PFD_PRINT_MAX 200 125 | 126 | extern volatile ticks** pfd_store; 127 | extern volatile ticks* _pfd_s; 128 | extern volatile ticks pfd_correction; 129 | #if !defined(DO_TIMINGS) 130 | # define PFDINIT(num_entries) 131 | # define PFDI(store) 132 | # define PFDO(store, entry) 133 | # define PFDP(store, num_vals) 134 | # define PFDPN(store, num_vals, num_print) 135 | #else /* DO_TIMINGS */ 136 | # define PFDINIT(num_entries) pfd_store_init(num_entries) 137 | 138 | # define PFDI(store) \ 139 | { \ 140 | asm volatile (""); \ 141 | _pfd_s[store] = getticks(); 142 | 143 | 144 | # define PFDO(store, entry) \ 145 | asm volatile (""); \ 146 | pfd_store[store][entry] = getticks() - _pfd_s[store] - pfd_correction; \ 147 | } 148 | 149 | # define PFDOR(store, entry, reps) \ 150 | asm volatile (""); \ 151 | volatile ticks __t = getticks(); \ 152 | pfd_store[store][entry] = (__t - _pfd_s[store] - pfd_correction) / \ 153 | reps; \ 154 | } 155 | 156 | # define PFDPN(store, num_vals, num_print) \ 157 | { \ 158 | uint32_t _i; \ 159 | uint32_t p = num_print; \ 160 | if (p > num_vals) { p = num_vals; } \ 161 | for (_i = 0; _i < p; _i++) \ 162 | { \ 163 | printf("[%3d: %4ld] ", _i, (long int) pfd_store[store][_i]); \ 164 | } \ 165 | abs_deviation_t ad; \ 166 | get_abs_deviation(pfd_store[store], num_vals, &ad); \ 167 | print_abs_deviation(&ad); \ 168 | } 169 | #endif /* !DO_TIMINGS */ 170 | 171 | # define PFDPREFTCH(store, entry) \ 172 | PFDI(store); \ 173 | 
PFDO(store, entry); 174 | 175 | 176 | 177 | void pfd_store_init(const uint32_t num_entries); 178 | void get_abs_deviation(volatile ticks* vals, const size_t num_vals, abs_deviation_t* abs_dev); 179 | void print_abs_deviation(const abs_deviation_t* abs_dev); 180 | 181 | 182 | #endif /* _PFD_H_ */ 183 | -------------------------------------------------------------------------------- /scripts/events_all: -------------------------------------------------------------------------------- 1 | cache-references,cache-misses,L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores,L1-dcache-store-misses,L1-dcache-prefetches,L1-dcache-prefetch-misses,L1-icache-loads,L1-icache-load-misses,L1-icache-prefetches,L1-icache-prefetch-misses,LLC-loads,LLC-load-misses,LLC-stores,LLC-store-misses,LLC-prefetches,LLC-prefetch-misses -------------------------------------------------------------------------------- /scripts/run_niagara.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #!/bin/bash 3 | 4 | ./ccbench $@; 5 | read; 6 | ./ccbench $@ -y8; 7 | -------------------------------------------------------------------------------- /scripts/run_opteron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./ccbench $@ ; 4 | read; 5 | ./ccbench $@ -y6; 6 | read; 7 | ./ccbench $@ -y12; 8 | read; 9 | ./ccbench $@ -y18; 10 | -------------------------------------------------------------------------------- /scripts/run_tilera.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./run ./ccbench $@ ; 4 | read; 5 | ./run ./ccbench $@ -y35; 6 | -------------------------------------------------------------------------------- /scripts/run_with_confidence.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | conf=$1; 4 | shift; 5 | run=$@; 6 | 7 | tmp="conf.run.tmp"; 8 | 9 | 10 | if [ $# 
-lt 1 ]; 11 | then 12 | echo "Usage: $0 TARGET_CONFIDENCE ./ccbench [PARAMETERS]"; 13 | echo " runs ccbench until there is a run with a clustering"; 14 | echo " around the avg that has >= TARGET_CONFIDENCE percentage"; 15 | echo " of the total samples. The TARGET_CONFIDENCE is decreased"; 16 | echo " by D after F failed attempts (D, F defined in the script)"; 17 | exit; 18 | fi; 19 | 20 | tries_fail=3; 21 | reduce_on_fail=1; 22 | 23 | echo " ** Confidence lvl: $conf"; 24 | 25 | tries=1; 26 | while : 27 | do 28 | ./$run > $tmp; 29 | res=$(cut -d'(' -f2 $tmp | gawk -v c=$conf '/% \|/ { if ($1+0 > c) print $1" --("$0 }'); 30 | 31 | if [ "$res" ]; 32 | then 33 | cat $tmp; 34 | echo " ** in # tries: $tries"; 35 | break; 36 | fi; 37 | 38 | tries=$((tries+1)); 39 | if [ $tries -gt $tries_fail ]; 40 | then 41 | conf=$((conf-reduce_on_fail)); 42 | tries=1; 43 | echo " ** Failed after $tries_fail tries. New confidence lvl: $conf" 44 | fi; 45 | done; 46 | 47 | if [ -f $tmp ]; 48 | then 49 | rm $tmp; 50 | fi; 51 | -------------------------------------------------------------------------------- /scripts/run_xeon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./ccbench $@ -x3; 4 | read; 5 | ./ccbench $@ -y11 -x3; 6 | read; 7 | ./ccbench $@ -y41 -x3; 8 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | /#pfd.c# 2 | /.#pfd.c 3 | -------------------------------------------------------------------------------- /src/barrier.c: -------------------------------------------------------------------------------- 1 | /* 2 | * File: barrier.c 3 | * Author: Vasileios Trigonakis 4 | * Description: implementation of process barriers 5 | * barrier.c is part of ccbench 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Vasileios Trigonakis 10 | * 11 | * Permission is hereby granted, free of 
charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | * 28 | */ 29 | 30 | #include "barrier.h" 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | 43 | #ifdef __sparc__ 44 | # include 45 | # include 46 | # include 47 | #endif /* __sparc__ */ 48 | 49 | barrier_t* barriers; 50 | 51 | 52 | int color_all(int id) 53 | { 54 | return 1; 55 | } 56 | 57 | void 58 | barriers_init(const uint32_t num_procs) 59 | { 60 | uint32_t size; 61 | size = NUM_BARRIERS * sizeof(barrier_t); 62 | if (size < 8192) 63 | { 64 | size = 8192; 65 | } 66 | 67 | char keyF[100]; 68 | sprintf(keyF, BARRIER_MEM_FILE); 69 | 70 | int barrierfd = shm_open(keyF, O_CREAT | O_EXCL | O_RDWR, S_IRWXU | S_IRWXG); 71 | if (barrierfd<0) 72 | { 73 | if (errno != EEXIST) 74 | { 75 | perror("In shm_open"); 76 | exit(1); 77 | } 78 | 79 | //this time it is ok if it already exists 80 | barrierfd = shm_open(keyF, O_CREAT | O_RDWR, S_IRWXU | S_IRWXG); 81 | if (barrierfd<0) 82 | { 83 | perror("In shm_open"); 84 | exit(1); 85 | } 86 | } 87 | else 88 | { 89 | if (ftruncate(barrierfd, size) < 0) { 90 | perror("ftruncate failed\n"); 91 | exit(1); 92 | } 93 | } 94 | 95 | void* mem = (void*) mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, barrierfd, 0); 96 | if (mem == NULL) 97 | { 98 | perror("ssmp_mem = NULL\n"); 99 | exit(134); 100 | } 101 | 102 | barriers = (barrier_t*) mem; 103 | 104 | uint32_t bar; 105 | for (bar = 0; bar < NUM_BARRIERS; bar++) 106 | { 107 | barrier_init(bar, 0, color_all, num_procs); 108 | } 109 | } 110 | 111 | void 112 | barrier_init(const uint32_t barrier_num, const uint64_t participants, int (*color)(int), 113 | const uint32_t total_cores) 114 | { 115 | if (barrier_num >= NUM_BARRIERS) 116 | { 117 | return; 118 | } 119 | 120 | 121 | barriers[barrier_num].num_crossing1 = 0; 122 | barriers[barrier_num].num_crossing2 = 0; 123 | barriers[barrier_num].num_crossing3 = 0; 124 | barriers[barrier_num].color = color; 125 | uint32_t ue, num_parts = 
0; 126 | for (ue = 0; ue < total_cores; ue++) 127 | { 128 | num_parts += color(ue); 129 | } 130 | barriers[barrier_num].num_participants = num_parts; 131 | 132 | } 133 | 134 | 135 | void 136 | barrier_wait(const uint32_t barrier_num, const uint32_t id, const uint32_t total_cores) 137 | { 138 | _mm_mfence(); 139 | if (barrier_num >= NUM_BARRIERS) 140 | { 141 | return; 142 | } 143 | 144 | // printf("enter: %d : %d\n", barrier_num, id); 145 | 146 | barrier_t *b = &barriers[barrier_num]; 147 | 148 | int (*col)(int); 149 | col = b->color; 150 | 151 | if (col(id) == 0) 152 | { 153 | return; 154 | } 155 | 156 | 157 | b->num_crossing2 = 0; 158 | FAI_U64(&b->num_crossing1); 159 | 160 | while (b->num_crossing1 < b->num_participants) 161 | { 162 | PAUSE(); 163 | _mm_mfence(); 164 | } 165 | 166 | 167 | b->num_crossing3 = 0; 168 | 169 | FAI_U64(&b->num_crossing2); 170 | 171 | while (b->num_crossing2 < b->num_participants) 172 | { 173 | PAUSE(); 174 | _mm_mfence(); 175 | } 176 | 177 | b->num_crossing1 = 0; 178 | 179 | FAI_U64(&b->num_crossing3); 180 | 181 | while (b->num_crossing3 < b->num_participants) 182 | { 183 | PAUSE(); 184 | _mm_mfence(); 185 | } 186 | 187 | // printf("EXIT : %d : %d\n", barrier_num, id); 188 | 189 | } 190 | 191 | void 192 | barriers_term(const uint32_t id) 193 | { 194 | if (id == 0) 195 | { 196 | shm_unlink(BARRIER_MEM_FILE); 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /src/ccbench.c: -------------------------------------------------------------------------------- 1 | /* 2 | * File: ccbench.c 3 | * Author: Vasileios Trigonakis 4 | * Description: the main functionality of ccbench 5 | * ccbench.c is part of ccbench 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Vasileios Trigonakis 10 | * 11 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software 
without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | * 28 | */ 29 | 30 | #include "ccbench.h" 31 | 32 | uint8_t ID; 33 | unsigned long* seeds; 34 | 35 | #if defined(__tile__) 36 | cpu_set_t cpus; 37 | #endif 38 | 39 | moesi_type_t test_test = DEFAULT_TEST; 40 | uint32_t test_cores = DEFAULT_CORES; 41 | uint32_t test_reps = DEFAULT_REPS; 42 | uint32_t test_core1 = DEFAULT_CORE1; 43 | uint32_t test_core2 = DEFAULT_CORE2; 44 | uint32_t test_core3 = DEFAULT_CORE3; 45 | uint32_t test_core_others = DEFAULT_CORE_OTHERS; 46 | uint32_t test_flush = DEFAULT_FLUSH; 47 | uint32_t test_verbose = DEFAULT_VERBOSE; 48 | uint32_t test_print = DEFAULT_PRINT; 49 | uint32_t test_stride = DEFAULT_STRIDE; 50 | uint32_t test_fence = DEFAULT_FENCE; 51 | uint32_t test_ao_success = DEFAULT_AO_SUCCESS; 52 | size_t test_mem_size = CACHE_LINE_NUM * sizeof(cache_line_t); 53 | uint32_t test_cache_line_num = CACHE_LINE_NUM; 54 | uint32_t test_lfence = DEFAULT_LFENCE; 55 | uint32_t test_sfence = DEFAULT_SFENCE; 56 | 57 | 58 | static void store_0(volatile cache_line_t* cache_line, volatile uint64_t reps); 59 | static void 
store_0_no_pf(volatile cache_line_t* cache_line, volatile uint64_t reps); 60 | static void store_0_eventually(volatile cache_line_t* cl, volatile uint64_t reps); 61 | static void store_0_eventually_pfd1(volatile cache_line_t* cl, volatile uint64_t reps); 62 | 63 | static uint64_t load_0(volatile cache_line_t* cache_line, volatile uint64_t reps); 64 | static uint64_t load_next(volatile uint64_t* cl, volatile uint64_t reps); 65 | static uint64_t load_0_eventually(volatile cache_line_t* cl, volatile uint64_t reps); 66 | static uint64_t load_0_eventually_no_pf(volatile cache_line_t* cl); 67 | 68 | static void invalidate(volatile cache_line_t* cache_line, uint64_t index, volatile uint64_t reps); 69 | static uint32_t cas(volatile cache_line_t* cache_line, volatile uint64_t reps); 70 | static uint32_t cas_0_eventually(volatile cache_line_t* cache_line, volatile uint64_t reps); 71 | static uint32_t cas_no_pf(volatile cache_line_t* cache_line, volatile uint64_t reps); 72 | static uint32_t fai(volatile cache_line_t* cache_line, volatile uint64_t reps); 73 | static uint8_t tas(volatile cache_line_t* cl, volatile uint64_t reps); 74 | static uint32_t swap(volatile cache_line_t* cl, volatile uint64_t reps); 75 | 76 | static size_t parse_size(char* optarg); 77 | static void create_rand_list_cl(volatile uint64_t* list, size_t n); 78 | 79 | 80 | int 81 | main(int argc, char **argv) 82 | { 83 | 84 | /* before doing any allocations */ 85 | #if defined(__tile__) 86 | if (tmc_cpus_get_my_affinity(&cpus) != 0) 87 | { 88 | tmc_task_die("Failure in 'tmc_cpus_get_my_affinity()'."); 89 | } 90 | #endif 91 | 92 | #if defined(XEON) 93 | set_cpu(1); 94 | #else 95 | set_cpu(0); 96 | #endif 97 | 98 | struct option long_options[] = 99 | { 100 | // These options don't set a flag 101 | {"help", no_argument, NULL, 'h'}, 102 | {"cores", required_argument, NULL, 'c'}, 103 | {"repetitions", required_argument, NULL, 'r'}, 104 | {"test", required_argument, NULL, 't'}, 105 | {"core1", required_argument, 
NULL, 'x'}, 106 | {"core2", required_argument, NULL, 'y'}, 107 | {"core3", required_argument, NULL, 'z'}, 108 | {"core-others", required_argument, NULL, 'o'}, 109 | {"stride", required_argument, NULL, 's'}, 110 | {"fence", required_argument, NULL, 'e'}, 111 | {"mem-size", required_argument, NULL, 'm'}, 112 | {"flush", no_argument, NULL, 'f'}, 113 | {"success", no_argument, NULL, 'u'}, 114 | {"verbose", no_argument, NULL, 'v'}, 115 | {"print", required_argument, NULL, 'p'}, 116 | {NULL, 0, NULL, 0} 117 | }; 118 | 119 | int i; 120 | char c; 121 | while(1) 122 | { 123 | i = 0; 124 | c = getopt_long(argc, argv, "hc:r:t:x:m:y:z:o:e:fvup:s:", long_options, &i); 125 | 126 | if(c == -1) 127 | break; 128 | 129 | if(c == 0 && long_options[i].flag == 0) 130 | c = long_options[i].val; 131 | 132 | switch(c) 133 | { 134 | case 0: 135 | /* Flag is automatically set */ 136 | break; 137 | case 'h': 138 | printf("ccbench Copyright (C) 2013 Vasileios Trigonakis \n" 139 | "This program comes with ABSOLUTELY NO WARRANTY.\n" 140 | "This is free software, and you are welcome to redistribute it under certain conditions.\n\n" 141 | "ccbecnh is an application for measuring the cache-coherence latencies, i.e., the latencies of\n" 142 | "of loads, stores, CAS, FAI, TAS, and SWAP\n" 143 | "\n" 144 | "Usage:\n" 145 | " ./ccbench [options...]\n" 146 | "\n" 147 | "Options:\n" 148 | " -h, --help\n" 149 | " Print this message\n" 150 | " -c, --cores \n" 151 | " Number of cores to run the test on (default=" XSTR(DEFAULT_CORES) ")\n" 152 | " -r, --repetitions \n" 153 | " Repetitions of the test case (default=" XSTR(DEFAULT_REPS) ")\n" 154 | " -t, --test \n" 155 | " Test case to run (default=" XSTR(DEFAULT_TEST) "). See below for supported events\n" 156 | " -x, --core1 \n" 157 | " 1st core to use (default=" XSTR(DEFAULT_CORE1) ")\n" 158 | " -y, --core2 \n" 159 | " 2nd core to use (default=" XSTR(DEFAULT_CORE2) ")\n" 160 | " -z, --core3 \n" 161 | " 3rd core to use. 
Some (most) tests use only 2 cores (default=" XSTR(DEFAULT_CORE3) ")\n" 162 | " -o, --core-others \n" 163 | " Offset for core that the processes with ID > 3 should bind (default=" XSTR(DEFAULT_CORE_OTHERS) ")\n" 164 | " -f, --flush\n" 165 | " Perform a cache line flush before the test (default=" XSTR(DEFAULT_FLUSH) ")\n" 166 | " -s, --stride \n" 167 | " What stride size to use when accessing the cache line(s) (default=" XSTR(DEFAULT_STRIDE) ")\n" 168 | " The application draws a random number X in the [0..(stride-1)] range and applies the target\n" 169 | " operation on this random cache line. The operation is completed when X=0. The stride is used\n" 170 | " in order to fool the hardware prefetchers that could hide the latency we want to measure.\n" 171 | " -e, --fence \n" 172 | " What memory barrier (fence) lvl to use (default=" XSTR(DEFAULT_FENCE) ")\n" 173 | " 0 = no fences / 1 = load-store fences / 2 = full fences / 3 = load-none fences / 4 = none-store fences\n" 174 | " 5 = full-none fences / 6 = none-full fences / 7 = full-store fences / 8 = load-full fences \n" 175 | " -m, --mem-size \n" 176 | " What memory size to use (in cache lines) (default=" XSTR(CACHE_LINE_NUM) ")\n" 177 | " -u, --success\n" 178 | " Make all atomic operations be successfull (e.g, TAS_ON_SHARED)\n" 179 | " -v, --verbose\n" 180 | " Verbose printing of results (default=" XSTR(DEFAULT_VERBOSE) ")\n" 181 | " -p, --print \n" 182 | " If verbose, how many results to print (default=" XSTR(DEFAULT_PRINT) ")\n" 183 | ); 184 | printf("Supported events: \n"); 185 | int ar; 186 | for (ar = 0; ar < NUM_EVENTS; ar++) 187 | { 188 | printf(" %2d - %s\n", ar, moesi_type_des[ar]); 189 | } 190 | 191 | exit(0); 192 | case 'c': 193 | test_cores = atoi(optarg); 194 | break; 195 | case 'r': 196 | test_reps = atoi(optarg); 197 | break; 198 | case 't': 199 | test_test = atoi(optarg); 200 | break; 201 | case 'x': 202 | test_core1 = atoi(optarg); 203 | break; 204 | case 'y': 205 | test_core2 = atoi(optarg); 206 | 
break; 207 | case 'z': 208 | test_core3 = atoi(optarg); 209 | break; 210 | case 'o': 211 | test_core_others = atoi(optarg); 212 | break; 213 | case 'f': 214 | test_flush = 1; 215 | break; 216 | case 's': 217 | test_stride = pow2roundup(atoi(optarg)); 218 | break; 219 | case 'e': 220 | test_fence = atoi(optarg); 221 | break; 222 | case 'm': 223 | test_mem_size = parse_size(optarg); 224 | printf("Data size : %zu KiB\n", test_mem_size / 1024); 225 | break; 226 | case 'u': 227 | test_ao_success = 1; 228 | break; 229 | case 'v': 230 | test_verbose = 1; 231 | break; 232 | case 'p': 233 | test_verbose = 1; 234 | test_print = atoi(optarg); 235 | break; 236 | case '?': 237 | printf("Use -h or --help for help\n"); 238 | exit(0); 239 | default: 240 | exit(1); 241 | } 242 | } 243 | 244 | 245 | test_cache_line_num = test_mem_size / sizeof(cache_line_t); 246 | 247 | if ((test_test == STORE_ON_EXCLUSIVE || test_test == STORE_ON_INVALID || test_test == LOAD_FROM_INVALID 248 | || test_test == LOAD_FROM_EXCLUSIVE || test_test == LOAD_FROM_SHARED) && !test_flush) 249 | { 250 | assert((test_reps * test_stride) <= test_cache_line_num); 251 | } 252 | 253 | if (test_test != LOAD_FROM_MEM_SIZE) 254 | { 255 | assert(test_stride < test_cache_line_num); 256 | } 257 | 258 | 259 | ID = 0; 260 | printf("test: %20s / #cores: %d / #repetitions: %d / stride: %d (%u kiB)", moesi_type_des[test_test], 261 | test_cores, test_reps, test_stride, (64 * test_stride) / 1024); 262 | if (test_flush) 263 | { 264 | printf(" / flush"); 265 | } 266 | 267 | printf(" / fence: "); 268 | 269 | switch (test_fence) 270 | { 271 | case 1: 272 | printf(" load & store"); 273 | test_lfence = test_sfence = 1; 274 | break; 275 | case 2: 276 | printf(" full"); 277 | test_lfence = test_sfence = 2; 278 | break; 279 | case 3: 280 | printf(" load"); 281 | test_lfence = 1; 282 | test_sfence = 0; 283 | break; 284 | case 4: 285 | printf(" store"); 286 | test_lfence = 0; 287 | test_sfence = 1; 288 | break; 289 | case 5: 290 | 
printf(" full/none"); 291 | test_lfence = 2; 292 | test_sfence = 0; 293 | break; 294 | case 6: 295 | printf(" none/full"); 296 | test_lfence = 0; 297 | test_sfence = 2; 298 | break; 299 | case 7: 300 | printf(" full/store"); 301 | test_lfence = 2; 302 | test_sfence = 1; 303 | break; 304 | case 8: 305 | printf(" load/full"); 306 | test_lfence = 1; 307 | test_sfence = 2; 308 | break; 309 | case 9: 310 | printf(" double write"); 311 | test_lfence = 0; 312 | test_sfence = 3; 313 | break; 314 | default: 315 | printf(" none"); 316 | test_lfence = test_sfence = 0; 317 | break; 318 | } 319 | 320 | printf("\n"); 321 | 322 | printf("core1: %3u / core2: %3u ", test_core1, test_core2); 323 | if (test_cores >= 3) 324 | { 325 | printf("/ core3: %3u", test_core3); 326 | } 327 | printf("\n"); 328 | 329 | barriers_init(test_cores); 330 | seeds = seed_rand(); 331 | 332 | volatile cache_line_t* cache_line = cache_line_open(); 333 | 334 | int rank; 335 | for (rank = 1; rank < test_cores; rank++) 336 | { 337 | pid_t child = fork(); 338 | if (child < 0) 339 | { 340 | P("Failure in fork():\n%s", strerror(errno)); 341 | } 342 | else if (child == 0) 343 | { 344 | goto fork_done; 345 | } 346 | } 347 | rank = 0; 348 | 349 | fork_done: 350 | ID = rank; 351 | size_t core = 0; 352 | switch (ID) 353 | { 354 | case 0: 355 | core = test_core1; 356 | break; 357 | case 1: 358 | core = test_core2; 359 | break; 360 | case 2: 361 | core = test_core3; 362 | break; 363 | default: 364 | core = ID - test_core_others; 365 | } 366 | 367 | #if defined(NIAGARA) 368 | if (test_cores <= 8 && test_cores > 3) 369 | { 370 | if (ID == 0) 371 | { 372 | PRINT(" ** spreading the 8 threads on the 8 real cores"); 373 | } 374 | core = ID * 8; 375 | } 376 | #endif 377 | 378 | set_cpu(core); 379 | 380 | #if defined(__tile__) 381 | tmc_cmem_init(0); /* initialize shared memory */ 382 | #endif /* TILERA */ 383 | 384 | volatile uint64_t* cl = (volatile uint64_t*) cache_line; 385 | 386 | B0; 387 | if (ID < 3) 388 | { 389 | 
PFDINIT(test_reps); 390 | } 391 | B0; 392 | 393 | /* /\********************************************************************************* */ 394 | /* * main functionality */ 395 | /* *********************************************************************************\/ */ 396 | 397 | uint64_t sum = 0; 398 | 399 | volatile uint64_t reps; 400 | for (reps = 0; reps < test_reps; reps++) 401 | { 402 | if (test_flush) 403 | { 404 | _mm_mfence(); 405 | _mm_clflush((void*) cache_line); 406 | _mm_mfence(); 407 | } 408 | 409 | B0; /* BARRIER 0 */ 410 | 411 | switch (test_test) 412 | { 413 | case STORE_ON_MODIFIED: /* 0 */ 414 | { 415 | switch (ID) 416 | { 417 | case 0: 418 | store_0_eventually(cache_line, reps); 419 | B1; /* BARRIER 1 */ 420 | break; 421 | case 1: 422 | B1; /* BARRIER 1 */ 423 | store_0_eventually(cache_line, reps); 424 | break; 425 | default: 426 | B1; /* BARRIER 1 */ 427 | break; 428 | } 429 | break; 430 | } 431 | case STORE_ON_MODIFIED_NO_SYNC: /* 1 */ 432 | { 433 | switch (ID) 434 | { 435 | case 0: 436 | case 1: 437 | case 2: 438 | store_0(cache_line, reps); 439 | break; 440 | default: 441 | store_0_no_pf(cache_line, reps); 442 | break; 443 | } 444 | break; 445 | } 446 | case STORE_ON_EXCLUSIVE: /* 2 */ 447 | { 448 | switch (ID) 449 | { 450 | case 0: 451 | sum += load_0_eventually(cache_line, reps); 452 | B1; /* BARRIER 1 */ 453 | break; 454 | case 1: 455 | B1; /* BARRIER 1 */ 456 | store_0_eventually(cache_line, reps); 457 | break; 458 | default: 459 | B1; /* BARRIER 1 */ 460 | break; 461 | } 462 | 463 | if (!test_flush) 464 | { 465 | cache_line += test_stride; 466 | } 467 | break; 468 | } 469 | case STORE_ON_SHARED: /* 3 */ 470 | { 471 | switch (ID) 472 | { 473 | case 0: 474 | sum += load_0_eventually(cache_line, reps); 475 | B1; /* BARRIER 1 */ 476 | B2; /* BARRIER 2 */ 477 | break; 478 | case 1: 479 | B1; /* BARRIER 1 */ 480 | B2; /* BARRIER 2 */ 481 | store_0_eventually(cache_line, reps); 482 | break; 483 | case 2: 484 | B1; /* BARRIER 1 */ 485 | sum += 
load_0_eventually(cache_line, reps); 486 | B2; /* BARRIER 2 */ 487 | break; 488 | default: 489 | B1; /* BARRIER 1 */ 490 | sum += load_0_eventually_no_pf(cache_line); 491 | B2; /* BARRIER 2 */ 492 | break; 493 | } 494 | break; 495 | } 496 | case STORE_ON_OWNED_MINE: /* 4 */ 497 | { 498 | switch (ID) 499 | { 500 | case 0: 501 | B1; /* BARRIER 1 */ 502 | sum += load_0_eventually(cache_line, reps); 503 | B2; /* BARRIER 2 */ 504 | break; 505 | case 1: 506 | store_0_eventually(cache_line, reps); 507 | B1; /* BARRIER 1 */ 508 | B2; /* BARRIER 2 */ 509 | store_0_eventually_pfd1(cache_line, reps); 510 | break; 511 | default: 512 | B1; /* BARRIER 1 */ 513 | sum += load_0_eventually_no_pf(cache_line); 514 | B2; /* BARRIER 2 */ 515 | break; 516 | } 517 | break; 518 | } 519 | case STORE_ON_OWNED: /* 5 */ 520 | { 521 | switch (ID) 522 | { 523 | case 0: 524 | store_0_eventually(cache_line, reps); 525 | B1; /* BARRIER 1 */ 526 | B2; /* BARRIER 2 */ 527 | break; 528 | case 1: 529 | B1; /* BARRIER 1 */ 530 | sum += load_0_eventually(cache_line, reps); 531 | B2; /* BARRIER 2 */ 532 | store_0_eventually_pfd1(cache_line, reps); 533 | break; 534 | default: 535 | B1; /* BARRIER 1 */ 536 | sum += load_0_eventually_no_pf(cache_line); 537 | B2; /* BARRIER 2 */ 538 | break; 539 | } 540 | break; 541 | } 542 | case STORE_ON_INVALID: /* 6 */ 543 | { 544 | switch (ID) 545 | { 546 | case 0: 547 | B1; 548 | /* store_0_eventually(cache_line, reps); */ 549 | store_0(cache_line, reps); 550 | if (!test_flush) 551 | { 552 | cache_line += test_stride; 553 | } 554 | break; 555 | case 1: 556 | invalidate(cache_line, 0, reps); 557 | if (!test_flush) 558 | { 559 | cache_line += test_stride; 560 | } 561 | B1; 562 | break; 563 | default: 564 | B1; 565 | break; 566 | } 567 | break; 568 | } 569 | case LOAD_FROM_MODIFIED: /* 7 */ 570 | { 571 | switch (ID) 572 | { 573 | case 0: 574 | store_0_eventually(cache_line, reps); 575 | B1; 576 | break; 577 | case 1: 578 | B1; /* BARRIER 1 */ 579 | sum += 
load_0_eventually(cache_line, reps); 580 | break; 581 | default: 582 | B1; 583 | break; 584 | } 585 | break; 586 | } 587 | case LOAD_FROM_EXCLUSIVE: /* 8 */ 588 | { 589 | switch (ID) 590 | { 591 | case 0: 592 | sum += load_0_eventually(cache_line, reps); 593 | B1; /* BARRIER 1 */ 594 | 595 | if (!test_flush) 596 | { 597 | cache_line += test_stride; 598 | } 599 | break; 600 | case 1: 601 | B1; /* BARRIER 1 */ 602 | sum += load_0_eventually(cache_line, reps); 603 | 604 | if (!test_flush) 605 | { 606 | cache_line += test_stride; 607 | } 608 | break; 609 | default: 610 | B1; /* BARRIER 1 */ 611 | break; 612 | } 613 | break; 614 | } 615 | case LOAD_FROM_SHARED: /* 9 */ 616 | { 617 | switch (ID) 618 | { 619 | case 0: 620 | sum += load_0_eventually(cache_line, reps); 621 | B1; /* BARRIER 1 */ 622 | B2; /* BARRIER 2 */ 623 | break; 624 | case 1: 625 | B1; /* BARRIER 1 */ 626 | sum += load_0_eventually(cache_line, reps); 627 | B2; /* BARRIER 2 */ 628 | break; 629 | case 2: 630 | B1; /* BARRIER 1 */ 631 | B2; /* BARRIER 2 */ 632 | sum += load_0_eventually(cache_line, reps); 633 | break; 634 | default: 635 | B1; /* BARRIER 1 */ 636 | sum += load_0_eventually_no_pf(cache_line); 637 | B2; /* BARRIER 2 */ 638 | break; 639 | } 640 | 641 | if (!test_flush) 642 | { 643 | cache_line += test_stride; 644 | } 645 | break; 646 | } 647 | case LOAD_FROM_OWNED: /* 10 */ 648 | { 649 | switch (ID) 650 | { 651 | case 0: 652 | store_0_eventually(cache_line, reps); 653 | B1; /* BARRIER 1 */ 654 | B2; /* BARRIER 2 */ 655 | break; 656 | case 1: 657 | B1; /* BARRIER 1 */ 658 | sum += load_0_eventually(cache_line, reps); 659 | B2; /* BARRIER 2 */ 660 | break; 661 | case 2: 662 | B1; /* BARRIER 1 */ 663 | B2; /* BARRIER 2 */ 664 | sum += load_0_eventually(cache_line, reps); 665 | break; 666 | default: 667 | B1; /* BARRIER 1 */ 668 | B2; /* BARRIER 2 */ 669 | break; 670 | } 671 | break; 672 | } 673 | case LOAD_FROM_INVALID: /* 11 */ 674 | { 675 | switch (ID) 676 | { 677 | case 0: 678 | B1; /* BARRIER 
1 */ 679 | sum += load_0_eventually(cache_line, reps); /* sum += load_0(cache_line, reps); */ 680 | break; 681 | case 1: 682 | invalidate(cache_line, 0, reps); 683 | B1; /* BARRIER 1 */ 684 | break; 685 | default: 686 | B1; /* BARRIER 1 */ 687 | break; 688 | } 689 | 690 | if (!test_flush) 691 | { 692 | cache_line += test_stride; 693 | } 694 | break; 695 | } 696 | case CAS: /* 12 */ 697 | { 698 | switch (ID) 699 | { 700 | case 0: 701 | sum += cas_0_eventually(cache_line, reps); 702 | B1; /* BARRIER 1 */ 703 | break; 704 | case 1: 705 | B1; /* BARRIER 1 */ 706 | sum += cas_0_eventually(cache_line, reps); 707 | break; 708 | default: 709 | B1; /* BARRIER 1 */ 710 | break; 711 | } 712 | break; 713 | } 714 | case FAI: /* 13 */ 715 | { 716 | switch (ID) 717 | { 718 | case 0: 719 | sum += fai(cache_line, reps); 720 | B1; /* BARRIER 1 */ 721 | break; 722 | case 1: 723 | B1; /* BARRIER 1 */ 724 | sum += fai(cache_line, reps); 725 | break; 726 | default: 727 | B1; /* BARRIER 1 */ 728 | break; 729 | } 730 | break; 731 | } 732 | case TAS: /* 14 */ 733 | { 734 | switch (ID) 735 | { 736 | case 0: 737 | sum += tas(cache_line, reps); 738 | B1; /* BARRIER 1 */ 739 | B2; /* BARRIER 2 */ 740 | break; 741 | case 1: 742 | B1; /* BARRIER 1 */ 743 | sum += tas(cache_line, reps); 744 | _mm_mfence(); 745 | cache_line->word[0] = 0; 746 | B2; /* BARRIER 2 */ 747 | break; 748 | default: 749 | B1; /* BARRIER 1 */ 750 | B2; /* BARRIER 2 */ 751 | break; 752 | } 753 | break; 754 | } 755 | case SWAP: /* 15 */ 756 | { 757 | switch (ID) 758 | { 759 | case 0: 760 | sum += swap(cache_line, reps); 761 | B1; /* BARRIER 1 */ 762 | break; 763 | case 1: 764 | B1; /* BARRIER 1 */ 765 | sum += swap(cache_line, reps); 766 | break; 767 | default: 768 | B1; /* BARRIER 1 */ 769 | break; 770 | } 771 | break; 772 | } 773 | case CAS_ON_MODIFIED: /* 16 */ 774 | { 775 | switch (ID) 776 | { 777 | case 0: 778 | store_0_eventually(cache_line, reps); 779 | if (test_ao_success) 780 | { 781 | cache_line->word[0] = reps & 
0x01; 782 | } 783 | B1; /* BARRIER 1 */ 784 | break; 785 | case 1: 786 | B1; /* BARRIER 1 */ 787 | sum += cas_0_eventually(cache_line, reps); 788 | break; 789 | default: 790 | B1; /* BARRIER 1 */ 791 | break; 792 | } 793 | break; 794 | } 795 | case FAI_ON_MODIFIED: /* 17 */ 796 | { 797 | switch (ID) 798 | { 799 | case 0: 800 | store_0_eventually(cache_line, reps); 801 | B1; /* BARRIER 1 */ 802 | break; 803 | case 1: 804 | B1; /* BARRIER 1 */ 805 | sum += fai(cache_line, reps); 806 | break; 807 | default: 808 | B1; /* BARRIER 1 */ 809 | break; 810 | } 811 | break; 812 | } 813 | case TAS_ON_MODIFIED: /* 18 */ 814 | { 815 | switch (ID) 816 | { 817 | case 0: 818 | store_0_eventually(cache_line, reps); 819 | if (!test_ao_success) 820 | { 821 | cache_line->word[0] = 0xFFFFFFFF; 822 | _mm_mfence(); 823 | } 824 | B1; /* BARRIER 1 */ 825 | break; 826 | case 1: 827 | B1; /* BARRIER 1 */ 828 | sum += tas(cache_line, reps); 829 | break; 830 | default: 831 | B1; /* BARRIER 1 */ 832 | break; 833 | } 834 | break; 835 | } 836 | case SWAP_ON_MODIFIED: /* 19 */ 837 | { 838 | switch (ID) 839 | { 840 | case 0: 841 | store_0_eventually(cache_line, reps); 842 | B1; /* BARRIER 1 */ 843 | break; 844 | case 1: 845 | B1; /* BARRIER 1 */ 846 | sum += swap(cache_line, reps); 847 | break; 848 | default: 849 | B1; /* BARRIER 1 */ 850 | break; 851 | } 852 | break; 853 | } 854 | case CAS_ON_SHARED: /* 20 */ 855 | { 856 | switch (ID) 857 | { 858 | case 0: 859 | sum += load_0_eventually(cache_line, reps); 860 | B1; /* BARRIER 1 */ 861 | B2; /* BARRIER 2 */ 862 | break; 863 | case 1: 864 | B1; /* BARRIER 1 */ 865 | B2; /* BARRIER 2 */ 866 | sum += cas_0_eventually(cache_line, reps); 867 | break; 868 | case 2: 869 | B1; /* BARRIER 1 */ 870 | sum += load_0_eventually(cache_line, reps); 871 | B2; /* BARRIER 2 */ 872 | break; 873 | default: 874 | B1; /* BARRIER 1 */ 875 | sum += load_0_eventually_no_pf(cache_line); 876 | B2; /* BARRIER 2 */ 877 | break; 878 | } 879 | break; 880 | } 881 | case 
FAI_ON_SHARED: /* 21 */ 882 | { 883 | switch (ID) 884 | { 885 | case 0: 886 | sum += load_0_eventually(cache_line, reps); 887 | B1; /* BARRIER 1 */ 888 | B2; /* BARRIER 2 */ 889 | break; 890 | case 1: 891 | B1; /* BARRIER 1 */ 892 | B2; /* BARRIER 2 */ 893 | sum += fai(cache_line, reps); 894 | break; 895 | case 2: 896 | B1; /* BARRIER 1 */ 897 | sum += load_0_eventually(cache_line, reps); 898 | B2; /* BARRIER 2 */ 899 | break; 900 | default: 901 | B1; /* BARRIER 1 */ 902 | sum += load_0_eventually_no_pf(cache_line); 903 | B2; /* BARRIER 2 */ 904 | break; 905 | } 906 | break; 907 | } 908 | case TAS_ON_SHARED: /* 22 */ 909 | { 910 | switch (ID) 911 | { 912 | case 0: 913 | if (test_ao_success) 914 | { 915 | cache_line->word[0] = 0; 916 | } 917 | else 918 | { 919 | cache_line->word[0] = 0xFFFFFFFF; 920 | } 921 | sum += load_0_eventually(cache_line, reps); 922 | B1; /* BARRIER 1 */ 923 | B2; /* BARRIER 2 */ 924 | break; 925 | case 1: 926 | B1; /* BARRIER 1 */ 927 | B2; /* BARRIER 2 */ 928 | sum += tas(cache_line, reps); 929 | break; 930 | case 2: 931 | B1; /* BARRIER 1 */ 932 | sum += load_0_eventually(cache_line, reps); 933 | B2; /* BARRIER 2 */ 934 | break; 935 | default: 936 | B1; /* BARRIER 1 */ 937 | sum += load_0_eventually_no_pf(cache_line); 938 | B2; /* BARRIER 2 */ 939 | break; 940 | } 941 | break; 942 | } 943 | case SWAP_ON_SHARED: /* 23 */ 944 | { 945 | switch (ID) 946 | { 947 | case 0: 948 | sum += load_0_eventually(cache_line, reps); 949 | B1; /* BARRIER 1 */ 950 | B2; /* BARRIER 2 */ 951 | break; 952 | case 1: 953 | B1; /* BARRIER 1 */ 954 | B2; /* BARRIER 2 */ 955 | sum += swap(cache_line, reps); 956 | break; 957 | case 2: 958 | B1; /* BARRIER 1 */ 959 | sum += load_0_eventually(cache_line, reps); 960 | B2; /* BARRIER 2 */ 961 | break; 962 | default: 963 | B1; /* BARRIER 1 */ 964 | sum += load_0_eventually_no_pf(cache_line); 965 | B2; /* BARRIER 2 */ 966 | break; 967 | } 968 | break; 969 | } 970 | case CAS_CONCURRENT: /* 24 */ 971 | { 972 | switch (ID) 
973 | { 974 | case 0: 975 | case 1: 976 | sum += cas(cache_line, reps); 977 | break; 978 | default: 979 | sum += cas_no_pf(cache_line, reps); 980 | break; 981 | } 982 | break; 983 | } 984 | case FAI_ON_INVALID: /* 25 */ 985 | { 986 | switch (ID) 987 | { 988 | case 0: 989 | B1; /* BARRIER 1 */ 990 | sum += fai(cache_line, reps); 991 | break; 992 | case 1: 993 | invalidate(cache_line, 0, reps); 994 | B1; /* BARRIER 1 */ 995 | break; 996 | default: 997 | B1; /* BARRIER 1 */ 998 | break; 999 | } 1000 | 1001 | if (!test_flush) 1002 | { 1003 | cache_line += test_stride; 1004 | } 1005 | break; 1006 | } 1007 | case LOAD_FROM_L1: /* 26 */ 1008 | { 1009 | if (ID == 0) 1010 | { 1011 | sum += load_0(cache_line, reps); 1012 | sum += load_0(cache_line, reps); 1013 | sum += load_0(cache_line, reps); 1014 | } 1015 | break; 1016 | } 1017 | case LOAD_FROM_MEM_SIZE: /* 27 */ 1018 | { 1019 | if (ID < 3) 1020 | { 1021 | sum += load_next(cl, reps); 1022 | } 1023 | } 1024 | break; 1025 | case LFENCE: /* 28 */ 1026 | if (ID < 2) 1027 | { 1028 | PFDI(0); 1029 | _mm_lfence(); 1030 | PFDO(0, reps); 1031 | } 1032 | break; 1033 | case SFENCE: /* 29 */ 1034 | if (ID < 2) 1035 | { 1036 | PFDI(0); 1037 | _mm_sfence(); 1038 | PFDO(0, reps); 1039 | } 1040 | break; 1041 | case MFENCE: /* 30 */ 1042 | if (ID < 2) 1043 | { 1044 | PFDI(0); 1045 | _mm_mfence(); 1046 | PFDO(0, reps); 1047 | } 1048 | break; 1049 | case PAUSE: /* 31 */ 1050 | if (ID < 2) 1051 | { 1052 | PFDI(0); 1053 | _mm_pause(); 1054 | PFDO(0, reps); 1055 | } 1056 | break; 1057 | case NOP: /* 32 */ 1058 | if (ID < 2) 1059 | { 1060 | PFDI(0); 1061 | asm volatile ("nop"); 1062 | PFDO(0, reps); 1063 | } 1064 | break; 1065 | case PROFILER: /* 30 */ 1066 | default: 1067 | PFDI(0); 1068 | asm volatile (""); 1069 | PFDO(0, reps); 1070 | break; 1071 | } 1072 | 1073 | B3; /* BARRIER 3 */ 1074 | } 1075 | 1076 | if (!test_verbose) 1077 | { 1078 | test_print = 0; 1079 | } 1080 | 1081 | uint32_t id; 1082 | for (id = 0; id < test_cores; id++) 1083 | 
{ 1084 | if (ID == id && ID < 3) 1085 | { 1086 | switch (test_test) 1087 | { 1088 | case STORE_ON_OWNED_MINE: 1089 | case STORE_ON_OWNED: 1090 | if (ID < 2) 1091 | { 1092 | PRINT(" *** Core %2d ************************************************************************************", ID); 1093 | PFDPN(0, test_reps, test_print); 1094 | if (ID == 1) 1095 | { 1096 | PFDPN(1, test_reps, test_print); 1097 | } 1098 | } 1099 | break; 1100 | case CAS_CONCURRENT: 1101 | if (ID < 2) 1102 | { 1103 | PRINT(" *** Core %2d ************************************************************************************", ID); 1104 | PFDPN(0, test_reps, test_print); 1105 | } 1106 | break; 1107 | case LOAD_FROM_L1: 1108 | if (ID < 1) 1109 | { 1110 | PRINT(" *** Core %2d ************************************************************************************", ID); 1111 | PFDPN(0, test_reps, test_print); 1112 | } 1113 | break; 1114 | case LOAD_FROM_MEM_SIZE: 1115 | if (ID < 3) 1116 | { 1117 | PRINT(" *** Core %2d ************************************************************************************", ID); 1118 | PFDPN(0, test_reps, test_print); 1119 | } 1120 | break; 1121 | default: 1122 | PRINT(" *** Core %2d ************************************************************************************", ID); 1123 | PFDPN(0, test_reps, test_print); 1124 | } 1125 | } 1126 | B0; 1127 | } 1128 | B10; 1129 | 1130 | 1131 | if (ID == 0) 1132 | { 1133 | switch (test_test) 1134 | { 1135 | case STORE_ON_MODIFIED: 1136 | { 1137 | if (test_flush) 1138 | { 1139 | PRINT(" ** Results from Core 0 : store on invalid"); 1140 | PRINT(" ** Results from Core 1 : store on modified"); 1141 | } 1142 | else 1143 | { 1144 | PRINT(" ** Results from Core 0 and 1 : store on modified"); 1145 | } 1146 | break; 1147 | } 1148 | case STORE_ON_MODIFIED_NO_SYNC: 1149 | { 1150 | if (test_flush) 1151 | { 1152 | PRINT(" ** Results do not make sense"); 1153 | } 1154 | else 1155 | { 1156 | PRINT(" ** Results from Core 0 and 1 : store on modified while 
another core is " 1157 | "also trying to do the same"); 1158 | } 1159 | break; 1160 | } 1161 | case STORE_ON_EXCLUSIVE: 1162 | { 1163 | if (test_flush) 1164 | { 1165 | PRINT(" ** Results from Core 0 : load from invalid"); 1166 | } 1167 | else 1168 | { 1169 | PRINT(" ** Results from Core 0 : load from invalid, BUT could have prefetching"); 1170 | } 1171 | PRINT(" ** Results from Core 1 : store on exclusive"); 1172 | break; 1173 | } 1174 | case STORE_ON_SHARED: 1175 | { 1176 | PRINT(" ** Results from Core 0 & 2: load from modified and exclusive or shared, respectively"); 1177 | PRINT(" ** Results from Core 1 : store on shared"); 1178 | if (test_cores < 3) 1179 | { 1180 | PRINT(" ** Need >=3 processes to achieve STORE_ON_SHARED"); 1181 | } 1182 | break; 1183 | } 1184 | case STORE_ON_OWNED_MINE: 1185 | { 1186 | PRINT(" ** Results from Core 0 : load from modified (makes it owned, if owned state is supported)"); 1187 | if (test_flush) 1188 | { 1189 | PRINT(" ** Results 1 from Core 1 : store to invalid"); 1190 | } 1191 | else 1192 | { 1193 | PRINT(" ** Results 1 from Core 1 : store to modified mine"); 1194 | } 1195 | 1196 | PRINT(" ** Results 2 from Core 1 : store to owned mine (if owned is supported, else exclusive)"); 1197 | break; 1198 | } 1199 | case STORE_ON_OWNED: 1200 | { 1201 | if (test_flush) 1202 | { 1203 | PRINT(" ** Results from Core 0 : store to modified"); 1204 | } 1205 | else 1206 | { 1207 | PRINT(" ** Results from Core 0 : store to invalid"); 1208 | } 1209 | PRINT(" ** Results 1 from Core 1 : load from modified (makes it owned, if owned state is supported)"); 1210 | PRINT(" ** Results 2 from Core 1 : store to owned (if owned is supported, else exclusive mine)"); 1211 | break; 1212 | } 1213 | case LOAD_FROM_MODIFIED: 1214 | { 1215 | if (test_flush) 1216 | { 1217 | PRINT(" ** Results from Core 0 : store to invalid"); 1218 | } 1219 | else 1220 | { 1221 | PRINT(" ** Results from Core 0 : store to owned mine (if owned state supported, else exclusive)"); 1222 | 
} 1223 | 1224 | PRINT(" ** Results from Core 1 : load from modified (makes it owned, if owned state supported)"); 1225 | 1226 | break; 1227 | } 1228 | case LOAD_FROM_EXCLUSIVE: 1229 | { 1230 | if (test_flush) 1231 | { 1232 | PRINT(" ** Results from Core 0 : load from invalid"); 1233 | } 1234 | else 1235 | { 1236 | PRINT(" ** Results from Core 0 : load from invalid, BUT could have prefetching"); 1237 | } 1238 | PRINT(" ** Results from Core 1 : load from exclusive"); 1239 | 1240 | break; 1241 | } 1242 | case STORE_ON_INVALID: 1243 | { 1244 | PRINT(" ** Results from Core 0 : store on invalid"); 1245 | PRINT(" ** Results from Core 1 : cache line flush"); 1246 | break; 1247 | } 1248 | case LOAD_FROM_INVALID: 1249 | { 1250 | PRINT(" ** Results from Core 0 : load from invalid"); 1251 | PRINT(" ** Results from Core 1 : cache line flush"); 1252 | break; 1253 | } 1254 | case LOAD_FROM_SHARED: 1255 | { 1256 | if (test_flush) 1257 | { 1258 | PRINT(" ** Results from Core 0 : load from invalid"); 1259 | } 1260 | else 1261 | { 1262 | PRINT(" ** Results from Core 0 : load from invalid, BUT could have prefetching"); 1263 | } 1264 | PRINT(" ** Results from Core 1 : load from exclusive"); 1265 | if (test_cores >= 3) 1266 | { 1267 | PRINT(" ** Results from Core 2 : load from shared"); 1268 | } 1269 | else 1270 | { 1271 | PRINT(" ** Need >=3 processes to achieve LOAD_FROM_SHARED"); 1272 | } 1273 | break; 1274 | } 1275 | case LOAD_FROM_OWNED: 1276 | { 1277 | if (test_flush) 1278 | { 1279 | PRINT(" ** Results from Core 0 : store to invalid"); 1280 | } 1281 | else 1282 | { 1283 | PRINT(" ** Results from Core 0 : store to owned mine (if owned is supported, else shared)"); 1284 | } 1285 | PRINT(" ** Results from Core 1 : load from modified"); 1286 | if (test_cores == 3) 1287 | { 1288 | PRINT(" ** Results from Core 2 : load from owned"); 1289 | } 1290 | else 1291 | { 1292 | PRINT(" ** Need 3 processes to achieve LOAD_FROM_OWNED"); 1293 | } 1294 | break; 1295 | } 1296 | case CAS: 1297 | { 
1298 | PRINT(" ** Results from Core 0 : CAS successfull"); 1299 | PRINT(" ** Results from Core 1 : CAS unsuccessfull"); 1300 | break; 1301 | } 1302 | case FAI: 1303 | { 1304 | PRINT(" ** Results from Cores 0 & 1: FAI"); 1305 | break; 1306 | } 1307 | case TAS: 1308 | { 1309 | PRINT(" ** Results from Core 0 : TAS successfull"); 1310 | PRINT(" ** Results from Core 1 : TAS unsuccessfull"); 1311 | break; 1312 | } 1313 | case SWAP: 1314 | { 1315 | PRINT(" ** Results from Cores 0 & 1: SWAP"); 1316 | break; 1317 | } 1318 | case CAS_ON_MODIFIED: 1319 | { 1320 | PRINT(" ** Results from Core 0 : store on modified"); 1321 | uint32_t succ = 50 + test_ao_success * 50; 1322 | PRINT(" ** Results from Core 1 : CAS on modified (%d%% successfull)", succ); 1323 | break; 1324 | } 1325 | case FAI_ON_MODIFIED: 1326 | { 1327 | PRINT(" ** Results from Core 0 : store on modified"); 1328 | PRINT(" ** Results from Core 1 : FAI on modified"); 1329 | break; 1330 | } 1331 | case TAS_ON_MODIFIED: 1332 | { 1333 | PRINT(" ** Results from Core 0 : store on modified"); 1334 | uint32_t succ = test_ao_success * 100; 1335 | PRINT(" ** Results from Core 1 : TAS on modified (%d%% successfull)", succ); 1336 | break; 1337 | } 1338 | case SWAP_ON_MODIFIED: 1339 | { 1340 | PRINT(" ** Results from Core 0 : store on modified"); 1341 | PRINT(" ** Results from Core 1 : SWAP on modified"); 1342 | break; 1343 | } 1344 | case CAS_ON_SHARED: 1345 | { 1346 | PRINT(" ** Results from Core 0 : load from modified"); 1347 | PRINT(" ** Results from Core 1 : CAS on shared (100%% successfull)"); 1348 | PRINT(" ** Results from Core 2 : load from exlusive or shared"); 1349 | if (test_cores < 3) 1350 | { 1351 | PRINT(" ** Need >=3 processes to achieve CAS_ON_SHARED"); 1352 | } 1353 | break; 1354 | } 1355 | case FAI_ON_SHARED: 1356 | { 1357 | PRINT(" ** Results from Core 0 : load from modified"); 1358 | PRINT(" ** Results from Core 1 : FAI on shared"); 1359 | PRINT(" ** Results from Core 2 : load from exlusive or shared"); 1360 | 
if (test_cores < 3) 1361 | { 1362 | PRINT(" ** Need >=3 processes to achieve FAI_ON_SHARED"); 1363 | } 1364 | break; 1365 | } 1366 | case TAS_ON_SHARED: 1367 | { 1368 | PRINT(" ** Results from Core 0 : load from L1"); 1369 | uint32_t succ = test_ao_success * 100; 1370 | PRINT(" ** Results from Core 1 : TAS on shared (%d%% successfull)", succ); 1371 | PRINT(" ** Results from Core 2 : load from exlusive or shared"); 1372 | if (test_cores < 3) 1373 | { 1374 | PRINT(" ** Need >=3 processes to achieve TAS_ON_SHARED"); 1375 | } 1376 | break; 1377 | } 1378 | case SWAP_ON_SHARED: 1379 | { 1380 | PRINT(" ** Results from Core 0 : load from modified"); 1381 | PRINT(" ** Results from Core 1 : SWAP on shared"); 1382 | PRINT(" ** Results from Core 2 : load from exlusive or shared"); 1383 | if (test_cores < 3) 1384 | { 1385 | PRINT(" ** Need >=3 processes to achieve SWAP_ON_SHARED"); 1386 | } 1387 | break; 1388 | } 1389 | case CAS_CONCURRENT: 1390 | { 1391 | PRINT(" ** Results from Cores 0 & 1: CAS concurrent"); 1392 | break; 1393 | } 1394 | case FAI_ON_INVALID: 1395 | { 1396 | PRINT(" ** Results from Core 0 : FAI on invalid"); 1397 | PRINT(" ** Results from Core 1 : cache line flush"); 1398 | break; 1399 | } 1400 | case LOAD_FROM_L1: 1401 | { 1402 | PRINT(" ** Results from Core 0: load from L1"); 1403 | break; 1404 | } 1405 | case LOAD_FROM_MEM_SIZE: 1406 | { 1407 | PRINT(" ** Results from Corees 0 & 1 & 2: load from random %zu KiB", test_mem_size / 1024); 1408 | break; 1409 | } 1410 | case LFENCE: 1411 | { 1412 | PRINT(" ** Results from Cores 0 & 1: load fence"); 1413 | break; 1414 | } 1415 | case SFENCE: 1416 | { 1417 | PRINT(" ** Results from Cores 0 & 1: store fence"); 1418 | break; 1419 | } 1420 | case MFENCE: 1421 | { 1422 | PRINT(" ** Results from Cores 0 & 1: full fence"); 1423 | break; 1424 | } 1425 | case PROFILER: 1426 | { 1427 | PRINT(" ** Results from Cores 0 & 1: empty profiler region (start_prof - empty - stop_prof"); 1428 | break; 1429 | } 1430 | 1431 | default: 
      /* (tail of main(), whose start is above this chunk)
         Closes the per-test "Results from ..." legend switch and the
         enclosing `if (ID == 0)` block, then tears the benchmark down. */
      break;
    }
  }

  B0; /* project barrier macro: sync all processes before teardown */


  if (ID < 3)
  {
    /* Print the final cache-line word and the accumulated sum of loads/ops
       (`sum` is consumed here so the compiler cannot elide the timed loads). */
    PRINT(" value of cl is %-10u / sum is %llu", cache_line->word[0], (LLU) sum);
  }
  cache_line_close(ID, "cache_line");
  barriers_term(ID);
  return 0;

}

/* One CAS on cl->word, timed with PFD probe 0.
 * Expected old value alternates with the repetition parity (reps & 1);
 * the swapped-in value is its negation. Returns 1 iff the CAS succeeded
 * (i.e. the value read back equals the expected old value). */
uint32_t
cas(volatile cache_line_t* cl, volatile uint64_t reps)
{
  uint8_t o = reps & 0x1;
  uint8_t no = !o;
  volatile uint32_t r;

  PFDI(0);
  r = CAS_U32(cl->word, o, no);
  PFDO(0, reps);

  return (r == o);
}

/* Same single CAS as cas(), but without the PFD timing probes
 * (used for the extra processes whose latency is not being recorded). */
uint32_t
cas_no_pf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  uint8_t o = reps & 0x1;
  uint8_t no = !o;
  volatile uint32_t r;
  r = CAS_U32(cl->word, o, no);

  return (r == o);
}

/* Timed CAS on a randomly drawn cache line cl + clrand(), repeated until
 * the draw is 0 — so the result returned is from the cln == 0 iteration.
 * The random stride is there to fool hardware prefetchers (see the
 * --stride help text printed by main). Each attempt is timed with probe 0. */
uint32_t
cas_0_eventually(volatile cache_line_t* cl, volatile uint64_t reps)
{
  uint8_t o = reps & 0x1;
  uint8_t no = !o;
  volatile uint32_t r;

  uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile cache_line_t* cl1 = cl + cln;
      PFDI(0);
      r = CAS_U32(cl1->word, o, no);
      PFDO(0, reps);
    }
  while (cln > 0);

  return (r == o);
}

/* Timed fetch-and-increment on a randomly drawn cache line, repeated until
 * the draw is 0 (same prefetcher-defeating pattern as cas_0_eventually).
 * Returns the value fetched by the final (cln == 0) FAI. */
uint32_t
fai(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t t = 0;

  uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile cache_line_t* cl1 = cl + cln;
      PFDI(0);
      t = FAI_U32(cl1->word);
      PFDO(0, reps);
    }
  while (cln > 0);

  return t;
}

/* Timed test-and-set on a randomly drawn cache line, repeated until the
 * draw is 0. On Tilera the TAS operand is a 32-bit word; elsewhere it is
 * the first byte of the line. Returns 1 iff the final TAS acquired the
 * flag — presumes TAS_U8 yields 255 when the flag was already set;
 * confirm against include/atomic_ops.h. */
uint8_t
tas(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint8_t r;

  uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile cache_line_t* cl1 = cl + cln;
#if defined(TILERA)
      volatile uint32_t* b = (volatile uint32_t*) cl1->word;
#else
      volatile uint8_t* b = (volatile uint8_t*) cl1->word;
#endif

      PFDI(0);
      r = TAS_U8(b);
      PFDO(0, reps);
    }
  while (cln > 0);

  return (r != 255);
}

/* Timed atomic swap (exchange with this process's ID) on a randomly drawn
 * cache line, repeated until the draw is 0. A full fence after the loop
 * orders the swap before the caller proceeds. Returns the value the final
 * swap displaced. */
uint32_t
swap(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t res;

  uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile cache_line_t* cl1 = cl + cln;
      PFDI(0);
      res = SWAP_U32(cl1->word, ID);
      PFDO(0, reps);
    }
  while (cln > 0);

  _mm_mfence();
  return res;
}

/* One timed store of `reps` to cl->word[0], with the fence level selected
 * by the global test_sfence: 0 = no fence, 1 = store fence inside the
 * timed region, 2 = full fence inside the timed region.
 * Note: test_sfence == 3 (double write) is not handled here. */
void
store_0(volatile cache_line_t* cl, volatile uint64_t reps)
{
  if (test_sfence == 0)
    {
      PFDI(0);
      cl->word[0] = reps;
      PFDO(0, reps);
    }
  else if (test_sfence == 1)
    {
      PFDI(0);
      cl->word[0] = reps;
      _mm_sfence();
      PFDO(0, reps);
    }
  else if (test_sfence == 2)
    {
      PFDI(0);
      cl->word[0] = reps;
      _mm_mfence();
      PFDO(0, reps);
    }
}

/* Untimed store of `reps` to cl->word[0] followed by the configured fence
 * (1 = sfence, 2 = mfence; otherwise none). Companion of store_0 for the
 * processes whose latency is not recorded. */
void
store_0_no_pf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  cl->word[0] = reps;
  if (test_sfence == 1)
    {
      _mm_sfence();
    }
  else if (test_sfence == 2)
    {
      _mm_mfence();
    }
}

/* store_0_eventually variant: timed store + store fence on a randomly
 * drawn cache line, repeated until the draw is 0 (probe 0). */
static void
store_0_eventually_sf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile uint32_t *w = &cl[cln].word[0];
      PFDI(0);
      w[0] = cln;
      _mm_sfence();
      PFDO(0, reps);
    }
  while (cln > 0);
}

/* store_0_eventually variant: timed store + full fence on a randomly
 * drawn cache line, repeated until the draw is 0 (probe 0). */
static void
store_0_eventually_mf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile uint32_t *w = &cl[cln].word[0];
      PFDI(0);
      w[0] = cln;
      _mm_mfence();
      PFDO(0, reps);
    }
  while (cln > 0);
}

/* store_0_eventually variant: timed store with no fence on a randomly
 * drawn cache line, repeated until the draw is 0 (probe 0). */
static void
store_0_eventually_nf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile uint32_t *w = &cl[cln].word[0];
      PFDI(0);
      w[0] = cln;
      PFDO(0, reps);
    }
  while (cln > 0);
}

/* store_0_eventually variant for fence level 9 ("double write"): two timed
 * stores, to w[0] and w[16], with no fence.
 * NOTE(review): with 64-byte lines of 32-bit words, w[16] is the first
 * word of the NEXT cache line — presumably the intent is to touch two
 * lines per timed region; confirm against cache_line_t in ccbench.h. */
static void
store_0_eventually_dw(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile uint32_t *w = &cl[cln].word[0];
      PFDI(0);
      w[0] = cln;
      w[16] = cln;
      PFDO(0, reps);
    }
  while (cln > 0);
}

/* Dispatch the "store eventually" primitive to the variant matching the
 * configured store-fence level (set from --fence in main):
 * 0 = none, 1 = sfence, 2 = mfence, 3 = double write. */
void
store_0_eventually(volatile cache_line_t* cl, volatile uint64_t reps)
{
  if (test_sfence == 0)
    {
      store_0_eventually_nf(cl, reps);
    }
  else if (test_sfence == 1)
    {
      store_0_eventually_sf(cl, reps);
    }
  else if (test_sfence == 2)
    {
      store_0_eventually_mf(cl, reps);
    }
  else if (test_sfence == 3)
    {
      store_0_eventually_dw(cl, reps);
    }
  /* _mm_mfence(); */
}


/* Probe-1 variant of store_0_eventually_sf: identical timed store + store
 * fence loop, recorded under PFD probe 1 (used for the second measurement
 * of the *_ON_OWNED tests). */
static void
store_0_eventually_pfd1_sf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile uint32_t *w = &cl[cln].word[0];
      PFDI(1);
      w[0] = cln;
      _mm_sfence();
      PFDO(1, reps);
    }
  while (cln > 0);
}

/* Probe-1 variant of store_0_eventually_mf: timed store + full fence,
 * recorded under PFD probe 1. */
static void
store_0_eventually_pfd1_mf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile uint32_t *w = &cl[cln].word[0];
      PFDI(1);
      w[0] = cln;
      _mm_mfence();
      PFDO(1, reps);
    }
  while (cln > 0);
}

/* Probe-1 variant of store_0_eventually_nf: timed store, no fence,
 * recorded under PFD probe 1. */
static void
store_0_eventually_pfd1_nf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile uint32_t *w = &cl[cln].word[0];
      PFDI(1);
      w[0] = cln;
      PFDO(1, reps);
    }
  while (cln > 0);
}

/* Probe-1 dispatcher mirroring store_0_eventually.
 * NOTE(review): unlike store_0_eventually, there is no branch for
 * test_sfence == 3 (fence level 9, "double write"); at that level this
 * function performs no store at all — confirm whether that is intended. */
void
store_0_eventually_pfd1(volatile cache_line_t* cl, volatile uint64_t reps)
{
  if (test_sfence == 0)
    {
      store_0_eventually_pfd1_nf(cl, reps);
    }
  else if (test_sfence == 1)
    {
      store_0_eventually_pfd1_sf(cl, reps);
    }
  else if (test_sfence == 2)
    {
      store_0_eventually_pfd1_mf(cl, reps);
    }
  /* _mm_mfence(); */
}

/* "Load eventually" variant: timed load + load fence from a randomly drawn
 * cache line, repeated until the draw is 0 (probe 0). Returns the value
 * read by the final (cln == 0) iteration. */
static uint64_t
load_0_eventually_lf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  volatile uint64_t val = 0;

  do
    {
      cln = clrand();
      volatile uint32_t* w = &cl[cln].word[0];
      PFDI(0);
      val = w[0];
      _mm_lfence();
      PFDO(0, reps);
    }
  while (cln > 0);
  return val;
}

/* "Load eventually" variant: timed load + full fence (probe 0).
 * Returns the value read by the final iteration. */
static uint64_t
load_0_eventually_mf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  volatile uint64_t val = 0;

  do
    {
      cln = clrand();
      volatile uint32_t* w = &cl[cln].word[0];
      PFDI(0);
      val = w[0];
      _mm_mfence();
      PFDO(0, reps);
    }
  while (cln > 0);
  return val;
}

/* "Load eventually" variant with no fence (probe 0).
 * (Body continues beyond this chunk.) */
static uint64_t
load_0_eventually_nf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  volatile uint64_t val = 0;

  do
    {
      cln = clrand();
      volatile uint32_t* w = &cl[cln].word[0];
PFDI(0); 1798 | val = w[0]; 1799 | PFDO(0, reps); 1800 | } 1801 | while (cln > 0); 1802 | return val; 1803 | } 1804 | 1805 | 1806 | uint64_t 1807 | load_0_eventually(volatile cache_line_t* cl, volatile uint64_t reps) 1808 | { 1809 | uint64_t val = 0; 1810 | if (test_lfence == 0) 1811 | { 1812 | val = load_0_eventually_nf(cl, reps); 1813 | } 1814 | else if (test_lfence == 1) 1815 | { 1816 | val = load_0_eventually_lf(cl, reps); 1817 | } 1818 | else if (test_lfence == 2) 1819 | { 1820 | val = load_0_eventually_mf(cl, reps); 1821 | } 1822 | _mm_mfence(); 1823 | return val; 1824 | } 1825 | 1826 | uint64_t 1827 | load_0_eventually_no_pf(volatile cache_line_t* cl) 1828 | { 1829 | uint32_t cln = 0; 1830 | uint64_t sum = 0; 1831 | do 1832 | { 1833 | cln = clrand(); 1834 | volatile uint32_t *w = &cl[cln].word[0]; 1835 | sum = w[0]; 1836 | } 1837 | while (cln > 0); 1838 | 1839 | _mm_mfence(); 1840 | return sum; 1841 | } 1842 | 1843 | static uint64_t 1844 | load_0_lf(volatile cache_line_t* cl, volatile uint64_t reps) 1845 | { 1846 | volatile uint32_t val = 0; 1847 | volatile uint32_t* p = (volatile uint32_t*) &cl->word[0]; 1848 | PFDI(0); 1849 | val = p[0]; 1850 | _mm_lfence(); 1851 | PFDO(0, reps); 1852 | return val; 1853 | } 1854 | 1855 | static uint64_t 1856 | load_0_mf(volatile cache_line_t* cl, volatile uint64_t reps) 1857 | { 1858 | volatile uint32_t val = 0; 1859 | volatile uint32_t* p = (volatile uint32_t*) &cl->word[0]; 1860 | PFDI(0); 1861 | val = p[0]; 1862 | _mm_mfence(); 1863 | PFDO(0, reps); 1864 | return val; 1865 | } 1866 | 1867 | static uint64_t 1868 | load_0_nf(volatile cache_line_t* cl, volatile uint64_t reps) 1869 | { 1870 | volatile uint32_t val = 0; 1871 | volatile uint32_t* p = (volatile uint32_t*) &cl->word[0]; 1872 | PFDI(0); 1873 | val = p[0]; 1874 | PFDO(0, reps); 1875 | return val; 1876 | } 1877 | 1878 | 1879 | uint64_t 1880 | load_0(volatile cache_line_t* cl, volatile uint64_t reps) 1881 | { 1882 | uint64_t val = 0; 1883 | if (test_lfence == 0) 
1884 | { 1885 | val = load_0_nf(cl, reps); 1886 | } 1887 | else if (test_lfence == 1) 1888 | { 1889 | val = load_0_lf(cl, reps); 1890 | } 1891 | else if (test_lfence == 2) 1892 | { 1893 | val = load_0_mf(cl, reps); 1894 | } 1895 | _mm_mfence(); 1896 | return val; 1897 | } 1898 | 1899 | static uint64_t 1900 | load_next_lf(volatile uint64_t* cl, volatile uint64_t reps) 1901 | { 1902 | const size_t do_reps = test_cache_line_num; 1903 | PFDI(0); 1904 | int i; 1905 | for (i = 0; i < do_reps; i++) 1906 | { 1907 | cl = (uint64_t*) *cl; 1908 | _mm_lfence(); 1909 | } 1910 | PFDOR(0, reps, do_reps); 1911 | return *cl; 1912 | 1913 | } 1914 | 1915 | static uint64_t 1916 | load_next_mf(volatile uint64_t* cl, volatile uint64_t reps) 1917 | { 1918 | const size_t do_reps = test_cache_line_num; 1919 | PFDI(0); 1920 | int i; 1921 | for (i = 0; i < do_reps; i++) 1922 | { 1923 | cl = (uint64_t*) *cl; 1924 | _mm_mfence(); 1925 | } 1926 | PFDOR(0, reps, do_reps); 1927 | return *cl; 1928 | 1929 | } 1930 | 1931 | static uint64_t 1932 | load_next_nf(volatile uint64_t* cl, volatile uint64_t reps) 1933 | { 1934 | const size_t do_reps = test_cache_line_num; 1935 | PFDI(0); 1936 | int i; 1937 | for (i = 0; i < do_reps; i++) 1938 | { 1939 | cl = (uint64_t*) *cl; 1940 | } 1941 | PFDOR(0, reps, do_reps); 1942 | return *cl; 1943 | } 1944 | 1945 | uint64_t 1946 | load_next(volatile uint64_t* cl, volatile uint64_t reps) 1947 | { 1948 | uint64_t val = 0; 1949 | if (test_lfence == 0) 1950 | { 1951 | val = load_next_nf(cl, reps); 1952 | } 1953 | else if (test_lfence == 1) 1954 | { 1955 | val = load_next_lf(cl, reps); 1956 | } 1957 | else if (test_lfence == 2) 1958 | { 1959 | val = load_next_mf(cl, reps); 1960 | } 1961 | return val; 1962 | } 1963 | 1964 | void 1965 | invalidate(volatile cache_line_t* cl, uint64_t index, volatile uint64_t reps) 1966 | { 1967 | PFDI(0); 1968 | _mm_clflush((void*) (cl + index)); 1969 | PFDO(0, reps); 1970 | _mm_mfence(); 1971 | } 1972 | 1973 | static size_t 1974 | 
parse_size(char* optarg) 1975 | { 1976 | size_t test_mem_size_multi = 1; 1977 | char multi = optarg[strlen(optarg) - 1]; 1978 | if (multi == 'b' || multi == 'B') 1979 | { 1980 | optarg[strlen(optarg) - 1] = optarg[strlen(optarg)]; 1981 | multi = optarg[strlen(optarg) - 1]; 1982 | } 1983 | 1984 | if (multi == 'k' || multi == 'K') 1985 | { 1986 | test_mem_size_multi = 1024; 1987 | optarg[strlen(optarg) - 1] = optarg[strlen(optarg)]; 1988 | } 1989 | else if (multi == 'm' || multi == 'M') 1990 | { 1991 | test_mem_size_multi = 1024 * 1024LL; 1992 | optarg[strlen(optarg) - 1] = optarg[strlen(optarg)]; 1993 | } 1994 | else if (multi == 'g' || multi == 'G') 1995 | { 1996 | test_mem_size_multi = 1024 * 1024 * 1024LL; 1997 | optarg[strlen(optarg) - 1] = optarg[strlen(optarg)]; 1998 | } 1999 | 2000 | return test_mem_size_multi * atoi(optarg); 2001 | } 2002 | 2003 | volatile cache_line_t* 2004 | cache_line_open() 2005 | { 2006 | uint64_t size = test_cache_line_num * sizeof(cache_line_t); 2007 | 2008 | #if defined(__tile__) 2009 | tmc_alloc_t alloc = TMC_ALLOC_INIT; 2010 | tmc_alloc_set_shared(&alloc); 2011 | /* tmc_alloc_set_home(&alloc, TMC_ALLOC_HOME_HASH); */ 2012 | /* tmc_alloc_set_home(&alloc, MAP_CACHE_NO_LOCAL); */ 2013 | tmc_alloc_set_home(&alloc, TMC_ALLOC_HOME_HERE); 2014 | /* tmc_alloc_set_home(&alloc, TMC_ALLOC_HOME_TASK); */ 2015 | 2016 | volatile cache_line_t* cache_line = (volatile cache_line_t*) tmc_alloc_map(&alloc, size); 2017 | if (cache_line == NULL) 2018 | { 2019 | tmc_task_die("Failed to allocate memory."); 2020 | } 2021 | 2022 | tmc_cmem_init(0); /* initialize shared memory */ 2023 | 2024 | 2025 | cache_line->word[0] = 0; 2026 | 2027 | #else /* !__tile__ ****************************************************************************************/ 2028 | char keyF[100]; 2029 | sprintf(keyF, CACHE_LINE_MEM_FILE); 2030 | 2031 | int ssmpfd = shm_open(keyF, O_CREAT | O_EXCL | O_RDWR, S_IRWXU | S_IRWXG); 2032 | if (ssmpfd < 0) 2033 | { 2034 | if (errno != EEXIST) 
2035 | { 2036 | perror("In shm_open"); 2037 | exit(1); 2038 | } 2039 | 2040 | 2041 | ssmpfd = shm_open(keyF, O_CREAT | O_RDWR, S_IRWXU | S_IRWXG); 2042 | if (ssmpfd < 0) 2043 | { 2044 | perror("In shm_open"); 2045 | exit(1); 2046 | } 2047 | } 2048 | else { 2049 | // P("%s newly openned", keyF); 2050 | if (ftruncate(ssmpfd, size) < 0) { 2051 | perror("ftruncate failed\n"); 2052 | exit(1); 2053 | } 2054 | } 2055 | 2056 | volatile cache_line_t* cache_line = 2057 | (volatile cache_line_t *) mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, ssmpfd, 0); 2058 | if (cache_line == NULL) 2059 | { 2060 | perror("cache_line = NULL\n"); 2061 | exit(134); 2062 | } 2063 | 2064 | #endif /* __tile ********************************************************************************************/ 2065 | memset((void*) cache_line, '1', size); 2066 | 2067 | if (ID == 0) 2068 | { 2069 | uint32_t cl; 2070 | for (cl = 0; cl < test_cache_line_num; cl++) 2071 | { 2072 | cache_line[cl].word[0] = 0; 2073 | _mm_clflush((void*) (cache_line + cl)); 2074 | } 2075 | 2076 | if (test_test == LOAD_FROM_MEM_SIZE) 2077 | { 2078 | create_rand_list_cl((volatile uint64_t*) cache_line, test_mem_size / sizeof(uint64_t)); 2079 | } 2080 | 2081 | 2082 | } 2083 | 2084 | _mm_mfence(); 2085 | return cache_line; 2086 | } 2087 | 2088 | static void 2089 | create_rand_list_cl(volatile uint64_t* list, size_t n) 2090 | { 2091 | size_t per_cl = sizeof(cache_line_t) / sizeof(uint64_t); 2092 | n /= per_cl; 2093 | 2094 | unsigned long* s = seed_rand(); 2095 | s[0] = 0xB9E4E2F1F1E2E3D5L; 2096 | s[1] = 0xF1E2E3D5B9E4E2F1L; 2097 | s[2] = 0x9B3A0FA212342345L; 2098 | 2099 | uint8_t* used = calloc(n * per_cl, sizeof(uint8_t)); 2100 | assert (used != NULL); 2101 | 2102 | size_t idx = 0; 2103 | size_t used_num = 0; 2104 | while (used_num < n - 1) 2105 | { 2106 | used[idx] = 1; 2107 | used_num++; 2108 | 2109 | size_t nxt; 2110 | do 2111 | { 2112 | nxt = (my_random(s, s+1, s+2) % n) * per_cl; 2113 | } 2114 | while (used[nxt]); 2115 | 
2116 | list[idx] = (uint64_t) (list + nxt); 2117 | idx = nxt; 2118 | } 2119 | list[idx] = (uint64_t) (list); /* close the loop! */ 2120 | 2121 | free(s); 2122 | free(used); 2123 | } 2124 | 2125 | void 2126 | cache_line_close(const uint32_t id, const char* name) 2127 | { 2128 | #if !defined(__tile__) 2129 | if (id == 0) 2130 | { 2131 | char keyF[100]; 2132 | sprintf(keyF, CACHE_LINE_MEM_FILE); 2133 | shm_unlink(keyF); 2134 | } 2135 | #else 2136 | tmc_cmem_close(); 2137 | #endif 2138 | } 2139 | 2140 | -------------------------------------------------------------------------------- /src/pfd.c: -------------------------------------------------------------------------------- 1 | /* 2 | * File: pfd.c 3 | * Author: Vasileios Trigonakis 4 | * Description: a fine-grained profiler based on rdtsc 5 | * pfd.c is part of ccbench 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Vasileios Trigonakis 10 | * 11 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | * 28 | */ 29 | 30 | #include "pfd.h" 31 | #include 32 | #include "atomic_ops.h" 33 | 34 | volatile ticks** pfd_store; 35 | volatile ticks* _pfd_s; 36 | volatile ticks pfd_correction; 37 | 38 | void 39 | pfd_store_init(uint32_t num_entries) 40 | { 41 | _pfd_s = (volatile ticks*) malloc(PFD_NUM_STORES * sizeof(ticks)); 42 | pfd_store = (volatile ticks**) malloc(PFD_NUM_STORES * sizeof(ticks*)); 43 | assert(_pfd_s != NULL && pfd_store != NULL); 44 | 45 | volatile uint32_t i; 46 | for (i = 0; i < PFD_NUM_STORES; i++) 47 | { 48 | pfd_store[i] = (ticks*) malloc(num_entries * sizeof(ticks)); 49 | assert(pfd_store[i] != NULL); 50 | PREFETCHW((void*) &pfd_store[i][0]); 51 | } 52 | 53 | int32_t tries = 10; 54 | uint32_t print_warning = 0; 55 | 56 | 57 | #if defined(XEON) || defined(OPTERON2) || defined(XEON2) || defined(DEFAULT) 58 | /* enforcing max freq if freq scaling is enabled */ 59 | volatile uint64_t speed; 60 | for (speed = 0; speed < 20e7; speed++) 61 | { 62 | asm volatile (""); 63 | } 64 | #endif /* XEON */ 65 | 66 | pfd_correction = 0; 67 | 68 | #define PFD_CORRECTION_CONF 3 69 | retry: 70 | for (i = 0; i < num_entries; i++) 71 | { 72 | PFDI(0); 73 | asm volatile (""); 74 | PFDO(0, i); 75 | } 76 | 77 | abs_deviation_t ad; 78 | get_abs_deviation(pfd_store[0], num_entries, &ad); 79 | double std_pp = 100 * (1 - (ad.avg - ad.std_dev) / ad.avg); 80 | 81 | if (std_pp > PFD_CORRECTION_CONF) 82 | { 83 | if (print_warning++ == 1) /* print warning if 2 failed attempts */ 84 | { 85 | printf("* warning: avg pfd correction is %.1f with std deviation: %.1f%%. 
Recalculating.\n", 86 | ad.avg, std_pp); 87 | } 88 | if (tries-- > 0) 89 | { 90 | goto retry; 91 | } 92 | else 93 | { 94 | printf("* warning: setting pfd correction manually\n"); 95 | #if defined(OPTERON) 96 | ad.avg = 64; 97 | #elif defined(OPTERON2) 98 | ad.avg = 68; 99 | #elif defined(XEON) || defined(XEON2) 100 | ad.avg = 20; 101 | #elif defined(NIAGARA) 102 | ad.avg = 76; 103 | #else 104 | printf("* warning: no default value for pfd correction is provided (fix in src/pfd.c)\n"); 105 | #endif 106 | } 107 | } 108 | 109 | pfd_correction = ad.avg; 110 | assert(pfd_correction > 0); 111 | 112 | printf("* set pfd correction: %llu (std deviation: %.1f%%)\n", (long long unsigned int) pfd_correction, std_pp); 113 | } 114 | 115 | static inline 116 | double absd(double x) 117 | { 118 | if (x >= 0) 119 | { 120 | return x; 121 | } 122 | else 123 | { 124 | return -x; 125 | } 126 | } 127 | 128 | 129 | #define llu long long unsigned int 130 | void 131 | print_abs_deviation(const abs_deviation_t* abs_dev) 132 | { 133 | printf("\n ---- statistics:\n"); 134 | PRINT(" avg : %-10.1f abs dev : %-10.1f std dev : %-10.1f num : %llu", 135 | abs_dev->avg, abs_dev->abs_dev, abs_dev->std_dev, (llu) abs_dev->num_vals); 136 | PRINT(" min : %-10.1f (element: %6llu) max : %-10.1f (element: %6llu)", abs_dev->min_val, 137 | (llu) abs_dev->min_val_idx, abs_dev->max_val, (llu) abs_dev->max_val_idx); 138 | double v10p = 100 * 139 | (1 - (abs_dev->num_vals - abs_dev->num_dev_10p) / (double) abs_dev->num_vals); 140 | double std_10pp = 100 * (1 - (abs_dev->avg_10p - abs_dev->std_dev_10p) / abs_dev->avg_10p); 141 | PRINT(" 0-10%% : %-10u ( %5.1f%% | avg: %6.1f | abs dev: %6.1f | std dev: %6.1f = %5.1f%% )", 142 | abs_dev->num_dev_10p, v10p, abs_dev->avg_10p, abs_dev->abs_dev_10p, abs_dev->std_dev_10p, std_10pp); 143 | double v25p = 100 144 | * (1 - (abs_dev->num_vals - abs_dev->num_dev_25p) / (double) abs_dev->num_vals); 145 | double std_25pp = 100 * (1 - (abs_dev->avg_25p - abs_dev->std_dev_25p) / 
abs_dev->avg_25p); 146 | PRINT(" 10-25%% : %-10u ( %5.1f%% | avg: %6.1f | abs dev: %6.1f | std dev: %6.1f = %5.1f%% )", 147 | abs_dev->num_dev_25p, v25p, abs_dev->avg_25p, abs_dev->abs_dev_25p, abs_dev->std_dev_25p, std_25pp); 148 | double v50p = 100 * 149 | (1 - (abs_dev->num_vals - abs_dev->num_dev_50p) / (double) abs_dev->num_vals); 150 | double std_50pp = 100 * (1 - (abs_dev->avg_50p - abs_dev->std_dev_50p) / abs_dev->avg_50p); 151 | PRINT(" 25-50%% : %-10u ( %5.1f%% | avg: %6.1f | abs dev: %6.1f | std dev: %6.1f = %5.1f%% )", 152 | abs_dev->num_dev_50p, v50p, abs_dev->avg_50p, abs_dev->abs_dev_50p, abs_dev->std_dev_50p, std_50pp); 153 | double v75p = 100 * 154 | (1 - (abs_dev->num_vals - abs_dev->num_dev_75p) / (double) abs_dev->num_vals); 155 | double std_75pp = 100 * (1 - (abs_dev->avg_75p - abs_dev->std_dev_75p) / abs_dev->avg_75p); 156 | PRINT(" 50-75%% : %-10u ( %5.1f%% | avg: %6.1f | abs dev: %6.1f | std dev: %6.1f = %5.1f%% )", 157 | abs_dev->num_dev_75p, v75p, abs_dev->avg_75p, abs_dev->abs_dev_75p, abs_dev->std_dev_75p, std_75pp); 158 | double vrest = 100 * 159 | (1 - (abs_dev->num_vals - abs_dev->num_dev_rst) / (double) abs_dev->num_vals); 160 | double std_rspp = 100 * (1 - (abs_dev->avg_rst - abs_dev->std_dev_rst) / abs_dev->avg_rst); 161 | PRINT("75-100%% : %-10u ( %5.1f%% | avg: %6.1f | abs dev: %6.1f | std dev: %6.1f = %5.1f%% )\n", 162 | abs_dev->num_dev_rst, vrest, abs_dev->avg_rst, abs_dev->abs_dev_rst, abs_dev->std_dev_rst, std_rspp); 163 | } 164 | 165 | #define PFD_VAL_UP_LIMIT 1500 /* do not consider values higher than this value */ 166 | 167 | void 168 | get_abs_deviation(volatile ticks* vals, const size_t num_vals, abs_deviation_t* abs_dev) 169 | { 170 | abs_dev->num_vals = num_vals; 171 | ticks sum_vals = 0; 172 | uint32_t i; 173 | for (i = 0; i < num_vals; i++) 174 | { 175 | if ((int64_t) vals[i] < 0 || vals[i] > PFD_VAL_UP_LIMIT) 176 | { 177 | vals[i] = 0; 178 | } 179 | sum_vals += vals[i]; 180 | } 181 | 182 | double avg = sum_vals / 
(double) num_vals; 183 | abs_dev->avg = avg; 184 | double max_val = 0; 185 | double min_val = DBL_MAX; 186 | uint64_t max_val_idx = 0, min_val_idx = 0; 187 | uint32_t num_dev_10p = 0; ticks sum_vals_10p = 0; double dev_10p = 0.1 * avg; 188 | uint32_t num_dev_25p = 0; ticks sum_vals_25p = 0; double dev_25p = 0.25 * avg; 189 | uint32_t num_dev_50p = 0; ticks sum_vals_50p = 0; double dev_50p = 0.5 * avg; 190 | uint32_t num_dev_75p = 0; ticks sum_vals_75p = 0; double dev_75p = 0.75 * avg; 191 | uint32_t num_dev_rst = 0; ticks sum_vals_rst = 0; 192 | 193 | double sum_adev = 0; /* abs deviation */ 194 | double sum_stdev = 0; /* std deviation */ 195 | for (i = 0; i < num_vals; i++) 196 | { 197 | double diff = vals[i] - avg; 198 | double ad = absd(diff); 199 | if (vals[i] > max_val) 200 | { 201 | max_val = vals[i]; 202 | max_val_idx = i; 203 | } 204 | else if (vals[i] < min_val) 205 | { 206 | min_val = vals[i]; 207 | min_val_idx = i; 208 | } 209 | 210 | if (ad <= dev_10p) 211 | { 212 | num_dev_10p++; 213 | sum_vals_10p += vals[i]; 214 | } 215 | else if (ad <= dev_25p) 216 | { 217 | num_dev_25p++; 218 | sum_vals_25p += vals[i]; 219 | } 220 | else if (ad <= dev_50p) 221 | { 222 | num_dev_50p++; 223 | sum_vals_50p += vals[i]; 224 | } 225 | else if (ad <= dev_75p) 226 | { 227 | num_dev_75p++; 228 | sum_vals_75p += vals[i]; 229 | } 230 | else 231 | { 232 | num_dev_rst++; 233 | sum_vals_rst += vals[i]; 234 | } 235 | 236 | sum_adev += ad; 237 | sum_stdev += ad*ad; 238 | } 239 | abs_dev->min_val = min_val; 240 | abs_dev->min_val_idx = min_val_idx; 241 | abs_dev->max_val = max_val; 242 | abs_dev->max_val_idx = max_val_idx; 243 | abs_dev->num_dev_10p = num_dev_10p; 244 | abs_dev->num_dev_25p = num_dev_25p; 245 | abs_dev->num_dev_50p = num_dev_50p; 246 | abs_dev->num_dev_75p = num_dev_75p; 247 | abs_dev->num_dev_rst = num_dev_rst; 248 | 249 | abs_dev->avg_10p = sum_vals_10p / (double) num_dev_10p; 250 | abs_dev->avg_25p = sum_vals_25p / (double) num_dev_25p; 251 | abs_dev->avg_50p = 
sum_vals_50p / (double) num_dev_50p; 252 | abs_dev->avg_75p = sum_vals_75p / (double) num_dev_75p; 253 | abs_dev->avg_rst = sum_vals_rst / (double) num_dev_rst; 254 | 255 | double sum_adev_10p = 0, sum_adev_25p = 0, sum_adev_50p = 0, sum_adev_75p = 0, sum_adev_rst = 0; 256 | double sum_stdev_10p = 0, sum_stdev_25p = 0, sum_stdev_50p = 0, sum_stdev_75p = 0, sum_stdev_rst = 0; 257 | 258 | /* pass again to calculate the deviations for the 10/25..p */ 259 | for (i = 0; i < num_vals; i++) 260 | { 261 | double diff = vals[i] - avg; 262 | double ad = absd(diff); 263 | if (ad <= dev_10p) 264 | { 265 | double diff = vals[i] - abs_dev->avg_10p; 266 | double ad = absd(diff); 267 | sum_adev_10p += ad; 268 | sum_stdev_10p += (ad*ad); 269 | } 270 | else if (ad <= dev_25p) 271 | { 272 | double diff = vals[i] - abs_dev->avg_25p; 273 | double ad = absd(diff); 274 | sum_adev_25p += ad; 275 | sum_stdev_25p += (ad*ad); 276 | } 277 | else if (ad <= dev_50p) 278 | { 279 | double diff = vals[i] - abs_dev->avg_50p; 280 | double ad = absd(diff); 281 | sum_adev_50p += ad; 282 | sum_stdev_50p += (ad*ad); 283 | } 284 | else if (ad <= dev_75p) 285 | { 286 | double diff = vals[i] - abs_dev->avg_75p; 287 | double ad = absd(diff); 288 | sum_adev_75p += ad; 289 | sum_stdev_75p += (ad*ad); 290 | } 291 | else 292 | { 293 | double diff = vals[i] - abs_dev->avg_rst; 294 | double ad = absd(diff); 295 | sum_adev_rst += ad; 296 | sum_stdev_rst += (ad*ad); 297 | } 298 | } 299 | 300 | abs_dev->abs_dev_10p = sum_adev_10p / num_dev_10p; 301 | abs_dev->abs_dev_25p = sum_adev_25p / num_dev_25p; 302 | abs_dev->abs_dev_50p = sum_adev_50p / num_dev_50p; 303 | abs_dev->abs_dev_75p = sum_adev_75p / num_dev_75p; 304 | abs_dev->abs_dev_rst = sum_adev_rst / num_dev_rst; 305 | 306 | abs_dev->std_dev_10p = sqrt(sum_stdev_10p / num_dev_10p); 307 | abs_dev->std_dev_25p = sqrt(sum_stdev_25p / num_dev_25p); 308 | abs_dev->std_dev_50p = sqrt(sum_stdev_50p / num_dev_50p); 309 | abs_dev->std_dev_75p = sqrt(sum_stdev_75p / 
num_dev_75p); 310 | abs_dev->std_dev_rst = sqrt(sum_stdev_rst / num_dev_rst); 311 | 312 | double adev = sum_adev / num_vals; 313 | abs_dev->abs_dev = adev; 314 | double stdev = sqrt(sum_stdev / num_vals); 315 | abs_dev->std_dev = stdev; 316 | } 317 | --------------------------------------------------------------------------------