├── .gitignore ├── INSTALL ├── LICENSE ├── Makefile ├── README ├── README.md ├── include ├── atomic_ops.h ├── barrier.h ├── ccbench.h ├── common.h └── pfd.h ├── scripts ├── events_all ├── run_niagara.sh ├── run_opteron.sh ├── run_tilera.sh ├── run_with_confidence.sh └── run_xeon.sh └── src ├── .gitignore ├── barrier.c ├── ccbench.c └── pfd.c /.gitignore: -------------------------------------------------------------------------------- 1 | *.a 2 | *.o 3 | *~ 4 | /#*compilation*# 5 | /#INSTALL# 6 | /#README# 7 | /#d# 8 | /#free.sh# 9 | /#moesi.c# 10 | /#pfd.h# 11 | /#run_opteron.sh# 12 | /*compilation* 13 | /*eshell* 14 | /.#INSTALL 15 | /.#README 16 | /.#README.md 17 | /.#d 18 | /.#moesi.c 19 | /.#moesi.h 20 | /.#pfd.h 21 | /.#run_opteron.sh 22 | /.tmp 23 | /any.sh 24 | /cat_proc.sh 25 | /ccbench 26 | /ccbench.S 27 | /conf.run.tmp 28 | /core 29 | /d 30 | /dd 31 | /free.sh 32 | /help 33 | /moesi 34 | /moesi_old 35 | /moesi_sosp 36 | /moesi_th.c 37 | /msr 38 | /ps_a.sh 39 | /run 40 | /run_with_confidence.sh 41 | /s 42 | /ss 43 | /sss 44 | /xeon_cores 45 | /xeon_dist.out 46 | cscope.* 47 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | Steps to install ccbench: 2 | ------------------------ 3 | 4 | 1. Fix the Makefile (not always necessary) 5 | 6 | The Makefile sets some parameters based on which is the host you are running on. 7 | The parameters are: 8 | * PLATFORM : the platform name used to set platform specific parameters in the code 9 | * CC : the compiler to be used 10 | * CFLAGS : the compilation flags 11 | * LDFLAGS : the libraries to link with 12 | * VER_FLAGS : version flags, such as the platform name 13 | 14 | If a configuration is not specified, the DEFAULT configuration is used (it should work for most x86 platforms). 15 | 16 | 2. 
Compile for the target platform 17 | 18 | In the base folder of the project execute: 19 | make 20 | 21 | 3. ./ccbench -h 22 | 23 | You will get all the details you need in order to use the application. 24 | 25 | 26 | Tested platforms: 27 | ----------------- 28 | 29 | ccbench has been tested on the following platforms: 30 | * UMA and NUMA x86_64 31 | * SPARC (UltraSPARC T2, UltraSPARC T4-4) 32 | * Tilera (Tile-GX36, TILEPro64) 33 | 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Vasileios Trigonakis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SRC = src 2 | INCLUDE = include 3 | 4 | CFLAGS = -O3 -Wall 5 | LDFLAGS = -lm -lrt 6 | VER_FLAGS = -D_GNU_SOURCE 7 | 8 | ifeq ($(VERSION),DEBUG) 9 | CFLAGS = -O0 -ggdb -Wall -g -fno-inline 10 | endif 11 | 12 | UNAME := $(shell uname -n) 13 | 14 | ifeq ($(UNAME), lpd48core) 15 | PLATFORM = OPTERON 16 | CC = gcc 17 | PLATFORM_NUMA = 1 18 | endif 19 | 20 | ifeq ($(UNAME), diassrv8) 21 | PLATFORM = XEON 22 | CC = gcc 23 | PLATFORM_NUMA = 1 24 | endif 25 | 26 | ifeq ($(UNAME), maglite) 27 | PLATFORM = NIAGARA 28 | CC = /opt/csw/bin/gcc 29 | CFLAGS += -m64 -mcpu=v9 -mtune=v9 30 | endif 31 | 32 | ifeq ($(UNAME), parsasrv1.epfl.ch) 33 | PLATFORM = TILERA 34 | CC = tile-gcc 35 | LDFLAGS += -ltmc 36 | endif 37 | 38 | ifeq ($(UNAME), diascld19) 39 | PLATFORM = XEON2 40 | CC = gcc 41 | endif 42 | 43 | ifeq ($(UNAME), diascld9) 44 | PLATFORM = OPTERON2 45 | CC = gcc 46 | endif 47 | 48 | ifeq ($(PLATFORM), ) 49 | PLATFORM = DEFAULT 50 | CC = gcc 51 | endif 52 | 53 | VER_FLAGS += -D$(PLATFORM) 54 | 55 | ifeq ($(PLATFORM_NUMA),1) #give PLATFORM_NUMA=1 for NUMA 56 | LDFLAGS += -lnuma 57 | VER_FLAGS += -DPLATFORM_NUMA 58 | endif 59 | 60 | default: ccbench 61 | 62 | all: ccbench 63 | 64 | ccbench: ccbench.o $(SRC)/pfd.c $(SRC)/barrier.c $(INCLUDE)/common.h $(INCLUDE)/ccbench.h $(INCLUDE)/pfd.h $(INCLUDE)/barrier.h barrier.o pfd.o 65 | $(CC) $(VER_FLAGS) -o ccbench ccbench.o pfd.o barrier.o $(CFLAGS) $(LDFLAGS) -I./$(INCLUDE) 66 | 67 | ccbench.o: $(SRC)/ccbench.c $(INCLUDE)/ccbench.h 68 | $(CC) $(VER_FLAGS) -c $(SRC)/ccbench.c $(CFLAGS) -I./$(INCLUDE) 69 | 70 | pfd.o: $(SRC)/pfd.c $(INCLUDE)/pfd.h 71 | $(CC) $(VER_FLAGS) -c $(SRC)/pfd.c $(CFLAGS) -I./$(INCLUDE) 72 | 73 | barrier.o: $(SRC)/barrier.c $(INCLUDE)/barrier.h 74 | $(CC) $(VER_FLAGS) -c $(SRC)/barrier.c $(CFLAGS) -I./$(INCLUDE) 75 | 76 | clean: 77 
| rm -f *.o ccbench 78 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | ccbench is a tool for measuring the cache-coherence latencies of a processor, i.e., the latencies of loads, stores, compare-and-swap (CAS), fetch-and-increment (FAI), test-and-set (TAS), and swap (SWAP). The latencies that ccbench measures can be used to understand and predict the behavior of sharing and synchronization on the underlying hardware platform. 2 | 3 | * Website : http://lpd.epfl.ch/site/ccbench 4 | * Author : Vasileios Trigonakis 5 | * Related Publications: ccbench is a part of the SSYNC synchronization suite 6 | (http://lpd.epfl.ch/site/ssync), developed for: 7 | Everything You Always Wanted to Know about Synchronization but Were Afraid to Ask, 8 | Tudor David, Rachid Guerraoui, Vasileios Trigonakis (alphabetical order), 9 | SOSP '13 - Proceeding of the 24th ACM Symposium on Operating Systems Principles 10 | 11 | 12 | Installation: 13 | ------------- 14 | 15 | Please refer to the INSTALL file. 16 | 17 | 18 | Using ccbench: 19 | -------------- 20 | 21 | Execute: 22 | ./ccbench -h 23 | to get the parameters and the supported events of ccbench 24 | 25 | 26 | Details: 27 | -------- 28 | ccbench brings a single cache line L in the desired MESI state and position in the processor and then 29 | performs that target operation on L. In more details, ccbench takes the following steps: 30 | 1. It uses one (or more) cores to bring L in the desired state and position, 31 | e.g., in a Modified state in the local caches of core 0 in node 0. 32 | 2. It then uses another core in order to perform the target operation, e.g., load from a 33 | modified state that is on the local caches of a core that is on the same node. 34 | 35 | 36 | 37 | Limitations: 38 | ------------ 39 | 40 | Measuring latencies at this low level is not easy. 
Most of the events work as intended on all platforms. 41 | However, there are some subtle details that one should be aware of in order to "successfully" use 42 | ccbench: 43 | * The memory fences to be used are related to the memory consistency model of the underlying 44 | platform. For instance, on an AMD Opteron Magny-Cours we can measure both loads and stores 45 | without using any fences (ccbench -e0). Contrarily, on an Intel Xeon Westmere-EX, we can 46 | measure the loads with a load fence, but a store needs a full fence (so, ccbench -e8). 47 | * The stride parameter is used to try to fool the hardware prefetchers. This is also a 48 | hardware dependent parameter. 49 | * There are certain cases where you might need to compile ccbench with -O0 flag instead of 50 | the default -O3 to be able to get the results. Known cases: 51 | * on the Tile-GX36, you probably need to compile with -O0 to get sensible number 52 | for the atomic ops 53 | * on UltraSPARC T2, you probably need to compile with -O0 for all operations 54 | except the atomic ops 55 | 56 | 57 | Interpreting the results: 58 | ------------------------- 59 | 60 | The comments prefixed with "#######" explain the results. 61 | 62 | 63 | ####### settings: 64 | 65 | test: LOAD_FROM_MODIFIED / #cores: 2 / #repetitions: 1000 / stride: 4096 (256 kiB) / fence: load/full 66 | core1: 1 / core2: 2 67 | 68 | ####### warnings regarding the profiler correction. If the calculation fails for 10 times (i.e, the 69 | ####### correction calculation does not have a low std deviation, the correction is set manually 70 | ####### to a give (in src/pfd.c) platform-specific value. If the default value is not set, the 71 | ####### avg corrections are still used (this works ok in my experience) 72 | 73 | * warning: avg pfd correction is 20.2 with std deviation: 16.3%. Recalculating. 
74 | * warning: setting pfd correction manually 75 | -- pfd correction: 20 (std deviation: 22.2%) 76 | * warning: avg pfd correction is 20.3 with std deviation: 17.0%. Recalculating. 77 | * warning: setting pfd correction manually 78 | -- pfd correction: 20 (std deviation: 22.2%) 79 | 80 | ####### results 81 | 82 | [00] *** Core 0 ********************************************************************************** 83 | 84 | ---- statistics: 85 | 86 | ####### global avg and deviations 87 | 88 | [00] avg : 111.5 abs dev : 2.5 std dev : 4.5 num : 1000 89 | [00] min : 32.0 (element: 779) max : 136.0 (element: 415) 90 | 91 | ####### clustering of values around the global avg. This used as an easy way to remove the outliers 92 | ####### columns: % around the avg / num of sample / % of the total num of sample / avg of the cluster / 93 | ####### absolute deviation of the cluster / standard deviation of the cluster 94 | 95 | [00] 0-10% : 987 ( 98.7% | avg: 111.5 | abs dev: 2.3 | std dev: 3.0 = 2.7%) 96 | [00] 10-25% : 11 ( 1.1% | avg: 126.2 | abs dev: 3.5 | std dev: 4.2 = 3.3%) 97 | [00] 25-50% : 1 ( 0.1% | avg: 65.0 | abs dev: 0.0 | std dev: 0.0 = 0.0%) 98 | [00] 50-75% : 1 ( 0.1% | avg: 32.0 | abs dev: 0.0 | std dev: 0.0 = 0.0%) 99 | [00] 75-100% : 0 ( 0.0% | avg: -nan | abs dev: -nan | std dev: -nan = -nan%) 100 | 101 | [01] *** Core 1 ********************************************************************************** 102 | 103 | ---- statistics: 104 | [01] avg : 112.3 abs dev : 2.5 std dev : 5.4 num : 1000 105 | [01] min : 10.0 (element: 902) max : 133.0 (element: 404) 106 | [01] 0-10% : 989 ( 98.9% | avg: 112.4 | abs dev: 2.2 | std dev: 2.9 = 2.6%) 107 | [01] 10-25% : 9 ( 0.9% | avg: 126.0 | abs dev: 1.8 | std dev: 2.7 = 2.1%) 108 | [01] 25-50% : 0 ( 0.0% | avg: -nan | abs dev: -nan | std dev: -nan = -nan%) 109 | [01] 50-75% : 0 ( 0.0% | avg: -nan | abs dev: -nan | std dev: -nan = -nan%) 110 | [01] 75-100% : 2 ( 0.2% | avg: 13.5 | abs dev: 3.5 | std dev: 3.5 = 25.9%) 111 
| 112 | ####### The meaning of the results 113 | 114 | [00] ** Results from Core 0 : store to owned mine (if owned state supported, else exclusive) 115 | [00] ** Results from Core 1 : load from modified (makes it owned, if owned state supported) 116 | 117 | ####### The final value in the cache line that was used / the sum of all loads on this core 118 | ####### These values can be used for ensuring the correctness of some test (e.g., FAI) 119 | 120 | [00] value of cl is 0 / sum is 0 121 | [01] value of cl is 0 / sum is 0 122 | 123 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ccbench 2 | ======= 3 | 4 | ccbench is a tool for measuring the cache-coherence latencies of a processor, i.e., the latencies of `loads`, `stores`, `compare-and-swap (CAS)`, `fetch-and-increment (FAI)`, `test-and-set (TAS)`, and `swap (SWAP)`. The latencies that ccbench measures can be used to understand and predict the behavior of sharing and synchronization on the underlying hardware platform. 5 | 6 | * Website : http://lpd.epfl.ch/site/ccbench 7 | * Author : Vasileios Trigonakis 8 | * Related Publications: ccbench is a part of the SSYNC synchronization suite 9 | (http://lpd.epfl.ch/site/ssync): 10 | Everything You Always Wanted to Know about Synchronization but Were Afraid to Ask, 11 | Tudor David, Rachid Guerraoui, Vasileios Trigonakis (alphabetical order), 12 | SOSP '13 - Proceeding of the 24th ACM Symposium on Operating Systems Principles 13 | 14 | 15 | Installation: 16 | ------------- 17 | 18 | Please refer to the `INSTALL` file. 
19 | 20 | 21 | Using ccbench: 22 | -------------- 23 | 24 | Execute: 25 | `./ccbench -h` 26 | to get the parameters and the supported events of ccbench 27 | 28 | 29 | Details: 30 | -------- 31 | ccbench brings a single cache line L in the desired MESI state and position in the processor and then 32 | performs that target operation on L. In more details, ccbench takes the following steps: 33 | 1 It uses one (or more) cores to bring L in the desired state and position, 34 | e.g., in a Modified state in the local caches of core 0 in node 0. 35 | 2 It then uses another core in order to perform the target operation, e.g., load from a 36 | modified state that is on the local caches of a core that is on the same node. 37 | 38 | 39 | 40 | Limitations: 41 | ------------ 42 | 43 | Measuring latencies at this low level is not easy. Most of the events work as intended on all platforms. 44 | However, there are some subtle details that one should be aware of in order to "successfully" use 45 | ccbench: 46 | * The memory fences to be used are related to the memory consistency model of the underlying platform. For instance, on an `AMD Opteron Magny-Cours` we can measure both `loads` and `stores` without using any fences (`ccbench -e0`). Contrarily, on an `Intel Xeon Westmere-EX`, we can measure the loads with a `load fence`, but a store needs a full fence (so, `ccbench -e8`). 47 | * The stride parameter is used to try to fool the hardware prefetchers. This is also a hardware dependent parameter. 48 | * There are certain cases where you might need to compile ccbench with `-O0` flag instead of the default `-O3` to be able to get the results. 
Known cases: 49 | * on the Tile-GX36, you probably need to compile with `-O0` to get sensible number for the atomic ops 50 | * on UltraSPARC T2, you probably need to compile with `-O0` for all operations 51 | except the atomic ops 52 | 53 | 54 | Interpreting the results: 55 | ------------------------- 56 | 57 | The comments prefixed with "#>>" explain the results. 58 | 59 |
 60 | #>> settings:
 61 | test: LOAD_FROM_MODIFIED / #cores: 2 / #reps: 1000 / stride: 4096 / fence: load/full
 62 | core1:   1 / core2:   2
 63 | 
 64 | #>> warnings regarding the profiler correction. If the calculation fails for 10 times 
 65 | #>> (i.e., the correction calculation does not have a low std deviation), the correction 
 66 | #>> is manually set to a given (in src/pfd.c) platform-specific value. If the default 
 67 | #>> value is not set, the avg corrections are still used. 
 68 | #>> (This approach works OK in my experience.)
 69 | 
 70 | * warning: avg pfd correction is 20.2 with std deviation: 16.3%. Recalculating.
 71 | * warning: setting pfd correction manually
 72 |  -- pfd correction: 20 (std deviation: 22.2%)
 73 | * warning: avg pfd correction is 20.3 with std deviation: 17.0%. Recalculating.
 74 | * warning: setting pfd correction manually
 75 |  -- pfd correction: 20 (std deviation: 22.2%)
 76 | 
 77 | #>> results
 78 | 
 79 | [00]  *** Core  0 ***************************************************************
 80 | 
 81 |  ---- statistics:
 82 | 
 83 | #>> global avg and deviations
 84 | 
 85 | [00]  avg : 111.5      abs dev : 2.5        std dev : 4.5        num     : 1000
 86 | [00]  min : 32.0       (element:    779)    max     : 136.0      (element:    415)
 87 | 
 88 | #>> clustering of values around the global avg. This approach is used as an easy way 
 89 | #>> of removing outlier measurements. The columns represent:
 90 | #>> % group / num of samples / % of the total num of samples / avg of the cluster /
 91 | #>> absolute deviation of the cluster / standard deviation of the cluster
 92 | 
 93 | [00]   0-10% : 987 ( 98.7% | avg: 111.5 | abs dev:  2.3 | std dev:  3.0 =   2.7%)
 94 | [00]  10-25% : 11  (  1.1% | avg: 126.2 | abs dev:  3.5 | std dev:  4.2 =   3.3%)
 95 | [00]  25-50% : 1   (  0.1% | avg:  65.0 | abs dev:  0.0 | std dev:  0.0 =   0.0%)
 96 | [00]  50-75% : 1   (  0.1% | avg:  32.0 | abs dev:  0.0 | std dev:  0.0 =   0.0%)
 97 | [00] 75-100% : 0   (  0.0% | avg:  -nan | abs dev: -nan | std dev: -nan =  -nan%)
 98 | 
 99 | [01]  *** Core  1 ***************************************************************
100 | 
101 |  ---- statistics:
102 | [01]     avg : 112.3 abs dev : 2.5        std dev : 5.4        num     : 1000
103 | [01]     min : 10.0  (element:    902)    max     : 133.0      (element:    404)
104 | [01]   0-10% : 989 ( 98.9% | avg: 112.4 | abs dev:  2.2 | std dev:  2.9 =   2.6%)
105 | [01]  10-25% : 9   (  0.9% | avg: 126.0 | abs dev:  1.8 | std dev:  2.7 =   2.1%)
106 | [01]  25-50% : 0   (  0.0% | avg:  -nan | abs dev: -nan | std dev: -nan =  -nan%)
107 | [01]  50-75% : 0   (  0.0% | avg:  -nan | abs dev: -nan | std dev: -nan =  -nan%)
108 | [01] 75-100% : 2   (  0.2% | avg:  13.5 | abs dev:  3.5 | std dev:  3.5 =  25.9%)
109 | 
110 | #>> The meaning of the results
111 | 
112 | [00] Results Core 0 : store to owned mine (if owned state supported, else exclusive)
113 | [00] Results Core 1 : load from modified (makes it owned, if owned state supported)
114 | 
115 | #>> The final val in the cache line that was used / the sum of all loads on this core
116 | #>> These values can be used for ensuring the correctness of some test (e.g., FAI)
117 | 
118 | [00]  value of cl is 0    / sum is 0
119 | [01]  value of cl is 0    / sum is 0
120 | 
121 | -------------------------------------------------------------------------------- /include/atomic_ops.h: -------------------------------------------------------------------------------- 1 | /* 2 | * File: atomic_ops.h 3 | * Author: Tudor David 4 | * Description: cross-platform interface to atomic operations 5 | * atomic_ops is part of SSYNC 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Tudor David 10 | * 11 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | * 28 | */ 29 | 30 | #ifndef _ATOMIC_OPS_H_INCLUDED_ 31 | #define _ATOMIC_OPS_H_INCLUDED_ 32 | 33 | #include 34 | 35 | #ifdef __sparc__ 36 | /* 37 | * sparc code 38 | */ 39 | 40 | # include 41 | 42 | //test-and-set uint8_t 43 | static inline uint8_t tas_uint8(volatile uint8_t *addr) { 44 | uint8_t oldval; 45 | __asm__ __volatile__("ldstub %1,%0" 46 | : "=r"(oldval), "=m"(*addr) 47 | : "m"(*addr) : "memory"); 48 | return oldval; 49 | } 50 | 51 | //Compare-and-swap 52 | # define CAS_PTR(a,b,c) atomic_cas_ptr(a,b,c) 53 | # define CAS_U8(a,b,c) atomic_cas_8(a,b,c) 54 | # define CAS_U16(a,b,c) atomic_cas_16(a,b,c) 55 | # define CAS_U32(a,b,c) atomic_cas_32(a,b,c) 56 | # define CAS_U64(a,b,c) atomic_cas_64(a,b,c) 57 | //Swap 58 | # define SWAP_PTR(a,b) atomic_swap_ptr(a,b) 59 | # define SWAP_U8(a,b) atomic_swap_8(a,b) 60 | # define SWAP_U16(a,b) atomic_swap_16(a,b) 61 | # define SWAP_U32(a,b) atomic_swap_32(a,b) 62 | # define SWAP_U64(a,b) atomic_swap_64(a,b) 63 | //Fetch-and-increment 64 | # define FAI_U8(a) (atomic_inc_8_nv(a)-1) 65 | # define FAI_U16(a) (atomic_inc_16_nv(a)-1) 66 | # define FAI_U32(a) (atomic_inc_32_nv(a)-1) 67 | # define FAI_U64(a) (atomic_inc_64_nv(a)-1) 68 | //Fetch-and-decrement 69 | # define FAD_U8(a) (atomic_dec_8_nv(a,)+1) 70 | # define FAD_U16(a) (atomic_dec_16_nv(a)+1) 71 | # define FAD_U32(a) (atomic_dec_32_nv(a)+1) 72 | # define FAD_U64(a) (atomic_dec_64_nv(a)+1) 73 | //Increment-and-fetch 74 | # define IAF_U8(a) atomic_inc_8_nv(a) 75 | # define IAF_U16(a) atomic_inc_16_nv(a) 76 | # define IAF_U32(a) atomic_inc_32_nv(a) 77 | # define IAF_U64(a) atomic_inc_64_nv(a) 78 | //Decrement-and-fetch 79 | # define DAF_U8(a) atomic_dec_8_nv(a) 80 | # define DAF_U16(a) atomic_dec_16_nv(a) 81 | # define DAF_U32(a) atomic_dec_32_nv(a) 82 | # define DAF_U64(a) atomic_dec_64_nv(a) 83 | //Test-and-set 84 | # define TAS_U8(a) tas_uint8(a) 85 | //Memory barrier 86 | # define MEM_BARRIER asm volatile("membar #LoadLoad | #LoadStore | #StoreLoad | 
#StoreStore"); 87 | # define _mm_lfence() asm volatile("membar #LoadLoad | #LoadStore"); 88 | # define _mm_sfence() asm volatile("membar #StoreLoad | #StoreStore"); 89 | # define _mm_mfence() asm volatile("membar #LoadLoad | #LoadStore | #StoreLoad | #StoreStore"); 90 | 91 | # define _mm_clflush(x) asm volatile("nop"); 92 | //end of sparc code 93 | #elif defined(__tile__) 94 | /* 95 | * Tilera code 96 | */ 97 | # include 98 | # include 99 | //atomic operations interface 100 | //Compare-and-swap 101 | # define CAS_PTR(a,b,c) arch_atomic_val_compare_and_exchange(a,b,c) 102 | # define CAS_U8(a,b,c) arch_atomic_val_compare_and_exchange(a,b,c) 103 | # define CAS_U16(a,b,c) arch_atomic_val_compare_and_exchange(a,b,c) 104 | # define CAS_U32(a,b,c) arch_atomic_val_compare_and_exchange(a,b,c) 105 | # define CAS_U64(a,b,c) arch_atomic_val_compare_and_exchange(a,b,c) 106 | //Swap 107 | # define SWAP_PTR(a,b) arch_atomic_exchange(a,b) 108 | # define SWAP_U8(a,b) arch_atomic_exchange(a,b) 109 | # define SWAP_U16(a,b) arch_atomic_exchange(a,b) 110 | # define SWAP_U32(a,b) arch_atomic_exchange(a,b) 111 | # define SWAP_U64(a,b) arch_atomic_exchange(a,b) 112 | //Fetch-and-increment 113 | # define FAI_U8(a) arch_atomic_increment(a) 114 | # define FAI_U16(a) arch_atomic_increment(a) 115 | # define FAI_U32(a) arch_atomic_increment(a) 116 | # define FAI_U64(a) arch_atomic_increment(a) 117 | //Fetch-and-decrement 118 | # define FAD_U8(a) arch_atomic_decrement(a) 119 | # define FAD_U16(a) arch_atomic_decrement(a) 120 | # define FAD_U32(a) arch_atomic_decrement(a) 121 | # define FAD_U64(a) arch_atomic_decrement(a) 122 | //Increment-and-fetch 123 | # define IAF_U8(a) (arch_atomic_increment(a)+1) 124 | # define IAF_U16(a) (arch_atomic_increment(a)+1) 125 | # define IAF_U32(a) (arch_atomic_increment(a)+1) 126 | # define IAF_U64(a) (arch_atomic_increment(a)+1) 127 | //Decrement-and-fetch 128 | # define DAF_U8(a) (arch_atomic_decrement(a)-1) 129 | # define DAF_U16(a) 
(arch_atomic_decrement(a)-1) 130 | # define DAF_U32(a) (arch_atomic_decrement(a)-1) 131 | # define DAF_U64(a) (arch_atomic_decrement(a)-1) 132 | //Test-and-set 133 | # define TAS_U8(a) arch_atomic_val_compare_and_exchange(a,0,0xff) 134 | //Memory barrier 135 | # define MEM_BARRIER arch_atomic_full_barrier() 136 | 137 | # define _mm_lfence() arch_atomic_read_barrier() 138 | # define _mm_sfence() arch_atomic_write_barrier() 139 | # define _mm_mfence() arch_atomic_full_barrier() 140 | 141 | # define _mm_clflush(x) tmc_mem_finv_no_fence((const void*) x, 64); 142 | 143 | //Relax CPU 144 | //define PAUSE cycle_relax() 145 | 146 | //end of tilera code 147 | #else 148 | 149 | /* 150 | * x86 code 151 | */ 152 | 153 | # if defined(__SSE__) 154 | # include 155 | # else 156 | # define _mm_lfence() asm volatile ("lfence" : :) 157 | # define _mm_sfence() asm volatile ("sfence" : :) 158 | # define _mm_mfence() asm volatile ("mfence" : :) 159 | # define _mm_pause() asm volatile ("rep; nop" : : ) 160 | # define _mm_clflush(__A) asm volatile("clflush %0" : "+m" (*(volatile char*)__A)) 161 | # endif 162 | 163 | //Swap pointers 164 | static inline void* swap_pointer(volatile void* ptr, void *x) { 165 | # ifdef __i386__ 166 | __asm__ __volatile__("xchgl %0,%1" 167 | :"=r" ((unsigned) x) 168 | :"m" (*(volatile unsigned *)ptr), "0" (x) 169 | :"memory"); 170 | 171 | return x; 172 | # elif defined(__x86_64__) 173 | __asm__ __volatile__("xchgq %0,%1" 174 | :"=r" ((unsigned long long) x) 175 | :"m" (*(volatile long long *)ptr), "0" ((unsigned long long) x) 176 | :"memory"); 177 | 178 | return x; 179 | # endif 180 | } 181 | 182 | //Swap uint64_t 183 | static inline uint64_t swap_uint64(volatile uint64_t* target, uint64_t x) { 184 | __asm__ __volatile__("xchgq %0,%1" 185 | :"=r" ((uint64_t) x) 186 | :"m" (*(volatile uint64_t *)target), "0" ((uint64_t) x) 187 | :"memory"); 188 | 189 | return x; 190 | } 191 | 192 | //Swap uint32_t 193 | static inline uint32_t swap_uint32(volatile uint32_t* 
target, uint32_t x) { 194 | __asm__ __volatile__("xchgl %0,%1" 195 | :"=r" ((uint32_t) x) 196 | :"m" (*(volatile uint32_t *)target), "0" ((uint32_t) x) 197 | :"memory"); 198 | 199 | return x; 200 | } 201 | 202 | //Swap uint16_t 203 | static inline uint16_t swap_uint16(volatile uint16_t* target, uint16_t x) { 204 | __asm__ __volatile__("xchgw %0,%1" 205 | :"=r" ((uint16_t) x) 206 | :"m" (*(volatile uint16_t *)target), "0" ((uint16_t) x) 207 | :"memory"); 208 | 209 | return x; 210 | } 211 | 212 | //Swap uint8_t 213 | static inline uint8_t swap_uint8(volatile uint8_t* target, uint8_t x) { 214 | __asm__ __volatile__("xchgb %0,%1" 215 | :"=r" ((uint8_t) x) 216 | :"m" (*(volatile uint8_t *)target), "0" ((uint8_t) x) 217 | :"memory"); 218 | 219 | return x; 220 | } 221 | 222 | //test-and-set uint8_t 223 | static inline uint8_t tas_uint8(volatile uint8_t *addr) { 224 | uint8_t oldval; 225 | __asm__ __volatile__("xchgb %0,%1" 226 | : "=q"(oldval), "=m"(*addr) 227 | : "0"((unsigned char) 0xff), "m"(*addr) : "memory"); 228 | return (uint8_t) oldval; 229 | } 230 | 231 | //atomic operations interface 232 | //Compare-and-swap 233 | # define CAS_PTR(a,b,c) __sync_val_compare_and_swap(a,b,c) 234 | # define CAS_U8(a,b,c) __sync_val_compare_and_swap(a,b,c) 235 | # define CAS_U16(a,b,c) __sync_val_compare_and_swap(a,b,c) 236 | # define CAS_U32(a,b,c) __sync_val_compare_and_swap(a,b,c) 237 | # define CAS_U64(a,b,c) __sync_val_compare_and_swap(a,b,c) 238 | //Swap 239 | # define SWAP_PTR(a,b) swap_pointer(a,b) 240 | # define SWAP_U8(a,b) swap_uint8(a,b) 241 | # define SWAP_U16(a,b) swap_uint16(a,b) 242 | # define SWAP_U32(a,b) swap_uint32(a,b) 243 | # define SWAP_U64(a,b) swap_uint64(a,b) 244 | //Fetch-and-increment 245 | # define FAI_U8(a) __sync_fetch_and_add(a,1) 246 | # define FAI_U16(a) __sync_fetch_and_add(a,1) 247 | # define FAI_U32(a) __sync_fetch_and_add(a,1) 248 | # define FAI_U64(a) __sync_fetch_and_add(a,1) 249 | //Fetch-and-decrement 250 | # define FAD_U8(a) 
__sync_fetch_and_sub(a,1) 251 | # define FAD_U16(a) __sync_fetch_and_sub(a,1) 252 | # define FAD_U32(a) __sync_fetch_and_sub(a,1) 253 | # define FAD_U64(a) __sync_fetch_and_sub(a,1) 254 | //Increment-and-fetch 255 | # define IAF_U8(a) __sync_add_and_fetch(a,1) 256 | # define IAF_U16(a) __sync_add_and_fetch(a,1) 257 | # define IAF_U32(a) __sync_add_and_fetch(a,1) 258 | # define IAF_U64(a) __sync_add_and_fetch(a,1) 259 | //Decrement-and-fetch 260 | # define DAF_U8(a) __sync_sub_and_fetch(a,1) 261 | # define DAF_U16(a) __sync_sub_and_fetch(a,1) 262 | # define DAF_U32(a) __sync_sub_and_fetch(a,1) 263 | # define DAF_U64(a) __sync_sub_and_fetch(a,1) 264 | //Test-and-set 265 | # define TAS_U8(a) tas_uint8(a) 266 | //Memory barrier 267 | # define MEM_BARRIER __sync_synchronize() 268 | //Relax CPU 269 | //#define PAUSE _mm_pause() 270 | 271 | /*End of x86 code*/ 272 | #endif 273 | 274 | 275 | #endif 276 | 277 | 278 | 279 | -------------------------------------------------------------------------------- /include/barrier.h: -------------------------------------------------------------------------------- 1 | /* 2 | * File: barrier.h 3 | * Author: Vasileios Trigonakis 4 | * Description: barrier structures 5 | * barrier.h is part of ccbench 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Vasileios Trigonakis 10 | * 11 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 
20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | * 28 | */ 29 | 30 | #ifndef BARRIER_H 31 | #define BARRIER_H 32 | 33 | #include "common.h" 34 | #include "atomic_ops.h" 35 | #ifdef __sparc__ 36 | # include 37 | # include 38 | # include 39 | #endif /* __sparc */ 40 | 41 | #define NUM_BARRIERS 16 42 | #define BARRIER_MEM_FILE "/barrier_mem" 43 | 44 | #ifndef ALIGNED 45 | # if __GNUC__ && !SCC 46 | # define ALIGNED(N) __attribute__ ((aligned (N))) 47 | # else 48 | # define ALIGNED(N) 49 | # endif 50 | #endif 51 | 52 | /*barrier type*/ 53 | typedef ALIGNED(64) struct barrier 54 | { 55 | uint64_t num_participants; 56 | volatile uint64_t num_crossing1; 57 | volatile uint64_t num_crossing2; 58 | volatile uint64_t num_crossing3; 59 | int (*color)(int); /*or color function: if return 0 -> no , 1 -> participant. 
Priority on this */ 60 | } barrier_t; 61 | 62 | 63 | void barriers_init(const uint32_t num_procs); 64 | void barrier_init(const uint32_t barrier_num, const uint64_t participants, int (*color)(int), const uint32_t); 65 | void barrier_wait(const uint32_t barrier_num, const uint32_t id, const uint32_t total_cores); 66 | void barriers_term(); 67 | 68 | #ifdef __sparc__ 69 | # define PAUSE() asm volatile("rd %%ccr, %%g0\n\t" \ 70 | ::: "memory") 71 | #elif defined(__tile__) 72 | #define PAUSE() cycle_relax() 73 | #else 74 | #define PAUSE() _mm_pause() 75 | #endif 76 | 77 | #endif /* BARRIER_H */ 78 | -------------------------------------------------------------------------------- /include/ccbench.h: -------------------------------------------------------------------------------- 1 | /* 2 | * File: ccbench.h 3 | * Author: Vasileios Trigonakis 4 | * Description: definition of ccbench events and help functions 5 | * ccbench.h is part of ccbench 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Vasileios Trigonakis 10 | * 11 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | * 28 | */ 29 | 30 | #ifndef _H_CCBENCH_ 31 | #define _H_CCBENCH_ 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | 53 | #if defined(__amd64__) 54 | # include 55 | #elif defined(__tile__) 56 | # include 57 | # include 58 | # include 59 | extern cpu_set_t cpus; 60 | #endif 61 | 62 | #if defined(PLATFORM_NUMA) 63 | # include 64 | #endif /* PLATFORM_NUMA */ 65 | 66 | #include "common.h" 67 | #include "pfd.h" 68 | #include "barrier.h" 69 | 70 | typedef struct cache_line 71 | { 72 | volatile uint32_t word[16]; 73 | } cache_line_t; 74 | 75 | #define CACHE_LINE_NUM 1024*1024 /* power of 2 pls */ 76 | #define CACHE_LINE_STRIDE_2 2047 77 | 78 | # define LLU unsigned long long int 79 | 80 | extern volatile cache_line_t* cache_line_open(); 81 | extern void cache_line_close(const uint32_t id, const char* name); 82 | 83 | typedef enum 84 | { 85 | STORE_ON_MODIFIED, 86 | STORE_ON_MODIFIED_NO_SYNC, 87 | STORE_ON_EXCLUSIVE, 88 | STORE_ON_SHARED, 89 | STORE_ON_OWNED_MINE, 90 | STORE_ON_OWNED, 91 | STORE_ON_INVALID, 92 | LOAD_FROM_MODIFIED, 93 | LOAD_FROM_EXCLUSIVE, 94 | LOAD_FROM_SHARED, 95 | LOAD_FROM_OWNED, 96 | LOAD_FROM_INVALID, 97 | CAS, 98 | FAI, 99 | TAS, 100 | SWAP, 101 | CAS_ON_MODIFIED, 102 | FAI_ON_MODIFIED, 103 | TAS_ON_MODIFIED, 104 | SWAP_ON_MODIFIED, 105 | CAS_ON_SHARED, 106 | FAI_ON_SHARED, 107 | TAS_ON_SHARED, 108 | SWAP_ON_SHARED, 109 | CAS_CONCURRENT, 110 | FAI_ON_INVALID, 111 | LOAD_FROM_L1, 112 | LOAD_FROM_MEM_SIZE, 113 | LFENCE, 114 | SFENCE, 115 | MFENCE, 116 
| PROFILER, 117 | PAUSE, 118 | NOP, 119 | NUM_EVENTS, /* placeholder for printing the num of events */ 120 | } moesi_type_t; 121 | 122 | const char* moesi_type_des[] = 123 | { 124 | "STORE_ON_MODIFIED", 125 | "STORE_ON_MODIFIED_NO_SYNC", 126 | "STORE_ON_EXCLUSIVE", 127 | "STORE_ON_SHARED", 128 | "STORE_ON_OWNED_MINE", 129 | "STORE_ON_OWNED", 130 | "STORE_ON_INVALID", 131 | "LOAD_FROM_MODIFIED", 132 | "LOAD_FROM_EXCLUSIVE", 133 | "LOAD_FROM_SHARED", 134 | "LOAD_FROM_OWNED", 135 | "LOAD_FROM_INVALID", 136 | "CAS", 137 | "FAI", 138 | "TAS", 139 | "SWAP", 140 | "CAS_ON_MODIFIED", 141 | "FAI_ON_MODIFIED", 142 | "TAS_ON_MODIFIED", 143 | "SWAP_ON_MODIFIED", 144 | "CAS_ON_SHARED", 145 | "FAI_ON_SHARED", 146 | "TAS_ON_SHARED", 147 | "SWAP_ON_SHARED", 148 | "CAS_CONCURRENT", 149 | "FAI_ON_INVALID", 150 | "LOAD_FROM_L1", 151 | "LOAD_FROM_MEM_SIZE", 152 | "LFENCE", 153 | "SFENCE", 154 | "MFENCE", 155 | "PROFILER", 156 | "PAUSE", 157 | "NOP", 158 | }; 159 | 160 | 161 | #define DEFAULT_CORES 2 162 | #define DEFAULT_REPS 10000 163 | #define DEFAULT_TEST 0 164 | #define DEFAULT_CORE1 0 165 | #define DEFAULT_CORE2 1 166 | #define DEFAULT_CORE3 2 167 | #define DEFAULT_CORE_OTHERS 0 168 | #define DEFAULT_FLUSH 0 169 | #define DEFAULT_VERBOSE 0 170 | #define DEFAULT_PRINT 100 171 | #define DEFAULT_STRIDE (CACHE_LINE_STRIDE_2 + 1) 172 | #define DEFAULT_FENCE 0 173 | #define DEFAULT_LFENCE 0 174 | #define DEFAULT_SFENCE 0 175 | #define DEFAULT_AO_SUCCESS 0 176 | 177 | 178 | #define CACHE_LINE_MEM_FILE "/cache_line" 179 | 180 | #define B0 _mm_mfence(); barrier_wait(0, ID, test_cores); _mm_mfence(); 181 | #define B1 _mm_mfence(); barrier_wait(2, ID, test_cores); _mm_mfence(); 182 | #define B2 _mm_mfence(); barrier_wait(3, ID, test_cores); _mm_mfence(); 183 | #define B3 _mm_mfence(); barrier_wait(4, ID, test_cores); _mm_mfence(); 184 | #define B4 _mm_mfence(); barrier_wait(5, ID, test_cores); _mm_mfence(); 185 | #define B5 _mm_mfence(); barrier_wait(6, ID, test_cores); _mm_mfence(); 186 | 
#define B6 _mm_mfence(); barrier_wait(7, ID, test_cores); _mm_mfence(); 187 | #define B7 _mm_mfence(); barrier_wait(8, ID, test_cores); _mm_mfence(); 188 | #define B8 _mm_mfence(); barrier_wait(9, ID, test_cores); _mm_mfence(); 189 | #define B9 _mm_mfence(); barrier_wait(10, ID, test_cores); _mm_mfence(); 190 | #define B10 _mm_mfence(); barrier_wait(11, ID, test_cores); _mm_mfence(); 191 | #define B11 _mm_mfence(); barrier_wait(12, ID, test_cores); _mm_mfence(); 192 | #define B12 _mm_mfence(); barrier_wait(13, ID, test_cores); _mm_mfence(); 193 | #define B13 _mm_mfence(); barrier_wait(14, ID, test_cores); _mm_mfence(); 194 | #define B14 _mm_mfence(); barrier_wait(15, ID, test_cores); _mm_mfence(); 195 | 196 | #define XSTR(s) STR(s) 197 | #define STR(s) #s 198 | 199 | #ifndef ALIGNED 200 | # if __GNUC__ && !SCC 201 | # define ALIGNED(N) __attribute__ ((aligned (N))) 202 | # else 203 | # define ALIGNED(N) 204 | # endif 205 | #endif 206 | 207 | inline void 208 | set_cpu(int cpu) 209 | { 210 | #if defined(__sparc__) 211 | processor_bind(P_LWPID,P_MYID, cpu, NULL); 212 | #elif defined(__tile__) 213 | if (tmc_cpus_set_my_cpu(tmc_cpus_find_nth_cpu(&cpus, cpu)) < 0) 214 | { 215 | tmc_task_die("Failure in 'tmc_cpus_set_my_cpu()'."); 216 | } 217 | 218 | if (cpu != tmc_cpus_get_my_cpu()) 219 | { 220 | PRINT("******* i am not CPU %d", tmc_cpus_get_my_cpu()); 221 | } 222 | 223 | #else 224 | cpu_set_t mask; 225 | CPU_ZERO(&mask); 226 | CPU_SET(cpu, &mask); 227 | if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) != 0) { 228 | printf("Problem with setting processor affinity: %s\n", 229 | strerror(errno)); 230 | exit(3); 231 | } 232 | #endif 233 | 234 | #ifdef OPTERON 235 | uint32_t numa_node = cpu/6; 236 | numa_set_preferred(numa_node); 237 | #elif defined(XEON) 238 | uint32_t numa_node = 0; 239 | if (cpu == 0) 240 | { 241 | numa_node = 4; 242 | } 243 | else if (cpu <= 40) 244 | { 245 | numa_node = (cpu - 1) / 10; 246 | } 247 | else 248 | { 249 | numa_node = cpu / 10; 250 | } 251 
| numa_set_preferred(numa_node); 252 | #elif defined(PLATFORM_NUMA) 253 | printf("* You need to define how cores correspond to mem nodes in ccbench.h\n"); 254 | #endif 255 | 256 | } 257 | 258 | inline void 259 | wait_cycles(volatile uint64_t cycles) 260 | { 261 | /* cycles >>= 1; */ 262 | for (cycles; cycles > 0; cycles--) 263 | { 264 | asm volatile ("nop"); 265 | } 266 | } 267 | 268 | /* getticks needs to have a correction because the call itself takes a */ 269 | /* significant number of cycles and skewes the measurement */ 270 | static inline ticks getticks_correction_calc() 271 | { 272 | #define GETTICKS_CALC_REPS 1000000 273 | ticks t_dur = 0; 274 | uint32_t i; 275 | for (i = 0; i < GETTICKS_CALC_REPS; i++) 276 | { 277 | ticks t_start = getticks(); 278 | ticks t_end = getticks(); 279 | t_dur += t_end - t_start; 280 | } 281 | // printf("corr in float %f\n", (t_dur / (double) GETTICKS_CALC_REPS)); 282 | ticks getticks_correction = (ticks)(t_dur / (double) GETTICKS_CALC_REPS); 283 | return getticks_correction; 284 | } 285 | 286 | #define IN_ORDER(id, num_cores) \ 287 | { \ 288 | B0; \ 289 | uint32_t c; \ 290 | for (c = 0; c < num_cores; c++) \ 291 | { \ 292 | if (id == c) \ 293 | { 294 | 295 | #define IN_ORDER_END \ 296 | } \ 297 | B0; \ 298 | } \ 299 | } 300 | 301 | 302 | static inline unsigned long* 303 | seed_rand() 304 | { 305 | unsigned long* seeds; 306 | seeds = (unsigned long*) malloc(3 * sizeof(unsigned long)); 307 | seeds[0] = getticks() % 123456789; 308 | seeds[1] = getticks() % 362436069; 309 | seeds[2] = getticks() % 521288629; 310 | return seeds; 311 | } 312 | 313 | extern unsigned long* seeds; 314 | //Marsaglia's xorshf generator //period 2^96-1 315 | static inline unsigned long 316 | xorshf96(unsigned long* x, unsigned long* y, unsigned long* z) 317 | { 318 | unsigned long t; 319 | (*x) ^= (*x) << 16; 320 | (*x) ^= (*x) >> 5; 321 | (*x) ^= (*x) << 1; 322 | 323 | t = *x; 324 | (*x) = *y; 325 | (*y) = *z; 326 | (*z) = t ^ (*x) ^ (*y); 327 | 328 | 
return *z; 329 | } 330 | #define clrand() (xorshf96(seeds, seeds + 1, seeds + 2) & (test_stride - 1)) 331 | #define sirand(range) ((xorshf96(seeds, seeds + 1, seeds + 2) % range) + 64) 332 | #define my_random(a, b, c) xorshf96(a, b, c) 333 | 334 | static inline uint32_t pow2roundup (uint32_t x) 335 | { 336 | if (x==0) return 1; 337 | --x; 338 | x |= x >> 1; 339 | x |= x >> 2; 340 | x |= x >> 4; 341 | x |= x >> 8; 342 | x |= x >> 16; 343 | return x+1; 344 | } 345 | #endif /* _H_CCBENCH_ */ 346 | -------------------------------------------------------------------------------- /include/common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * File: common.h 3 | * Author: Vasileios Trigonakis 4 | * Description: helper macros 5 | * common.h is part of ccbench 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Vasileios Trigonakis 10 | * 11 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | * 28 | */ 29 | 30 | #ifndef _COMMON_H_ 31 | #define _COMMON_H_ 32 | 33 | #include 34 | #include 35 | 36 | #define XSTR(s) STR(s) 37 | #define STR(s) #s 38 | 39 | #define P(args...) printf("[%02d] ", ID); printf(args); printf("\n"); fflush(stdout) 40 | #define PRINT P 41 | 42 | extern uint8_t ID; 43 | #endif 44 | -------------------------------------------------------------------------------- /include/pfd.h: -------------------------------------------------------------------------------- 1 | /* 2 | * File: pfd.h 3 | * Author: Vasileios Trigonakis 4 | * Description: pfd interface, structures, and helper functions 5 | * pfd.h is part of ccbench 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Vasileios Trigonakis 10 | * 11 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | * 28 | */ 29 | 30 | #ifndef _PFD_H_ 31 | #define _PFD_H_ 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include "common.h" 38 | 39 | 40 | typedef uint64_t ticks; 41 | 42 | #if defined(__i386__) 43 | static inline ticks 44 | getticks(void) 45 | { 46 | ticks ret; 47 | 48 | __asm__ __volatile__("rdtsc" : "=A" (ret)); 49 | return ret; 50 | } 51 | #elif defined(__x86_64__) 52 | static inline ticks 53 | getticks(void) 54 | { 55 | unsigned hi, lo; 56 | __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); 57 | return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); 58 | } 59 | #elif defined(__sparc__) 60 | static inline ticks 61 | getticks() 62 | { 63 | ticks ret; 64 | __asm__ __volatile__ ("rd %%tick, %0" : "=r" (ret) : "0" (ret)); 65 | return ret; 66 | } 67 | #elif defined(__tile__) 68 | #include 69 | static inline ticks getticks() 70 | { 71 | return get_cycle_count(); 72 | } 73 | #endif 74 | 75 | 76 | #define DO_TIMINGS 77 | 78 | #if !defined(PREFETCHW) 79 | # if defined(__x86_64__) | defined(__i386__) 80 | # define PREFETCHW(x) asm volatile("prefetchw %0" :: "m" (*(unsigned long *)x)) /* write */ 81 | # elif defined(__sparc__) 82 | # define PREFETCHW(x) __builtin_prefetch((const void*) x, 1, 3) 83 | # elif defined(__tile__) 84 | # define PREFETCHW(x) tmc_mem_prefetch (x, 64) 85 | # else 86 | # warning "You need to define PREFETCHW(x) for your architecture" 87 | # endif 88 | #endif 89 | 90 | typedef struct abs_deviation 91 | { 92 | uint64_t num_vals; 93 | double avg; 94 | double avg_10p; 95 | double avg_25p; 96 | double avg_50p; 97 | double avg_75p; 98 | double avg_rst; 99 | double abs_dev_10p; 100 | double abs_dev_25p; 101 | double abs_dev_50p; 102 | double 
abs_dev_75p; 103 | double abs_dev_rst; 104 | double abs_dev; 105 | double std_dev_10p; 106 | double std_dev_25p; 107 | double std_dev_50p; 108 | double std_dev_75p; 109 | double std_dev_rst; 110 | double std_dev; 111 | double min_val; 112 | uint64_t min_val_idx; 113 | double max_val; 114 | uint64_t max_val_idx; 115 | uint32_t num_dev_10p; 116 | uint32_t num_dev_25p; 117 | uint32_t num_dev_50p; 118 | uint32_t num_dev_75p; 119 | uint32_t num_dev_rst; 120 | } abs_deviation_t; 121 | 122 | 123 | #define PFD_NUM_STORES 2 124 | #define PFD_PRINT_MAX 200 125 | 126 | extern volatile ticks** pfd_store; 127 | extern volatile ticks* _pfd_s; 128 | extern volatile ticks pfd_correction; 129 | #if !defined(DO_TIMINGS) 130 | # define PFDINIT(num_entries) 131 | # define PFDI(store) 132 | # define PFDO(store, entry) 133 | # define PFDP(store, num_vals) 134 | # define PFDPN(store, num_vals, num_print) 135 | #else /* DO_TIMINGS */ 136 | # define PFDINIT(num_entries) pfd_store_init(num_entries) 137 | 138 | # define PFDI(store) \ 139 | { \ 140 | asm volatile (""); \ 141 | _pfd_s[store] = getticks(); 142 | 143 | 144 | # define PFDO(store, entry) \ 145 | asm volatile (""); \ 146 | pfd_store[store][entry] = getticks() - _pfd_s[store] - pfd_correction; \ 147 | } 148 | 149 | # define PFDOR(store, entry, reps) \ 150 | asm volatile (""); \ 151 | volatile ticks __t = getticks(); \ 152 | pfd_store[store][entry] = (__t - _pfd_s[store] - pfd_correction) / \ 153 | reps; \ 154 | } 155 | 156 | # define PFDPN(store, num_vals, num_print) \ 157 | { \ 158 | uint32_t _i; \ 159 | uint32_t p = num_print; \ 160 | if (p > num_vals) { p = num_vals; } \ 161 | for (_i = 0; _i < p; _i++) \ 162 | { \ 163 | printf("[%3d: %4ld] ", _i, (long int) pfd_store[store][_i]); \ 164 | } \ 165 | abs_deviation_t ad; \ 166 | get_abs_deviation(pfd_store[store], num_vals, &ad); \ 167 | print_abs_deviation(&ad); \ 168 | } 169 | #endif /* !DO_TIMINGS */ 170 | 171 | # define PFDPREFTCH(store, entry) \ 172 | PFDI(store); \ 173 | 
PFDO(store, entry); 174 | 175 | 176 | 177 | void pfd_store_init(const uint32_t num_entries); 178 | void get_abs_deviation(volatile ticks* vals, const size_t num_vals, abs_deviation_t* abs_dev); 179 | void print_abs_deviation(const abs_deviation_t* abs_dev); 180 | 181 | 182 | #endif /* _PFD_H_ */ 183 | -------------------------------------------------------------------------------- /scripts/events_all: -------------------------------------------------------------------------------- 1 | cache-references,cache-misses,L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores,L1-dcache-store-misses,L1-dcache-prefetches,L1-dcache-prefetch-misses,L1-icache-loads,L1-icache-load-misses,L1-icache-prefetches,L1-icache-prefetch-misses,LLC-loads,LLC-load-misses,LLC-stores,LLC-store-misses,LLC-prefetches,LLC-prefetch-misses -------------------------------------------------------------------------------- /scripts/run_niagara.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #!/bin/bash 3 | 4 | ./ccbench $@; 5 | read; 6 | ./ccbench $@ -y8; 7 | -------------------------------------------------------------------------------- /scripts/run_opteron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./ccbench $@ ; 4 | read; 5 | ./ccbench $@ -y6; 6 | read; 7 | ./ccbench $@ -y12; 8 | read; 9 | ./ccbench $@ -y18; 10 | -------------------------------------------------------------------------------- /scripts/run_tilera.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./run ./ccbench $@ ; 4 | read; 5 | ./run ./ccbench $@ -y35; 6 | -------------------------------------------------------------------------------- /scripts/run_with_confidence.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | conf=$1; 4 | shift; 5 | run=$@; 6 | 7 | tmp="conf.run.tmp"; 8 | 9 | 10 | if [ $# 
-lt 1 ]; 11 | then 12 | echo "Usage: $0 TARGET_CONFIDENCE ./ccbench [PARAMETERS]"; 13 | echo " runs ccbench until there is a run with a clustering"; 14 | echo " around the avg that has >= TARGET_CONFIDENCE percentage"; 15 | echo " of the total samples. The TARGET_CONFIDENCE is decreased"; 16 | echo " by D after F failed attempts (D, F defined in the script)"; 17 | exit; 18 | fi; 19 | 20 | tries_fail=3; 21 | reduce_on_fail=1; 22 | 23 | echo " ** Confidence lvl: $conf"; 24 | 25 | tries=1; 26 | while : 27 | do 28 | ./$run > $tmp; 29 | res=$(cut -d'(' -f2 $tmp | gawk -v c=$conf '/% \|/ { if ($1+0 > c) print $1" --("$0 }'); 30 | 31 | if [ "$res" ]; 32 | then 33 | cat $tmp; 34 | echo " ** in # tries: $tries"; 35 | break; 36 | fi; 37 | 38 | tries=$((tries+1)); 39 | if [ $tries -gt $tries_fail ]; 40 | then 41 | conf=$((conf-reduce_on_fail)); 42 | tries=1; 43 | echo " ** Failed after $tries_fail tries. New confidence lvl: $conf" 44 | fi; 45 | done; 46 | 47 | if [ -f $tmp ]; 48 | then 49 | rm $tmp; 50 | fi; 51 | -------------------------------------------------------------------------------- /scripts/run_xeon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./ccbench $@ -x3; 4 | read; 5 | ./ccbench $@ -y11 -x3; 6 | read; 7 | ./ccbench $@ -y41 -x3; 8 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | /#pfd.c# 2 | /.#pfd.c 3 | -------------------------------------------------------------------------------- /src/barrier.c: -------------------------------------------------------------------------------- 1 | /* 2 | * File: barrier.c 3 | * Author: Vasileios Trigonakis 4 | * Description: implementation of process barriers 5 | * barrier.c is part of ccbench 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Vasileios Trigonakis 10 | * 11 | * Permission is hereby granted, free of 
charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | * 28 | */ 29 | 30 | #include "barrier.h" 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | 43 | #ifdef __sparc__ 44 | # include 45 | # include 46 | # include 47 | #endif /* __sparc__ */ 48 | 49 | barrier_t* barriers; 50 | 51 | 52 | int color_all(int id) 53 | { 54 | return 1; 55 | } 56 | 57 | void 58 | barriers_init(const uint32_t num_procs) 59 | { 60 | uint32_t size; 61 | size = NUM_BARRIERS * sizeof(barrier_t); 62 | if (size < 8192) 63 | { 64 | size = 8192; 65 | } 66 | 67 | char keyF[100]; 68 | sprintf(keyF, BARRIER_MEM_FILE); 69 | 70 | int barrierfd = shm_open(keyF, O_CREAT | O_EXCL | O_RDWR, S_IRWXU | S_IRWXG); 71 | if (barrierfd<0) 72 | { 73 | if (errno != EEXIST) 74 | { 75 | perror("In shm_open"); 76 | exit(1); 77 | } 78 | 79 | //this time it is ok if it already exists 80 | barrierfd = shm_open(keyF, O_CREAT | O_RDWR, S_IRWXU | S_IRWXG); 81 | if (barrierfd<0) 82 | { 83 | perror("In shm_open"); 84 | exit(1); 85 | } 86 | } 87 | else 88 | { 89 | if (ftruncate(barrierfd, size) < 0) { 90 | perror("ftruncate failed\n"); 91 | exit(1); 92 | } 93 | } 94 | 95 | void* mem = (void*) mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, barrierfd, 0); 96 | if (mem == NULL) 97 | { 98 | perror("ssmp_mem = NULL\n"); 99 | exit(134); 100 | } 101 | 102 | barriers = (barrier_t*) mem; 103 | 104 | uint32_t bar; 105 | for (bar = 0; bar < NUM_BARRIERS; bar++) 106 | { 107 | barrier_init(bar, 0, color_all, num_procs); 108 | } 109 | } 110 | 111 | void 112 | barrier_init(const uint32_t barrier_num, const uint64_t participants, int (*color)(int), 113 | const uint32_t total_cores) 114 | { 115 | if (barrier_num >= NUM_BARRIERS) 116 | { 117 | return; 118 | } 119 | 120 | 121 | barriers[barrier_num].num_crossing1 = 0; 122 | barriers[barrier_num].num_crossing2 = 0; 123 | barriers[barrier_num].num_crossing3 = 0; 124 | barriers[barrier_num].color = color; 125 | uint32_t ue, num_parts = 
0; 126 | for (ue = 0; ue < total_cores; ue++) 127 | { 128 | num_parts += color(ue); 129 | } 130 | barriers[barrier_num].num_participants = num_parts; 131 | 132 | } 133 | 134 | 135 | void 136 | barrier_wait(const uint32_t barrier_num, const uint32_t id, const uint32_t total_cores) 137 | { 138 | _mm_mfence(); 139 | if (barrier_num >= NUM_BARRIERS) 140 | { 141 | return; 142 | } 143 | 144 | // printf("enter: %d : %d\n", barrier_num, id); 145 | 146 | barrier_t *b = &barriers[barrier_num]; 147 | 148 | int (*col)(int); 149 | col = b->color; 150 | 151 | if (col(id) == 0) 152 | { 153 | return; 154 | } 155 | 156 | 157 | b->num_crossing2 = 0; 158 | FAI_U64(&b->num_crossing1); 159 | 160 | while (b->num_crossing1 < b->num_participants) 161 | { 162 | PAUSE(); 163 | _mm_mfence(); 164 | } 165 | 166 | 167 | b->num_crossing3 = 0; 168 | 169 | FAI_U64(&b->num_crossing2); 170 | 171 | while (b->num_crossing2 < b->num_participants) 172 | { 173 | PAUSE(); 174 | _mm_mfence(); 175 | } 176 | 177 | b->num_crossing1 = 0; 178 | 179 | FAI_U64(&b->num_crossing3); 180 | 181 | while (b->num_crossing3 < b->num_participants) 182 | { 183 | PAUSE(); 184 | _mm_mfence(); 185 | } 186 | 187 | // printf("EXIT : %d : %d\n", barrier_num, id); 188 | 189 | } 190 | 191 | void 192 | barriers_term(const uint32_t id) 193 | { 194 | if (id == 0) 195 | { 196 | shm_unlink(BARRIER_MEM_FILE); 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /src/ccbench.c: -------------------------------------------------------------------------------- 1 | /* 2 | * File: ccbench.c 3 | * Author: Vasileios Trigonakis 4 | * Description: the main functionality of ccbench 5 | * ccbench.c is part of ccbench 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Vasileios Trigonakis 10 | * 11 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software 
without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | * 28 | */ 29 | 30 | #include "ccbench.h" 31 | 32 | uint8_t ID; 33 | unsigned long* seeds; 34 | 35 | #if defined(__tile__) 36 | cpu_set_t cpus; 37 | #endif 38 | 39 | moesi_type_t test_test = DEFAULT_TEST; 40 | uint32_t test_cores = DEFAULT_CORES; 41 | uint32_t test_reps = DEFAULT_REPS; 42 | uint32_t test_core1 = DEFAULT_CORE1; 43 | uint32_t test_core2 = DEFAULT_CORE2; 44 | uint32_t test_core3 = DEFAULT_CORE3; 45 | uint32_t test_core_others = DEFAULT_CORE_OTHERS; 46 | uint32_t test_flush = DEFAULT_FLUSH; 47 | uint32_t test_verbose = DEFAULT_VERBOSE; 48 | uint32_t test_print = DEFAULT_PRINT; 49 | uint32_t test_stride = DEFAULT_STRIDE; 50 | uint32_t test_fence = DEFAULT_FENCE; 51 | uint32_t test_ao_success = DEFAULT_AO_SUCCESS; 52 | size_t test_mem_size = CACHE_LINE_NUM * sizeof(cache_line_t); 53 | uint32_t test_cache_line_num = CACHE_LINE_NUM; 54 | uint32_t test_lfence = DEFAULT_LFENCE; 55 | uint32_t test_sfence = DEFAULT_SFENCE; 56 | 57 | 58 | static void store_0(volatile cache_line_t* cache_line, volatile uint64_t reps); 59 | static void 
store_0_no_pf(volatile cache_line_t* cache_line, volatile uint64_t reps); 60 | static void store_0_eventually(volatile cache_line_t* cl, volatile uint64_t reps); 61 | static void store_0_eventually_pfd1(volatile cache_line_t* cl, volatile uint64_t reps); 62 | 63 | static uint64_t load_0(volatile cache_line_t* cache_line, volatile uint64_t reps); 64 | static uint64_t load_next(volatile uint64_t* cl, volatile uint64_t reps); 65 | static uint64_t load_0_eventually(volatile cache_line_t* cl, volatile uint64_t reps); 66 | static uint64_t load_0_eventually_no_pf(volatile cache_line_t* cl); 67 | 68 | static void invalidate(volatile cache_line_t* cache_line, uint64_t index, volatile uint64_t reps); 69 | static uint32_t cas(volatile cache_line_t* cache_line, volatile uint64_t reps); 70 | static uint32_t cas_0_eventually(volatile cache_line_t* cache_line, volatile uint64_t reps); 71 | static uint32_t cas_no_pf(volatile cache_line_t* cache_line, volatile uint64_t reps); 72 | static uint32_t fai(volatile cache_line_t* cache_line, volatile uint64_t reps); 73 | static uint8_t tas(volatile cache_line_t* cl, volatile uint64_t reps); 74 | static uint32_t swap(volatile cache_line_t* cl, volatile uint64_t reps); 75 | 76 | static size_t parse_size(char* optarg); 77 | static void create_rand_list_cl(volatile uint64_t* list, size_t n); 78 | 79 | 80 | int 81 | main(int argc, char **argv) 82 | { 83 | 84 | /* before doing any allocations */ 85 | #if defined(__tile__) 86 | if (tmc_cpus_get_my_affinity(&cpus) != 0) 87 | { 88 | tmc_task_die("Failure in 'tmc_cpus_get_my_affinity()'."); 89 | } 90 | #endif 91 | 92 | #if defined(XEON) 93 | set_cpu(1); 94 | #else 95 | set_cpu(0); 96 | #endif 97 | 98 | struct option long_options[] = 99 | { 100 | // These options don't set a flag 101 | {"help", no_argument, NULL, 'h'}, 102 | {"cores", required_argument, NULL, 'c'}, 103 | {"repetitions", required_argument, NULL, 'r'}, 104 | {"test", required_argument, NULL, 't'}, 105 | {"core1", required_argument, 
NULL, 'x'}, 106 | {"core2", required_argument, NULL, 'y'}, 107 | {"core3", required_argument, NULL, 'z'}, 108 | {"core-others", required_argument, NULL, 'o'}, 109 | {"stride", required_argument, NULL, 's'}, 110 | {"fence", required_argument, NULL, 'e'}, 111 | {"mem-size", required_argument, NULL, 'm'}, 112 | {"flush", no_argument, NULL, 'f'}, 113 | {"success", no_argument, NULL, 'u'}, 114 | {"verbose", no_argument, NULL, 'v'}, 115 | {"print", required_argument, NULL, 'p'}, 116 | {NULL, 0, NULL, 0} 117 | }; 118 | 119 | int i; 120 | char c; 121 | while(1) 122 | { 123 | i = 0; 124 | c = getopt_long(argc, argv, "hc:r:t:x:m:y:z:o:e:fvup:s:", long_options, &i); 125 | 126 | if(c == -1) 127 | break; 128 | 129 | if(c == 0 && long_options[i].flag == 0) 130 | c = long_options[i].val; 131 | 132 | switch(c) 133 | { 134 | case 0: 135 | /* Flag is automatically set */ 136 | break; 137 | case 'h': 138 | printf("ccbench Copyright (C) 2013 Vasileios Trigonakis \n" 139 | "This program comes with ABSOLUTELY NO WARRANTY.\n" 140 | "This is free software, and you are welcome to redistribute it under certain conditions.\n\n" 141 | "ccbecnh is an application for measuring the cache-coherence latencies, i.e., the latencies of\n" 142 | "of loads, stores, CAS, FAI, TAS, and SWAP\n" 143 | "\n" 144 | "Usage:\n" 145 | " ./ccbench [options...]\n" 146 | "\n" 147 | "Options:\n" 148 | " -h, --help\n" 149 | " Print this message\n" 150 | " -c, --cores \n" 151 | " Number of cores to run the test on (default=" XSTR(DEFAULT_CORES) ")\n" 152 | " -r, --repetitions \n" 153 | " Repetitions of the test case (default=" XSTR(DEFAULT_REPS) ")\n" 154 | " -t, --test \n" 155 | " Test case to run (default=" XSTR(DEFAULT_TEST) "). See below for supported events\n" 156 | " -x, --core1 \n" 157 | " 1st core to use (default=" XSTR(DEFAULT_CORE1) ")\n" 158 | " -y, --core2 \n" 159 | " 2nd core to use (default=" XSTR(DEFAULT_CORE2) ")\n" 160 | " -z, --core3 \n" 161 | " 3rd core to use. 
Some (most) tests use only 2 cores (default=" XSTR(DEFAULT_CORE3) ")\n" 162 | " -o, --core-others \n" 163 | " Offset for core that the processes with ID > 3 should bind (default=" XSTR(DEFAULT_CORE_OTHERS) ")\n" 164 | " -f, --flush\n" 165 | " Perform a cache line flush before the test (default=" XSTR(DEFAULT_FLUSH) ")\n" 166 | " -s, --stride \n" 167 | " What stride size to use when accessing the cache line(s) (default=" XSTR(DEFAULT_STRIDE) ")\n" 168 | " The application draws a random number X in the [0..(stride-1)] range and applies the target\n" 169 | " operation on this random cache line. The operation is completed when X=0. The stride is used\n" 170 | " in order to fool the hardware prefetchers that could hide the latency we want to measure.\n" 171 | " -e, --fence \n" 172 | " What memory barrier (fence) lvl to use (default=" XSTR(DEFAULT_FENCE) ")\n" 173 | " 0 = no fences / 1 = load-store fences / 2 = full fences / 3 = load-none fences / 4 = none-store fences\n" 174 | " 5 = full-none fences / 6 = none-full fences / 7 = full-store fences / 8 = load-full fences \n" 175 | " -m, --mem-size \n" 176 | " What memory size to use (in cache lines) (default=" XSTR(CACHE_LINE_NUM) ")\n" 177 | " -u, --success\n" 178 | " Make all atomic operations be successfull (e.g, TAS_ON_SHARED)\n" 179 | " -v, --verbose\n" 180 | " Verbose printing of results (default=" XSTR(DEFAULT_VERBOSE) ")\n" 181 | " -p, --print \n" 182 | " If verbose, how many results to print (default=" XSTR(DEFAULT_PRINT) ")\n" 183 | ); 184 | printf("Supported events: \n"); 185 | int ar; 186 | for (ar = 0; ar < NUM_EVENTS; ar++) 187 | { 188 | printf(" %2d - %s\n", ar, moesi_type_des[ar]); 189 | } 190 | 191 | exit(0); 192 | case 'c': 193 | test_cores = atoi(optarg); 194 | break; 195 | case 'r': 196 | test_reps = atoi(optarg); 197 | break; 198 | case 't': 199 | test_test = atoi(optarg); 200 | break; 201 | case 'x': 202 | test_core1 = atoi(optarg); 203 | break; 204 | case 'y': 205 | test_core2 = atoi(optarg); 206 | 
break; 207 | case 'z': 208 | test_core3 = atoi(optarg); 209 | break; 210 | case 'o': 211 | test_core_others = atoi(optarg); 212 | break; 213 | case 'f': 214 | test_flush = 1; 215 | break; 216 | case 's': 217 | test_stride = pow2roundup(atoi(optarg)); 218 | break; 219 | case 'e': 220 | test_fence = atoi(optarg); 221 | break; 222 | case 'm': 223 | test_mem_size = parse_size(optarg); 224 | printf("Data size : %zu KiB\n", test_mem_size / 1024); 225 | break; 226 | case 'u': 227 | test_ao_success = 1; 228 | break; 229 | case 'v': 230 | test_verbose = 1; 231 | break; 232 | case 'p': 233 | test_verbose = 1; 234 | test_print = atoi(optarg); 235 | break; 236 | case '?': 237 | printf("Use -h or --help for help\n"); 238 | exit(0); 239 | default: 240 | exit(1); 241 | } 242 | } 243 | 244 | 245 | test_cache_line_num = test_mem_size / sizeof(cache_line_t); 246 | 247 | if ((test_test == STORE_ON_EXCLUSIVE || test_test == STORE_ON_INVALID || test_test == LOAD_FROM_INVALID 248 | || test_test == LOAD_FROM_EXCLUSIVE || test_test == LOAD_FROM_SHARED) && !test_flush) 249 | { 250 | assert((test_reps * test_stride) <= test_cache_line_num); 251 | } 252 | 253 | if (test_test != LOAD_FROM_MEM_SIZE) 254 | { 255 | assert(test_stride < test_cache_line_num); 256 | } 257 | 258 | 259 | ID = 0; 260 | printf("test: %20s / #cores: %d / #repetitions: %d / stride: %d (%u kiB)", moesi_type_des[test_test], 261 | test_cores, test_reps, test_stride, (64 * test_stride) / 1024); 262 | if (test_flush) 263 | { 264 | printf(" / flush"); 265 | } 266 | 267 | printf(" / fence: "); 268 | 269 | switch (test_fence) 270 | { 271 | case 1: 272 | printf(" load & store"); 273 | test_lfence = test_sfence = 1; 274 | break; 275 | case 2: 276 | printf(" full"); 277 | test_lfence = test_sfence = 2; 278 | break; 279 | case 3: 280 | printf(" load"); 281 | test_lfence = 1; 282 | test_sfence = 0; 283 | break; 284 | case 4: 285 | printf(" store"); 286 | test_lfence = 0; 287 | test_sfence = 1; 288 | break; 289 | case 5: 290 | 
printf(" full/none"); 291 | test_lfence = 2; 292 | test_sfence = 0; 293 | break; 294 | case 6: 295 | printf(" none/full"); 296 | test_lfence = 0; 297 | test_sfence = 2; 298 | break; 299 | case 7: 300 | printf(" full/store"); 301 | test_lfence = 2; 302 | test_sfence = 1; 303 | break; 304 | case 8: 305 | printf(" load/full"); 306 | test_lfence = 1; 307 | test_sfence = 2; 308 | break; 309 | case 9: 310 | printf(" double write"); 311 | test_lfence = 0; 312 | test_sfence = 3; 313 | break; 314 | default: 315 | printf(" none"); 316 | test_lfence = test_sfence = 0; 317 | break; 318 | } 319 | 320 | printf("\n"); 321 | 322 | printf("core1: %3u / core2: %3u ", test_core1, test_core2); 323 | if (test_cores >= 3) 324 | { 325 | printf("/ core3: %3u", test_core3); 326 | } 327 | printf("\n"); 328 | 329 | barriers_init(test_cores); 330 | seeds = seed_rand(); 331 | 332 | volatile cache_line_t* cache_line = cache_line_open(); 333 | 334 | int rank; 335 | for (rank = 1; rank < test_cores; rank++) 336 | { 337 | pid_t child = fork(); 338 | if (child < 0) 339 | { 340 | P("Failure in fork():\n%s", strerror(errno)); 341 | } 342 | else if (child == 0) 343 | { 344 | goto fork_done; 345 | } 346 | } 347 | rank = 0; 348 | 349 | fork_done: 350 | ID = rank; 351 | size_t core = 0; 352 | switch (ID) 353 | { 354 | case 0: 355 | core = test_core1; 356 | break; 357 | case 1: 358 | core = test_core2; 359 | break; 360 | case 2: 361 | core = test_core3; 362 | break; 363 | default: 364 | core = ID - test_core_others; 365 | } 366 | 367 | #if defined(NIAGARA) 368 | if (test_cores <= 8 && test_cores > 3) 369 | { 370 | if (ID == 0) 371 | { 372 | PRINT(" ** spreading the 8 threads on the 8 real cores"); 373 | } 374 | core = ID * 8; 375 | } 376 | #endif 377 | 378 | set_cpu(core); 379 | 380 | #if defined(__tile__) 381 | tmc_cmem_init(0); /* initialize shared memory */ 382 | #endif /* TILERA */ 383 | 384 | volatile uint64_t* cl = (volatile uint64_t*) cache_line; 385 | 386 | B0; 387 | if (ID < 3) 388 | { 389 | 
PFDINIT(test_reps); 390 | } 391 | B0; 392 | 393 | /* /\********************************************************************************* */ 394 | /* * main functionality */ 395 | /* *********************************************************************************\/ */ 396 | 397 | uint64_t sum = 0; 398 | 399 | volatile uint64_t reps; 400 | for (reps = 0; reps < test_reps; reps++) 401 | { 402 | if (test_flush) 403 | { 404 | _mm_mfence(); 405 | _mm_clflush((void*) cache_line); 406 | _mm_mfence(); 407 | } 408 | 409 | B0; /* BARRIER 0 */ 410 | 411 | switch (test_test) 412 | { 413 | case STORE_ON_MODIFIED: /* 0 */ 414 | { 415 | switch (ID) 416 | { 417 | case 0: 418 | store_0_eventually(cache_line, reps); 419 | B1; /* BARRIER 1 */ 420 | break; 421 | case 1: 422 | B1; /* BARRIER 1 */ 423 | store_0_eventually(cache_line, reps); 424 | break; 425 | default: 426 | B1; /* BARRIER 1 */ 427 | break; 428 | } 429 | break; 430 | } 431 | case STORE_ON_MODIFIED_NO_SYNC: /* 1 */ 432 | { 433 | switch (ID) 434 | { 435 | case 0: 436 | case 1: 437 | case 2: 438 | store_0(cache_line, reps); 439 | break; 440 | default: 441 | store_0_no_pf(cache_line, reps); 442 | break; 443 | } 444 | break; 445 | } 446 | case STORE_ON_EXCLUSIVE: /* 2 */ 447 | { 448 | switch (ID) 449 | { 450 | case 0: 451 | sum += load_0_eventually(cache_line, reps); 452 | B1; /* BARRIER 1 */ 453 | break; 454 | case 1: 455 | B1; /* BARRIER 1 */ 456 | store_0_eventually(cache_line, reps); 457 | break; 458 | default: 459 | B1; /* BARRIER 1 */ 460 | break; 461 | } 462 | 463 | if (!test_flush) 464 | { 465 | cache_line += test_stride; 466 | } 467 | break; 468 | } 469 | case STORE_ON_SHARED: /* 3 */ 470 | { 471 | switch (ID) 472 | { 473 | case 0: 474 | sum += load_0_eventually(cache_line, reps); 475 | B1; /* BARRIER 1 */ 476 | B2; /* BARRIER 2 */ 477 | break; 478 | case 1: 479 | B1; /* BARRIER 1 */ 480 | B2; /* BARRIER 2 */ 481 | store_0_eventually(cache_line, reps); 482 | break; 483 | case 2: 484 | B1; /* BARRIER 1 */ 485 | sum += 
load_0_eventually(cache_line, reps); 486 | B2; /* BARRIER 2 */ 487 | break; 488 | default: 489 | B1; /* BARRIER 1 */ 490 | sum += load_0_eventually_no_pf(cache_line); 491 | B2; /* BARRIER 2 */ 492 | break; 493 | } 494 | break; 495 | } 496 | case STORE_ON_OWNED_MINE: /* 4 */ 497 | { 498 | switch (ID) 499 | { 500 | case 0: 501 | B1; /* BARRIER 1 */ 502 | sum += load_0_eventually(cache_line, reps); 503 | B2; /* BARRIER 2 */ 504 | break; 505 | case 1: 506 | store_0_eventually(cache_line, reps); 507 | B1; /* BARRIER 1 */ 508 | B2; /* BARRIER 2 */ 509 | store_0_eventually_pfd1(cache_line, reps); 510 | break; 511 | default: 512 | B1; /* BARRIER 1 */ 513 | sum += load_0_eventually_no_pf(cache_line); 514 | B2; /* BARRIER 2 */ 515 | break; 516 | } 517 | break; 518 | } 519 | case STORE_ON_OWNED: /* 5 */ 520 | { 521 | switch (ID) 522 | { 523 | case 0: 524 | store_0_eventually(cache_line, reps); 525 | B1; /* BARRIER 1 */ 526 | B2; /* BARRIER 2 */ 527 | break; 528 | case 1: 529 | B1; /* BARRIER 1 */ 530 | sum += load_0_eventually(cache_line, reps); 531 | B2; /* BARRIER 2 */ 532 | store_0_eventually_pfd1(cache_line, reps); 533 | break; 534 | default: 535 | B1; /* BARRIER 1 */ 536 | sum += load_0_eventually_no_pf(cache_line); 537 | B2; /* BARRIER 2 */ 538 | break; 539 | } 540 | break; 541 | } 542 | case STORE_ON_INVALID: /* 6 */ 543 | { 544 | switch (ID) 545 | { 546 | case 0: 547 | B1; 548 | /* store_0_eventually(cache_line, reps); */ 549 | store_0(cache_line, reps); 550 | if (!test_flush) 551 | { 552 | cache_line += test_stride; 553 | } 554 | break; 555 | case 1: 556 | invalidate(cache_line, 0, reps); 557 | if (!test_flush) 558 | { 559 | cache_line += test_stride; 560 | } 561 | B1; 562 | break; 563 | default: 564 | B1; 565 | break; 566 | } 567 | break; 568 | } 569 | case LOAD_FROM_MODIFIED: /* 7 */ 570 | { 571 | switch (ID) 572 | { 573 | case 0: 574 | store_0_eventually(cache_line, reps); 575 | B1; 576 | break; 577 | case 1: 578 | B1; /* BARRIER 1 */ 579 | sum += 
load_0_eventually(cache_line, reps); 580 | break; 581 | default: 582 | B1; 583 | break; 584 | } 585 | break; 586 | } 587 | case LOAD_FROM_EXCLUSIVE: /* 8 */ 588 | { 589 | switch (ID) 590 | { 591 | case 0: 592 | sum += load_0_eventually(cache_line, reps); 593 | B1; /* BARRIER 1 */ 594 | 595 | if (!test_flush) 596 | { 597 | cache_line += test_stride; 598 | } 599 | break; 600 | case 1: 601 | B1; /* BARRIER 1 */ 602 | sum += load_0_eventually(cache_line, reps); 603 | 604 | if (!test_flush) 605 | { 606 | cache_line += test_stride; 607 | } 608 | break; 609 | default: 610 | B1; /* BARRIER 1 */ 611 | break; 612 | } 613 | break; 614 | } 615 | case LOAD_FROM_SHARED: /* 9 */ 616 | { 617 | switch (ID) 618 | { 619 | case 0: 620 | sum += load_0_eventually(cache_line, reps); 621 | B1; /* BARRIER 1 */ 622 | B2; /* BARRIER 2 */ 623 | break; 624 | case 1: 625 | B1; /* BARRIER 1 */ 626 | sum += load_0_eventually(cache_line, reps); 627 | B2; /* BARRIER 2 */ 628 | break; 629 | case 2: 630 | B1; /* BARRIER 1 */ 631 | B2; /* BARRIER 2 */ 632 | sum += load_0_eventually(cache_line, reps); 633 | break; 634 | default: 635 | B1; /* BARRIER 1 */ 636 | sum += load_0_eventually_no_pf(cache_line); 637 | B2; /* BARRIER 2 */ 638 | break; 639 | } 640 | 641 | if (!test_flush) 642 | { 643 | cache_line += test_stride; 644 | } 645 | break; 646 | } 647 | case LOAD_FROM_OWNED: /* 10 */ 648 | { 649 | switch (ID) 650 | { 651 | case 0: 652 | store_0_eventually(cache_line, reps); 653 | B1; /* BARRIER 1 */ 654 | B2; /* BARRIER 2 */ 655 | break; 656 | case 1: 657 | B1; /* BARRIER 1 */ 658 | sum += load_0_eventually(cache_line, reps); 659 | B2; /* BARRIER 2 */ 660 | break; 661 | case 2: 662 | B1; /* BARRIER 1 */ 663 | B2; /* BARRIER 2 */ 664 | sum += load_0_eventually(cache_line, reps); 665 | break; 666 | default: 667 | B1; /* BARRIER 1 */ 668 | B2; /* BARRIER 2 */ 669 | break; 670 | } 671 | break; 672 | } 673 | case LOAD_FROM_INVALID: /* 11 */ 674 | { 675 | switch (ID) 676 | { 677 | case 0: 678 | B1; /* BARRIER 
1 */ 679 | sum += load_0_eventually(cache_line, reps); /* sum += load_0(cache_line, reps); */ 680 | break; 681 | case 1: 682 | invalidate(cache_line, 0, reps); 683 | B1; /* BARRIER 1 */ 684 | break; 685 | default: 686 | B1; /* BARRIER 1 */ 687 | break; 688 | } 689 | 690 | if (!test_flush) 691 | { 692 | cache_line += test_stride; 693 | } 694 | break; 695 | } 696 | case CAS: /* 12 */ 697 | { 698 | switch (ID) 699 | { 700 | case 0: 701 | sum += cas_0_eventually(cache_line, reps); 702 | B1; /* BARRIER 1 */ 703 | break; 704 | case 1: 705 | B1; /* BARRIER 1 */ 706 | sum += cas_0_eventually(cache_line, reps); 707 | break; 708 | default: 709 | B1; /* BARRIER 1 */ 710 | break; 711 | } 712 | break; 713 | } 714 | case FAI: /* 13 */ 715 | { 716 | switch (ID) 717 | { 718 | case 0: 719 | sum += fai(cache_line, reps); 720 | B1; /* BARRIER 1 */ 721 | break; 722 | case 1: 723 | B1; /* BARRIER 1 */ 724 | sum += fai(cache_line, reps); 725 | break; 726 | default: 727 | B1; /* BARRIER 1 */ 728 | break; 729 | } 730 | break; 731 | } 732 | case TAS: /* 14 */ 733 | { 734 | switch (ID) 735 | { 736 | case 0: 737 | sum += tas(cache_line, reps); 738 | B1; /* BARRIER 1 */ 739 | B2; /* BARRIER 2 */ 740 | break; 741 | case 1: 742 | B1; /* BARRIER 1 */ 743 | sum += tas(cache_line, reps); 744 | _mm_mfence(); 745 | cache_line->word[0] = 0; 746 | B2; /* BARRIER 2 */ 747 | break; 748 | default: 749 | B1; /* BARRIER 1 */ 750 | B2; /* BARRIER 2 */ 751 | break; 752 | } 753 | break; 754 | } 755 | case SWAP: /* 15 */ 756 | { 757 | switch (ID) 758 | { 759 | case 0: 760 | sum += swap(cache_line, reps); 761 | B1; /* BARRIER 1 */ 762 | break; 763 | case 1: 764 | B1; /* BARRIER 1 */ 765 | sum += swap(cache_line, reps); 766 | break; 767 | default: 768 | B1; /* BARRIER 1 */ 769 | break; 770 | } 771 | break; 772 | } 773 | case CAS_ON_MODIFIED: /* 16 */ 774 | { 775 | switch (ID) 776 | { 777 | case 0: 778 | store_0_eventually(cache_line, reps); 779 | if (test_ao_success) 780 | { 781 | cache_line->word[0] = reps & 
0x01; 782 | } 783 | B1; /* BARRIER 1 */ 784 | break; 785 | case 1: 786 | B1; /* BARRIER 1 */ 787 | sum += cas_0_eventually(cache_line, reps); 788 | break; 789 | default: 790 | B1; /* BARRIER 1 */ 791 | break; 792 | } 793 | break; 794 | } 795 | case FAI_ON_MODIFIED: /* 17 */ 796 | { 797 | switch (ID) 798 | { 799 | case 0: 800 | store_0_eventually(cache_line, reps); 801 | B1; /* BARRIER 1 */ 802 | break; 803 | case 1: 804 | B1; /* BARRIER 1 */ 805 | sum += fai(cache_line, reps); 806 | break; 807 | default: 808 | B1; /* BARRIER 1 */ 809 | break; 810 | } 811 | break; 812 | } 813 | case TAS_ON_MODIFIED: /* 18 */ 814 | { 815 | switch (ID) 816 | { 817 | case 0: 818 | store_0_eventually(cache_line, reps); 819 | if (!test_ao_success) 820 | { 821 | cache_line->word[0] = 0xFFFFFFFF; 822 | _mm_mfence(); 823 | } 824 | B1; /* BARRIER 1 */ 825 | break; 826 | case 1: 827 | B1; /* BARRIER 1 */ 828 | sum += tas(cache_line, reps); 829 | break; 830 | default: 831 | B1; /* BARRIER 1 */ 832 | break; 833 | } 834 | break; 835 | } 836 | case SWAP_ON_MODIFIED: /* 19 */ 837 | { 838 | switch (ID) 839 | { 840 | case 0: 841 | store_0_eventually(cache_line, reps); 842 | B1; /* BARRIER 1 */ 843 | break; 844 | case 1: 845 | B1; /* BARRIER 1 */ 846 | sum += swap(cache_line, reps); 847 | break; 848 | default: 849 | B1; /* BARRIER 1 */ 850 | break; 851 | } 852 | break; 853 | } 854 | case CAS_ON_SHARED: /* 20 */ 855 | { 856 | switch (ID) 857 | { 858 | case 0: 859 | sum += load_0_eventually(cache_line, reps); 860 | B1; /* BARRIER 1 */ 861 | B2; /* BARRIER 2 */ 862 | break; 863 | case 1: 864 | B1; /* BARRIER 1 */ 865 | B2; /* BARRIER 2 */ 866 | sum += cas_0_eventually(cache_line, reps); 867 | break; 868 | case 2: 869 | B1; /* BARRIER 1 */ 870 | sum += load_0_eventually(cache_line, reps); 871 | B2; /* BARRIER 2 */ 872 | break; 873 | default: 874 | B1; /* BARRIER 1 */ 875 | sum += load_0_eventually_no_pf(cache_line); 876 | B2; /* BARRIER 2 */ 877 | break; 878 | } 879 | break; 880 | } 881 | case 
FAI_ON_SHARED: /* 21 */ 882 | { 883 | switch (ID) 884 | { 885 | case 0: 886 | sum += load_0_eventually(cache_line, reps); 887 | B1; /* BARRIER 1 */ 888 | B2; /* BARRIER 2 */ 889 | break; 890 | case 1: 891 | B1; /* BARRIER 1 */ 892 | B2; /* BARRIER 2 */ 893 | sum += fai(cache_line, reps); 894 | break; 895 | case 2: 896 | B1; /* BARRIER 1 */ 897 | sum += load_0_eventually(cache_line, reps); 898 | B2; /* BARRIER 2 */ 899 | break; 900 | default: 901 | B1; /* BARRIER 1 */ 902 | sum += load_0_eventually_no_pf(cache_line); 903 | B2; /* BARRIER 2 */ 904 | break; 905 | } 906 | break; 907 | } 908 | case TAS_ON_SHARED: /* 22 */ 909 | { 910 | switch (ID) 911 | { 912 | case 0: 913 | if (test_ao_success) 914 | { 915 | cache_line->word[0] = 0; 916 | } 917 | else 918 | { 919 | cache_line->word[0] = 0xFFFFFFFF; 920 | } 921 | sum += load_0_eventually(cache_line, reps); 922 | B1; /* BARRIER 1 */ 923 | B2; /* BARRIER 2 */ 924 | break; 925 | case 1: 926 | B1; /* BARRIER 1 */ 927 | B2; /* BARRIER 2 */ 928 | sum += tas(cache_line, reps); 929 | break; 930 | case 2: 931 | B1; /* BARRIER 1 */ 932 | sum += load_0_eventually(cache_line, reps); 933 | B2; /* BARRIER 2 */ 934 | break; 935 | default: 936 | B1; /* BARRIER 1 */ 937 | sum += load_0_eventually_no_pf(cache_line); 938 | B2; /* BARRIER 2 */ 939 | break; 940 | } 941 | break; 942 | } 943 | case SWAP_ON_SHARED: /* 23 */ 944 | { 945 | switch (ID) 946 | { 947 | case 0: 948 | sum += load_0_eventually(cache_line, reps); 949 | B1; /* BARRIER 1 */ 950 | B2; /* BARRIER 2 */ 951 | break; 952 | case 1: 953 | B1; /* BARRIER 1 */ 954 | B2; /* BARRIER 2 */ 955 | sum += swap(cache_line, reps); 956 | break; 957 | case 2: 958 | B1; /* BARRIER 1 */ 959 | sum += load_0_eventually(cache_line, reps); 960 | B2; /* BARRIER 2 */ 961 | break; 962 | default: 963 | B1; /* BARRIER 1 */ 964 | sum += load_0_eventually_no_pf(cache_line); 965 | B2; /* BARRIER 2 */ 966 | break; 967 | } 968 | break; 969 | } 970 | case CAS_CONCURRENT: /* 24 */ 971 | { 972 | switch (ID) 
973 | { 974 | case 0: 975 | case 1: 976 | sum += cas(cache_line, reps); 977 | break; 978 | default: 979 | sum += cas_no_pf(cache_line, reps); 980 | break; 981 | } 982 | break; 983 | } 984 | case FAI_ON_INVALID: /* 25 */ 985 | { 986 | switch (ID) 987 | { 988 | case 0: 989 | B1; /* BARRIER 1 */ 990 | sum += fai(cache_line, reps); 991 | break; 992 | case 1: 993 | invalidate(cache_line, 0, reps); 994 | B1; /* BARRIER 1 */ 995 | break; 996 | default: 997 | B1; /* BARRIER 1 */ 998 | break; 999 | } 1000 | 1001 | if (!test_flush) 1002 | { 1003 | cache_line += test_stride; 1004 | } 1005 | break; 1006 | } 1007 | case LOAD_FROM_L1: /* 26 */ 1008 | { 1009 | if (ID == 0) 1010 | { 1011 | sum += load_0(cache_line, reps); 1012 | sum += load_0(cache_line, reps); 1013 | sum += load_0(cache_line, reps); 1014 | } 1015 | break; 1016 | } 1017 | case LOAD_FROM_MEM_SIZE: /* 27 */ 1018 | { 1019 | if (ID < 3) 1020 | { 1021 | sum += load_next(cl, reps); 1022 | } 1023 | } 1024 | break; 1025 | case LFENCE: /* 28 */ 1026 | if (ID < 2) 1027 | { 1028 | PFDI(0); 1029 | _mm_lfence(); 1030 | PFDO(0, reps); 1031 | } 1032 | break; 1033 | case SFENCE: /* 29 */ 1034 | if (ID < 2) 1035 | { 1036 | PFDI(0); 1037 | _mm_sfence(); 1038 | PFDO(0, reps); 1039 | } 1040 | break; 1041 | case MFENCE: /* 30 */ 1042 | if (ID < 2) 1043 | { 1044 | PFDI(0); 1045 | _mm_mfence(); 1046 | PFDO(0, reps); 1047 | } 1048 | break; 1049 | case PAUSE: /* 31 */ 1050 | if (ID < 2) 1051 | { 1052 | PFDI(0); 1053 | _mm_pause(); 1054 | PFDO(0, reps); 1055 | } 1056 | break; 1057 | case NOP: /* 32 */ 1058 | if (ID < 2) 1059 | { 1060 | PFDI(0); 1061 | asm volatile ("nop"); 1062 | PFDO(0, reps); 1063 | } 1064 | break; 1065 | case PROFILER: /* 30 */ 1066 | default: 1067 | PFDI(0); 1068 | asm volatile (""); 1069 | PFDO(0, reps); 1070 | break; 1071 | } 1072 | 1073 | B3; /* BARRIER 3 */ 1074 | } 1075 | 1076 | if (!test_verbose) 1077 | { 1078 | test_print = 0; 1079 | } 1080 | 1081 | uint32_t id; 1082 | for (id = 0; id < test_cores; id++) 1083 | 
{ 1084 | if (ID == id && ID < 3) 1085 | { 1086 | switch (test_test) 1087 | { 1088 | case STORE_ON_OWNED_MINE: 1089 | case STORE_ON_OWNED: 1090 | if (ID < 2) 1091 | { 1092 | PRINT(" *** Core %2d ************************************************************************************", ID); 1093 | PFDPN(0, test_reps, test_print); 1094 | if (ID == 1) 1095 | { 1096 | PFDPN(1, test_reps, test_print); 1097 | } 1098 | } 1099 | break; 1100 | case CAS_CONCURRENT: 1101 | if (ID < 2) 1102 | { 1103 | PRINT(" *** Core %2d ************************************************************************************", ID); 1104 | PFDPN(0, test_reps, test_print); 1105 | } 1106 | break; 1107 | case LOAD_FROM_L1: 1108 | if (ID < 1) 1109 | { 1110 | PRINT(" *** Core %2d ************************************************************************************", ID); 1111 | PFDPN(0, test_reps, test_print); 1112 | } 1113 | break; 1114 | case LOAD_FROM_MEM_SIZE: 1115 | if (ID < 3) 1116 | { 1117 | PRINT(" *** Core %2d ************************************************************************************", ID); 1118 | PFDPN(0, test_reps, test_print); 1119 | } 1120 | break; 1121 | default: 1122 | PRINT(" *** Core %2d ************************************************************************************", ID); 1123 | PFDPN(0, test_reps, test_print); 1124 | } 1125 | } 1126 | B0; 1127 | } 1128 | B10; 1129 | 1130 | 1131 | if (ID == 0) 1132 | { 1133 | switch (test_test) 1134 | { 1135 | case STORE_ON_MODIFIED: 1136 | { 1137 | if (test_flush) 1138 | { 1139 | PRINT(" ** Results from Core 0 : store on invalid"); 1140 | PRINT(" ** Results from Core 1 : store on modified"); 1141 | } 1142 | else 1143 | { 1144 | PRINT(" ** Results from Core 0 and 1 : store on modified"); 1145 | } 1146 | break; 1147 | } 1148 | case STORE_ON_MODIFIED_NO_SYNC: 1149 | { 1150 | if (test_flush) 1151 | { 1152 | PRINT(" ** Results do not make sense"); 1153 | } 1154 | else 1155 | { 1156 | PRINT(" ** Results from Core 0 and 1 : store on modified while 
another core is " 1157 | "also trying to do the same"); 1158 | } 1159 | break; 1160 | } 1161 | case STORE_ON_EXCLUSIVE: 1162 | { 1163 | if (test_flush) 1164 | { 1165 | PRINT(" ** Results from Core 0 : load from invalid"); 1166 | } 1167 | else 1168 | { 1169 | PRINT(" ** Results from Core 0 : load from invalid, BUT could have prefetching"); 1170 | } 1171 | PRINT(" ** Results from Core 1 : store on exclusive"); 1172 | break; 1173 | } 1174 | case STORE_ON_SHARED: 1175 | { 1176 | PRINT(" ** Results from Core 0 & 2: load from modified and exclusive or shared, respectively"); 1177 | PRINT(" ** Results from Core 1 : store on shared"); 1178 | if (test_cores < 3) 1179 | { 1180 | PRINT(" ** Need >=3 processes to achieve STORE_ON_SHARED"); 1181 | } 1182 | break; 1183 | } 1184 | case STORE_ON_OWNED_MINE: 1185 | { 1186 | PRINT(" ** Results from Core 0 : load from modified (makes it owned, if owned state is supported)"); 1187 | if (test_flush) 1188 | { 1189 | PRINT(" ** Results 1 from Core 1 : store to invalid"); 1190 | } 1191 | else 1192 | { 1193 | PRINT(" ** Results 1 from Core 1 : store to modified mine"); 1194 | } 1195 | 1196 | PRINT(" ** Results 2 from Core 1 : store to owned mine (if owned is supported, else exclusive)"); 1197 | break; 1198 | } 1199 | case STORE_ON_OWNED: 1200 | { 1201 | if (test_flush) 1202 | { 1203 | PRINT(" ** Results from Core 0 : store to modified"); 1204 | } 1205 | else 1206 | { 1207 | PRINT(" ** Results from Core 0 : store to invalid"); 1208 | } 1209 | PRINT(" ** Results 1 from Core 1 : load from modified (makes it owned, if owned state is supported)"); 1210 | PRINT(" ** Results 2 from Core 1 : store to owned (if owned is supported, else exclusive mine)"); 1211 | break; 1212 | } 1213 | case LOAD_FROM_MODIFIED: 1214 | { 1215 | if (test_flush) 1216 | { 1217 | PRINT(" ** Results from Core 0 : store to invalid"); 1218 | } 1219 | else 1220 | { 1221 | PRINT(" ** Results from Core 0 : store to owned mine (if owned state supported, else exclusive)"); 1222 | 
} 1223 | 1224 | PRINT(" ** Results from Core 1 : load from modified (makes it owned, if owned state supported)"); 1225 | 1226 | break; 1227 | } 1228 | case LOAD_FROM_EXCLUSIVE: 1229 | { 1230 | if (test_flush) 1231 | { 1232 | PRINT(" ** Results from Core 0 : load from invalid"); 1233 | } 1234 | else 1235 | { 1236 | PRINT(" ** Results from Core 0 : load from invalid, BUT could have prefetching"); 1237 | } 1238 | PRINT(" ** Results from Core 1 : load from exclusive"); 1239 | 1240 | break; 1241 | } 1242 | case STORE_ON_INVALID: 1243 | { 1244 | PRINT(" ** Results from Core 0 : store on invalid"); 1245 | PRINT(" ** Results from Core 1 : cache line flush"); 1246 | break; 1247 | } 1248 | case LOAD_FROM_INVALID: 1249 | { 1250 | PRINT(" ** Results from Core 0 : load from invalid"); 1251 | PRINT(" ** Results from Core 1 : cache line flush"); 1252 | break; 1253 | } 1254 | case LOAD_FROM_SHARED: 1255 | { 1256 | if (test_flush) 1257 | { 1258 | PRINT(" ** Results from Core 0 : load from invalid"); 1259 | } 1260 | else 1261 | { 1262 | PRINT(" ** Results from Core 0 : load from invalid, BUT could have prefetching"); 1263 | } 1264 | PRINT(" ** Results from Core 1 : load from exclusive"); 1265 | if (test_cores >= 3) 1266 | { 1267 | PRINT(" ** Results from Core 2 : load from shared"); 1268 | } 1269 | else 1270 | { 1271 | PRINT(" ** Need >=3 processes to achieve LOAD_FROM_SHARED"); 1272 | } 1273 | break; 1274 | } 1275 | case LOAD_FROM_OWNED: 1276 | { 1277 | if (test_flush) 1278 | { 1279 | PRINT(" ** Results from Core 0 : store to invalid"); 1280 | } 1281 | else 1282 | { 1283 | PRINT(" ** Results from Core 0 : store to owned mine (if owned is supported, else shared)"); 1284 | } 1285 | PRINT(" ** Results from Core 1 : load from modified"); 1286 | if (test_cores == 3) 1287 | { 1288 | PRINT(" ** Results from Core 2 : load from owned"); 1289 | } 1290 | else 1291 | { 1292 | PRINT(" ** Need 3 processes to achieve LOAD_FROM_OWNED"); 1293 | } 1294 | break; 1295 | } 1296 | case CAS: 1297 | { 
1298 | PRINT(" ** Results from Core 0 : CAS successfull"); 1299 | PRINT(" ** Results from Core 1 : CAS unsuccessfull"); 1300 | break; 1301 | } 1302 | case FAI: 1303 | { 1304 | PRINT(" ** Results from Cores 0 & 1: FAI"); 1305 | break; 1306 | } 1307 | case TAS: 1308 | { 1309 | PRINT(" ** Results from Core 0 : TAS successfull"); 1310 | PRINT(" ** Results from Core 1 : TAS unsuccessfull"); 1311 | break; 1312 | } 1313 | case SWAP: 1314 | { 1315 | PRINT(" ** Results from Cores 0 & 1: SWAP"); 1316 | break; 1317 | } 1318 | case CAS_ON_MODIFIED: 1319 | { 1320 | PRINT(" ** Results from Core 0 : store on modified"); 1321 | uint32_t succ = 50 + test_ao_success * 50; 1322 | PRINT(" ** Results from Core 1 : CAS on modified (%d%% successfull)", succ); 1323 | break; 1324 | } 1325 | case FAI_ON_MODIFIED: 1326 | { 1327 | PRINT(" ** Results from Core 0 : store on modified"); 1328 | PRINT(" ** Results from Core 1 : FAI on modified"); 1329 | break; 1330 | } 1331 | case TAS_ON_MODIFIED: 1332 | { 1333 | PRINT(" ** Results from Core 0 : store on modified"); 1334 | uint32_t succ = test_ao_success * 100; 1335 | PRINT(" ** Results from Core 1 : TAS on modified (%d%% successfull)", succ); 1336 | break; 1337 | } 1338 | case SWAP_ON_MODIFIED: 1339 | { 1340 | PRINT(" ** Results from Core 0 : store on modified"); 1341 | PRINT(" ** Results from Core 1 : SWAP on modified"); 1342 | break; 1343 | } 1344 | case CAS_ON_SHARED: 1345 | { 1346 | PRINT(" ** Results from Core 0 : load from modified"); 1347 | PRINT(" ** Results from Core 1 : CAS on shared (100%% successfull)"); 1348 | PRINT(" ** Results from Core 2 : load from exlusive or shared"); 1349 | if (test_cores < 3) 1350 | { 1351 | PRINT(" ** Need >=3 processes to achieve CAS_ON_SHARED"); 1352 | } 1353 | break; 1354 | } 1355 | case FAI_ON_SHARED: 1356 | { 1357 | PRINT(" ** Results from Core 0 : load from modified"); 1358 | PRINT(" ** Results from Core 1 : FAI on shared"); 1359 | PRINT(" ** Results from Core 2 : load from exlusive or shared"); 1360 | 
if (test_cores < 3) 1361 | { 1362 | PRINT(" ** Need >=3 processes to achieve FAI_ON_SHARED"); 1363 | } 1364 | break; 1365 | } 1366 | case TAS_ON_SHARED: 1367 | { 1368 | PRINT(" ** Results from Core 0 : load from L1"); 1369 | uint32_t succ = test_ao_success * 100; 1370 | PRINT(" ** Results from Core 1 : TAS on shared (%d%% successfull)", succ); 1371 | PRINT(" ** Results from Core 2 : load from exlusive or shared"); 1372 | if (test_cores < 3) 1373 | { 1374 | PRINT(" ** Need >=3 processes to achieve TAS_ON_SHARED"); 1375 | } 1376 | break; 1377 | } 1378 | case SWAP_ON_SHARED: 1379 | { 1380 | PRINT(" ** Results from Core 0 : load from modified"); 1381 | PRINT(" ** Results from Core 1 : SWAP on shared"); 1382 | PRINT(" ** Results from Core 2 : load from exlusive or shared"); 1383 | if (test_cores < 3) 1384 | { 1385 | PRINT(" ** Need >=3 processes to achieve SWAP_ON_SHARED"); 1386 | } 1387 | break; 1388 | } 1389 | case CAS_CONCURRENT: 1390 | { 1391 | PRINT(" ** Results from Cores 0 & 1: CAS concurrent"); 1392 | break; 1393 | } 1394 | case FAI_ON_INVALID: 1395 | { 1396 | PRINT(" ** Results from Core 0 : FAI on invalid"); 1397 | PRINT(" ** Results from Core 1 : cache line flush"); 1398 | break; 1399 | } 1400 | case LOAD_FROM_L1: 1401 | { 1402 | PRINT(" ** Results from Core 0: load from L1"); 1403 | break; 1404 | } 1405 | case LOAD_FROM_MEM_SIZE: 1406 | { 1407 | PRINT(" ** Results from Corees 0 & 1 & 2: load from random %zu KiB", test_mem_size / 1024); 1408 | break; 1409 | } 1410 | case LFENCE: 1411 | { 1412 | PRINT(" ** Results from Cores 0 & 1: load fence"); 1413 | break; 1414 | } 1415 | case SFENCE: 1416 | { 1417 | PRINT(" ** Results from Cores 0 & 1: store fence"); 1418 | break; 1419 | } 1420 | case MFENCE: 1421 | { 1422 | PRINT(" ** Results from Cores 0 & 1: full fence"); 1423 | break; 1424 | } 1425 | case PROFILER: 1426 | { 1427 | PRINT(" ** Results from Cores 0 & 1: empty profiler region (start_prof - empty - stop_prof"); 1428 | break; 1429 | } 1430 | 1431 | default: 
      /* (tail of main(), whose start is above this chunk)
         Closes the per-test "Results from ..." legend switch and the
         enclosing `if (ID == 0)` block, then tears the benchmark down. */
      break;
    }
  }

  B0; /* project barrier macro: sync all processes before teardown */


  if (ID < 3)
  {
    /* Print the final cache-line word and the accumulated sum of loads/ops
       (`sum` is consumed here so the compiler cannot elide the timed loads). */
    PRINT(" value of cl is %-10u / sum is %llu", cache_line->word[0], (LLU) sum);
  }
  cache_line_close(ID, "cache_line");
  barriers_term(ID);
  return 0;

}

/* One CAS on cl->word, timed with PFD probe 0.
 * Expected old value alternates with the repetition parity (reps & 1);
 * the swapped-in value is its negation. Returns 1 iff the CAS succeeded
 * (i.e. the value read back equals the expected old value). */
uint32_t
cas(volatile cache_line_t* cl, volatile uint64_t reps)
{
  uint8_t o = reps & 0x1;
  uint8_t no = !o;
  volatile uint32_t r;

  PFDI(0);
  r = CAS_U32(cl->word, o, no);
  PFDO(0, reps);

  return (r == o);
}

/* Same single CAS as cas(), but without the PFD timing probes
 * (used for the extra processes whose latency is not being recorded). */
uint32_t
cas_no_pf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  uint8_t o = reps & 0x1;
  uint8_t no = !o;
  volatile uint32_t r;
  r = CAS_U32(cl->word, o, no);

  return (r == o);
}

/* Timed CAS on a randomly drawn cache line cl + clrand(), repeated until
 * the draw is 0 — so the result returned is from the cln == 0 iteration.
 * The random stride is there to fool hardware prefetchers (see the
 * --stride help text printed by main). Each attempt is timed with probe 0. */
uint32_t
cas_0_eventually(volatile cache_line_t* cl, volatile uint64_t reps)
{
  uint8_t o = reps & 0x1;
  uint8_t no = !o;
  volatile uint32_t r;

  uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile cache_line_t* cl1 = cl + cln;
      PFDI(0);
      r = CAS_U32(cl1->word, o, no);
      PFDO(0, reps);
    }
  while (cln > 0);

  return (r == o);
}

/* Timed fetch-and-increment on a randomly drawn cache line, repeated until
 * the draw is 0 (same prefetcher-defeating pattern as cas_0_eventually).
 * Returns the value fetched by the final (cln == 0) FAI. */
uint32_t
fai(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t t = 0;

  uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile cache_line_t* cl1 = cl + cln;
      PFDI(0);
      t = FAI_U32(cl1->word);
      PFDO(0, reps);
    }
  while (cln > 0);

  return t;
}

/* Timed test-and-set on a randomly drawn cache line, repeated until the
 * draw is 0. On Tilera the TAS operand is a 32-bit word; elsewhere it is
 * the first byte of the line. Returns 1 iff the final TAS acquired the
 * flag — presumes TAS_U8 yields 255 when the flag was already set;
 * confirm against include/atomic_ops.h. */
uint8_t
tas(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint8_t r;

  uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile cache_line_t* cl1 = cl + cln;
#if defined(TILERA)
      volatile uint32_t* b = (volatile uint32_t*) cl1->word;
#else
      volatile uint8_t* b = (volatile uint8_t*) cl1->word;
#endif

      PFDI(0);
      r = TAS_U8(b);
      PFDO(0, reps);
    }
  while (cln > 0);

  return (r != 255);
}

/* Timed atomic swap (exchange with this process's ID) on a randomly drawn
 * cache line, repeated until the draw is 0. A full fence after the loop
 * orders the swap before the caller proceeds. Returns the value the final
 * swap displaced. */
uint32_t
swap(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t res;

  uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile cache_line_t* cl1 = cl + cln;
      PFDI(0);
      res = SWAP_U32(cl1->word, ID);
      PFDO(0, reps);
    }
  while (cln > 0);

  _mm_mfence();
  return res;
}

/* One timed store of `reps` to cl->word[0], with the fence level selected
 * by the global test_sfence: 0 = no fence, 1 = store fence inside the
 * timed region, 2 = full fence inside the timed region.
 * Note: test_sfence == 3 (double write) is not handled here. */
void
store_0(volatile cache_line_t* cl, volatile uint64_t reps)
{
  if (test_sfence == 0)
    {
      PFDI(0);
      cl->word[0] = reps;
      PFDO(0, reps);
    }
  else if (test_sfence == 1)
    {
      PFDI(0);
      cl->word[0] = reps;
      _mm_sfence();
      PFDO(0, reps);
    }
  else if (test_sfence == 2)
    {
      PFDI(0);
      cl->word[0] = reps;
      _mm_mfence();
      PFDO(0, reps);
    }
}

/* Untimed store of `reps` to cl->word[0] followed by the configured fence
 * (1 = sfence, 2 = mfence; otherwise none). Companion of store_0 for the
 * processes whose latency is not recorded. */
void
store_0_no_pf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  cl->word[0] = reps;
  if (test_sfence == 1)
    {
      _mm_sfence();
    }
  else if (test_sfence == 2)
    {
      _mm_mfence();
    }
}

/* store_0_eventually variant: timed store + store fence on a randomly
 * drawn cache line, repeated until the draw is 0 (probe 0). */
static void
store_0_eventually_sf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile uint32_t *w = &cl[cln].word[0];
      PFDI(0);
      w[0] = cln;
      _mm_sfence();
      PFDO(0, reps);
    }
  while (cln > 0);
}

/* store_0_eventually variant: timed store + full fence on a randomly
 * drawn cache line, repeated until the draw is 0 (probe 0). */
static void
store_0_eventually_mf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile uint32_t *w = &cl[cln].word[0];
      PFDI(0);
      w[0] = cln;
      _mm_mfence();
      PFDO(0, reps);
    }
  while (cln > 0);
}

/* store_0_eventually variant: timed store with no fence on a randomly
 * drawn cache line, repeated until the draw is 0 (probe 0). */
static void
store_0_eventually_nf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile uint32_t *w = &cl[cln].word[0];
      PFDI(0);
      w[0] = cln;
      PFDO(0, reps);
    }
  while (cln > 0);
}

/* store_0_eventually variant for fence level 9 ("double write"): two timed
 * stores, to w[0] and w[16], with no fence.
 * NOTE(review): with 64-byte lines of 32-bit words, w[16] is the first
 * word of the NEXT cache line — presumably the intent is to touch two
 * lines per timed region; confirm against cache_line_t in ccbench.h. */
static void
store_0_eventually_dw(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile uint32_t *w = &cl[cln].word[0];
      PFDI(0);
      w[0] = cln;
      w[16] = cln;
      PFDO(0, reps);
    }
  while (cln > 0);
}

/* Dispatch the "store eventually" primitive to the variant matching the
 * configured store-fence level (set from --fence in main):
 * 0 = none, 1 = sfence, 2 = mfence, 3 = double write. */
void
store_0_eventually(volatile cache_line_t* cl, volatile uint64_t reps)
{
  if (test_sfence == 0)
    {
      store_0_eventually_nf(cl, reps);
    }
  else if (test_sfence == 1)
    {
      store_0_eventually_sf(cl, reps);
    }
  else if (test_sfence == 2)
    {
      store_0_eventually_mf(cl, reps);
    }
  else if (test_sfence == 3)
    {
      store_0_eventually_dw(cl, reps);
    }
  /* _mm_mfence(); */
}


/* Probe-1 variant of store_0_eventually_sf: identical timed store + store
 * fence loop, recorded under PFD probe 1 (used for the second measurement
 * of the *_ON_OWNED tests). */
static void
store_0_eventually_pfd1_sf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile uint32_t *w = &cl[cln].word[0];
      PFDI(1);
      w[0] = cln;
      _mm_sfence();
      PFDO(1, reps);
    }
  while (cln > 0);
}

/* Probe-1 variant of store_0_eventually_mf: timed store + full fence,
 * recorded under PFD probe 1. */
static void
store_0_eventually_pfd1_mf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile uint32_t *w = &cl[cln].word[0];
      PFDI(1);
      w[0] = cln;
      _mm_mfence();
      PFDO(1, reps);
    }
  while (cln > 0);
}

/* Probe-1 variant of store_0_eventually_nf: timed store, no fence,
 * recorded under PFD probe 1. */
static void
store_0_eventually_pfd1_nf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  do
    {
      cln = clrand();
      volatile uint32_t *w = &cl[cln].word[0];
      PFDI(1);
      w[0] = cln;
      PFDO(1, reps);
    }
  while (cln > 0);
}

/* Probe-1 dispatcher mirroring store_0_eventually.
 * NOTE(review): unlike store_0_eventually, there is no branch for
 * test_sfence == 3 (fence level 9, "double write"); at that level this
 * function performs no store at all — confirm whether that is intended. */
void
store_0_eventually_pfd1(volatile cache_line_t* cl, volatile uint64_t reps)
{
  if (test_sfence == 0)
    {
      store_0_eventually_pfd1_nf(cl, reps);
    }
  else if (test_sfence == 1)
    {
      store_0_eventually_pfd1_sf(cl, reps);
    }
  else if (test_sfence == 2)
    {
      store_0_eventually_pfd1_mf(cl, reps);
    }
  /* _mm_mfence(); */
}

/* "Load eventually" variant: timed load + load fence from a randomly drawn
 * cache line, repeated until the draw is 0 (probe 0). Returns the value
 * read by the final (cln == 0) iteration. */
static uint64_t
load_0_eventually_lf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  volatile uint64_t val = 0;

  do
    {
      cln = clrand();
      volatile uint32_t* w = &cl[cln].word[0];
      PFDI(0);
      val = w[0];
      _mm_lfence();
      PFDO(0, reps);
    }
  while (cln > 0);
  return val;
}

/* "Load eventually" variant: timed load + full fence (probe 0).
 * Returns the value read by the final iteration. */
static uint64_t
load_0_eventually_mf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  volatile uint64_t val = 0;

  do
    {
      cln = clrand();
      volatile uint32_t* w = &cl[cln].word[0];
      PFDI(0);
      val = w[0];
      _mm_mfence();
      PFDO(0, reps);
    }
  while (cln > 0);
  return val;
}

/* "Load eventually" variant with no fence (probe 0).
 * (Body continues beyond this chunk.) */
static uint64_t
load_0_eventually_nf(volatile cache_line_t* cl, volatile uint64_t reps)
{
  volatile uint32_t cln = 0;
  volatile uint64_t val = 0;

  do
    {
      cln = clrand();
      volatile uint32_t* w = &cl[cln].word[0];
PFDI(0); 1798 | val = w[0]; 1799 | PFDO(0, reps); 1800 | } 1801 | while (cln > 0); 1802 | return val; 1803 | } 1804 | 1805 | 1806 | uint64_t 1807 | load_0_eventually(volatile cache_line_t* cl, volatile uint64_t reps) 1808 | { 1809 | uint64_t val = 0; 1810 | if (test_lfence == 0) 1811 | { 1812 | val = load_0_eventually_nf(cl, reps); 1813 | } 1814 | else if (test_lfence == 1) 1815 | { 1816 | val = load_0_eventually_lf(cl, reps); 1817 | } 1818 | else if (test_lfence == 2) 1819 | { 1820 | val = load_0_eventually_mf(cl, reps); 1821 | } 1822 | _mm_mfence(); 1823 | return val; 1824 | } 1825 | 1826 | uint64_t 1827 | load_0_eventually_no_pf(volatile cache_line_t* cl) 1828 | { 1829 | uint32_t cln = 0; 1830 | uint64_t sum = 0; 1831 | do 1832 | { 1833 | cln = clrand(); 1834 | volatile uint32_t *w = &cl[cln].word[0]; 1835 | sum = w[0]; 1836 | } 1837 | while (cln > 0); 1838 | 1839 | _mm_mfence(); 1840 | return sum; 1841 | } 1842 | 1843 | static uint64_t 1844 | load_0_lf(volatile cache_line_t* cl, volatile uint64_t reps) 1845 | { 1846 | volatile uint32_t val = 0; 1847 | volatile uint32_t* p = (volatile uint32_t*) &cl->word[0]; 1848 | PFDI(0); 1849 | val = p[0]; 1850 | _mm_lfence(); 1851 | PFDO(0, reps); 1852 | return val; 1853 | } 1854 | 1855 | static uint64_t 1856 | load_0_mf(volatile cache_line_t* cl, volatile uint64_t reps) 1857 | { 1858 | volatile uint32_t val = 0; 1859 | volatile uint32_t* p = (volatile uint32_t*) &cl->word[0]; 1860 | PFDI(0); 1861 | val = p[0]; 1862 | _mm_mfence(); 1863 | PFDO(0, reps); 1864 | return val; 1865 | } 1866 | 1867 | static uint64_t 1868 | load_0_nf(volatile cache_line_t* cl, volatile uint64_t reps) 1869 | { 1870 | volatile uint32_t val = 0; 1871 | volatile uint32_t* p = (volatile uint32_t*) &cl->word[0]; 1872 | PFDI(0); 1873 | val = p[0]; 1874 | PFDO(0, reps); 1875 | return val; 1876 | } 1877 | 1878 | 1879 | uint64_t 1880 | load_0(volatile cache_line_t* cl, volatile uint64_t reps) 1881 | { 1882 | uint64_t val = 0; 1883 | if (test_lfence == 0) 
1884 | { 1885 | val = load_0_nf(cl, reps); 1886 | } 1887 | else if (test_lfence == 1) 1888 | { 1889 | val = load_0_lf(cl, reps); 1890 | } 1891 | else if (test_lfence == 2) 1892 | { 1893 | val = load_0_mf(cl, reps); 1894 | } 1895 | _mm_mfence(); 1896 | return val; 1897 | } 1898 | 1899 | static uint64_t 1900 | load_next_lf(volatile uint64_t* cl, volatile uint64_t reps) 1901 | { 1902 | const size_t do_reps = test_cache_line_num; 1903 | PFDI(0); 1904 | int i; 1905 | for (i = 0; i < do_reps; i++) 1906 | { 1907 | cl = (uint64_t*) *cl; 1908 | _mm_lfence(); 1909 | } 1910 | PFDOR(0, reps, do_reps); 1911 | return *cl; 1912 | 1913 | } 1914 | 1915 | static uint64_t 1916 | load_next_mf(volatile uint64_t* cl, volatile uint64_t reps) 1917 | { 1918 | const size_t do_reps = test_cache_line_num; 1919 | PFDI(0); 1920 | int i; 1921 | for (i = 0; i < do_reps; i++) 1922 | { 1923 | cl = (uint64_t*) *cl; 1924 | _mm_mfence(); 1925 | } 1926 | PFDOR(0, reps, do_reps); 1927 | return *cl; 1928 | 1929 | } 1930 | 1931 | static uint64_t 1932 | load_next_nf(volatile uint64_t* cl, volatile uint64_t reps) 1933 | { 1934 | const size_t do_reps = test_cache_line_num; 1935 | PFDI(0); 1936 | int i; 1937 | for (i = 0; i < do_reps; i++) 1938 | { 1939 | cl = (uint64_t*) *cl; 1940 | } 1941 | PFDOR(0, reps, do_reps); 1942 | return *cl; 1943 | } 1944 | 1945 | uint64_t 1946 | load_next(volatile uint64_t* cl, volatile uint64_t reps) 1947 | { 1948 | uint64_t val = 0; 1949 | if (test_lfence == 0) 1950 | { 1951 | val = load_next_nf(cl, reps); 1952 | } 1953 | else if (test_lfence == 1) 1954 | { 1955 | val = load_next_lf(cl, reps); 1956 | } 1957 | else if (test_lfence == 2) 1958 | { 1959 | val = load_next_mf(cl, reps); 1960 | } 1961 | return val; 1962 | } 1963 | 1964 | void 1965 | invalidate(volatile cache_line_t* cl, uint64_t index, volatile uint64_t reps) 1966 | { 1967 | PFDI(0); 1968 | _mm_clflush((void*) (cl + index)); 1969 | PFDO(0, reps); 1970 | _mm_mfence(); 1971 | } 1972 | 1973 | static size_t 1974 | 
parse_size(char* optarg) 1975 | { 1976 | size_t test_mem_size_multi = 1; 1977 | char multi = optarg[strlen(optarg) - 1]; 1978 | if (multi == 'b' || multi == 'B') 1979 | { 1980 | optarg[strlen(optarg) - 1] = optarg[strlen(optarg)]; 1981 | multi = optarg[strlen(optarg) - 1]; 1982 | } 1983 | 1984 | if (multi == 'k' || multi == 'K') 1985 | { 1986 | test_mem_size_multi = 1024; 1987 | optarg[strlen(optarg) - 1] = optarg[strlen(optarg)]; 1988 | } 1989 | else if (multi == 'm' || multi == 'M') 1990 | { 1991 | test_mem_size_multi = 1024 * 1024LL; 1992 | optarg[strlen(optarg) - 1] = optarg[strlen(optarg)]; 1993 | } 1994 | else if (multi == 'g' || multi == 'G') 1995 | { 1996 | test_mem_size_multi = 1024 * 1024 * 1024LL; 1997 | optarg[strlen(optarg) - 1] = optarg[strlen(optarg)]; 1998 | } 1999 | 2000 | return test_mem_size_multi * atoi(optarg); 2001 | } 2002 | 2003 | volatile cache_line_t* 2004 | cache_line_open() 2005 | { 2006 | uint64_t size = test_cache_line_num * sizeof(cache_line_t); 2007 | 2008 | #if defined(__tile__) 2009 | tmc_alloc_t alloc = TMC_ALLOC_INIT; 2010 | tmc_alloc_set_shared(&alloc); 2011 | /* tmc_alloc_set_home(&alloc, TMC_ALLOC_HOME_HASH); */ 2012 | /* tmc_alloc_set_home(&alloc, MAP_CACHE_NO_LOCAL); */ 2013 | tmc_alloc_set_home(&alloc, TMC_ALLOC_HOME_HERE); 2014 | /* tmc_alloc_set_home(&alloc, TMC_ALLOC_HOME_TASK); */ 2015 | 2016 | volatile cache_line_t* cache_line = (volatile cache_line_t*) tmc_alloc_map(&alloc, size); 2017 | if (cache_line == NULL) 2018 | { 2019 | tmc_task_die("Failed to allocate memory."); 2020 | } 2021 | 2022 | tmc_cmem_init(0); /* initialize shared memory */ 2023 | 2024 | 2025 | cache_line->word[0] = 0; 2026 | 2027 | #else /* !__tile__ ****************************************************************************************/ 2028 | char keyF[100]; 2029 | sprintf(keyF, CACHE_LINE_MEM_FILE); 2030 | 2031 | int ssmpfd = shm_open(keyF, O_CREAT | O_EXCL | O_RDWR, S_IRWXU | S_IRWXG); 2032 | if (ssmpfd < 0) 2033 | { 2034 | if (errno != EEXIST) 
2035 | { 2036 | perror("In shm_open"); 2037 | exit(1); 2038 | } 2039 | 2040 | 2041 | ssmpfd = shm_open(keyF, O_CREAT | O_RDWR, S_IRWXU | S_IRWXG); 2042 | if (ssmpfd < 0) 2043 | { 2044 | perror("In shm_open"); 2045 | exit(1); 2046 | } 2047 | } 2048 | else { 2049 | // P("%s newly openned", keyF); 2050 | if (ftruncate(ssmpfd, size) < 0) { 2051 | perror("ftruncate failed\n"); 2052 | exit(1); 2053 | } 2054 | } 2055 | 2056 | volatile cache_line_t* cache_line = 2057 | (volatile cache_line_t *) mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, ssmpfd, 0); 2058 | if (cache_line == NULL) 2059 | { 2060 | perror("cache_line = NULL\n"); 2061 | exit(134); 2062 | } 2063 | 2064 | #endif /* __tile ********************************************************************************************/ 2065 | memset((void*) cache_line, '1', size); 2066 | 2067 | if (ID == 0) 2068 | { 2069 | uint32_t cl; 2070 | for (cl = 0; cl < test_cache_line_num; cl++) 2071 | { 2072 | cache_line[cl].word[0] = 0; 2073 | _mm_clflush((void*) (cache_line + cl)); 2074 | } 2075 | 2076 | if (test_test == LOAD_FROM_MEM_SIZE) 2077 | { 2078 | create_rand_list_cl((volatile uint64_t*) cache_line, test_mem_size / sizeof(uint64_t)); 2079 | } 2080 | 2081 | 2082 | } 2083 | 2084 | _mm_mfence(); 2085 | return cache_line; 2086 | } 2087 | 2088 | static void 2089 | create_rand_list_cl(volatile uint64_t* list, size_t n) 2090 | { 2091 | size_t per_cl = sizeof(cache_line_t) / sizeof(uint64_t); 2092 | n /= per_cl; 2093 | 2094 | unsigned long* s = seed_rand(); 2095 | s[0] = 0xB9E4E2F1F1E2E3D5L; 2096 | s[1] = 0xF1E2E3D5B9E4E2F1L; 2097 | s[2] = 0x9B3A0FA212342345L; 2098 | 2099 | uint8_t* used = calloc(n * per_cl, sizeof(uint8_t)); 2100 | assert (used != NULL); 2101 | 2102 | size_t idx = 0; 2103 | size_t used_num = 0; 2104 | while (used_num < n - 1) 2105 | { 2106 | used[idx] = 1; 2107 | used_num++; 2108 | 2109 | size_t nxt; 2110 | do 2111 | { 2112 | nxt = (my_random(s, s+1, s+2) % n) * per_cl; 2113 | } 2114 | while (used[nxt]); 2115 | 
2116 | list[idx] = (uint64_t) (list + nxt); 2117 | idx = nxt; 2118 | } 2119 | list[idx] = (uint64_t) (list); /* close the loop! */ 2120 | 2121 | free(s); 2122 | free(used); 2123 | } 2124 | 2125 | void 2126 | cache_line_close(const uint32_t id, const char* name) 2127 | { 2128 | #if !defined(__tile__) 2129 | if (id == 0) 2130 | { 2131 | char keyF[100]; 2132 | sprintf(keyF, CACHE_LINE_MEM_FILE); 2133 | shm_unlink(keyF); 2134 | } 2135 | #else 2136 | tmc_cmem_close(); 2137 | #endif 2138 | } 2139 | 2140 | -------------------------------------------------------------------------------- /src/pfd.c: -------------------------------------------------------------------------------- 1 | /* 2 | * File: pfd.c 3 | * Author: Vasileios Trigonakis 4 | * Description: a fine-grained profiler based on rdtsc 5 | * pfd.c is part of ccbench 6 | * 7 | * The MIT License (MIT) 8 | * 9 | * Copyright (C) 2013 Vasileios Trigonakis 10 | * 11 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 12 | * this software and associated documentation files (the "Software"), to deal in 13 | * the Software without restriction, including without limitation the rights to 14 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 15 | * the Software, and to permit persons to whom the Software is furnished to do so, 16 | * subject to the following conditions: 17 | * 18 | * The above copyright notice and this permission notice shall be included in all 19 | * copies or substantial portions of the Software. 20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 23 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 24 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 25 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 26 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | * 28 | */ 29 | 30 | #include "pfd.h" 31 | #include 32 | #include "atomic_ops.h" 33 | 34 | volatile ticks** pfd_store; 35 | volatile ticks* _pfd_s; 36 | volatile ticks pfd_correction; 37 | 38 | void 39 | pfd_store_init(uint32_t num_entries) 40 | { 41 | _pfd_s = (volatile ticks*) malloc(PFD_NUM_STORES * sizeof(ticks)); 42 | pfd_store = (volatile ticks**) malloc(PFD_NUM_STORES * sizeof(ticks*)); 43 | assert(_pfd_s != NULL && pfd_store != NULL); 44 | 45 | volatile uint32_t i; 46 | for (i = 0; i < PFD_NUM_STORES; i++) 47 | { 48 | pfd_store[i] = (ticks*) malloc(num_entries * sizeof(ticks)); 49 | assert(pfd_store[i] != NULL); 50 | PREFETCHW((void*) &pfd_store[i][0]); 51 | } 52 | 53 | int32_t tries = 10; 54 | uint32_t print_warning = 0; 55 | 56 | 57 | #if defined(XEON) || defined(OPTERON2) || defined(XEON2) || defined(DEFAULT) 58 | /* enforcing max freq if freq scaling is enabled */ 59 | volatile uint64_t speed; 60 | for (speed = 0; speed < 20e7; speed++) 61 | { 62 | asm volatile (""); 63 | } 64 | #endif /* XEON */ 65 | 66 | pfd_correction = 0; 67 | 68 | #define PFD_CORRECTION_CONF 3 69 | retry: 70 | for (i = 0; i < num_entries; i++) 71 | { 72 | PFDI(0); 73 | asm volatile (""); 74 | PFDO(0, i); 75 | } 76 | 77 | abs_deviation_t ad; 78 | get_abs_deviation(pfd_store[0], num_entries, &ad); 79 | double std_pp = 100 * (1 - (ad.avg - ad.std_dev) / ad.avg); 80 | 81 | if (std_pp > PFD_CORRECTION_CONF) 82 | { 83 | if (print_warning++ == 1) /* print warning if 2 failed attempts */ 84 | { 85 | printf("* warning: avg pfd correction is %.1f with std deviation: %.1f%%. 
Recalculating.\n", 86 | ad.avg, std_pp); 87 | } 88 | if (tries-- > 0) 89 | { 90 | goto retry; 91 | } 92 | else 93 | { 94 | printf("* warning: setting pfd correction manually\n"); 95 | #if defined(OPTERON) 96 | ad.avg = 64; 97 | #elif defined(OPTERON2) 98 | ad.avg = 68; 99 | #elif defined(XEON) || defined(XEON2) 100 | ad.avg = 20; 101 | #elif defined(NIAGARA) 102 | ad.avg = 76; 103 | #else 104 | printf("* warning: no default value for pfd correction is provided (fix in src/pfd.c)\n"); 105 | #endif 106 | } 107 | } 108 | 109 | pfd_correction = ad.avg; 110 | assert(pfd_correction > 0); 111 | 112 | printf("* set pfd correction: %llu (std deviation: %.1f%%)\n", (long long unsigned int) pfd_correction, std_pp); 113 | } 114 | 115 | static inline 116 | double absd(double x) 117 | { 118 | if (x >= 0) 119 | { 120 | return x; 121 | } 122 | else 123 | { 124 | return -x; 125 | } 126 | } 127 | 128 | 129 | #define llu long long unsigned int 130 | void 131 | print_abs_deviation(const abs_deviation_t* abs_dev) 132 | { 133 | printf("\n ---- statistics:\n"); 134 | PRINT(" avg : %-10.1f abs dev : %-10.1f std dev : %-10.1f num : %llu", 135 | abs_dev->avg, abs_dev->abs_dev, abs_dev->std_dev, (llu) abs_dev->num_vals); 136 | PRINT(" min : %-10.1f (element: %6llu) max : %-10.1f (element: %6llu)", abs_dev->min_val, 137 | (llu) abs_dev->min_val_idx, abs_dev->max_val, (llu) abs_dev->max_val_idx); 138 | double v10p = 100 * 139 | (1 - (abs_dev->num_vals - abs_dev->num_dev_10p) / (double) abs_dev->num_vals); 140 | double std_10pp = 100 * (1 - (abs_dev->avg_10p - abs_dev->std_dev_10p) / abs_dev->avg_10p); 141 | PRINT(" 0-10%% : %-10u ( %5.1f%% | avg: %6.1f | abs dev: %6.1f | std dev: %6.1f = %5.1f%% )", 142 | abs_dev->num_dev_10p, v10p, abs_dev->avg_10p, abs_dev->abs_dev_10p, abs_dev->std_dev_10p, std_10pp); 143 | double v25p = 100 144 | * (1 - (abs_dev->num_vals - abs_dev->num_dev_25p) / (double) abs_dev->num_vals); 145 | double std_25pp = 100 * (1 - (abs_dev->avg_25p - abs_dev->std_dev_25p) / 
abs_dev->avg_25p); 146 | PRINT(" 10-25%% : %-10u ( %5.1f%% | avg: %6.1f | abs dev: %6.1f | std dev: %6.1f = %5.1f%% )", 147 | abs_dev->num_dev_25p, v25p, abs_dev->avg_25p, abs_dev->abs_dev_25p, abs_dev->std_dev_25p, std_25pp); 148 | double v50p = 100 * 149 | (1 - (abs_dev->num_vals - abs_dev->num_dev_50p) / (double) abs_dev->num_vals); 150 | double std_50pp = 100 * (1 - (abs_dev->avg_50p - abs_dev->std_dev_50p) / abs_dev->avg_50p); 151 | PRINT(" 25-50%% : %-10u ( %5.1f%% | avg: %6.1f | abs dev: %6.1f | std dev: %6.1f = %5.1f%% )", 152 | abs_dev->num_dev_50p, v50p, abs_dev->avg_50p, abs_dev->abs_dev_50p, abs_dev->std_dev_50p, std_50pp); 153 | double v75p = 100 * 154 | (1 - (abs_dev->num_vals - abs_dev->num_dev_75p) / (double) abs_dev->num_vals); 155 | double std_75pp = 100 * (1 - (abs_dev->avg_75p - abs_dev->std_dev_75p) / abs_dev->avg_75p); 156 | PRINT(" 50-75%% : %-10u ( %5.1f%% | avg: %6.1f | abs dev: %6.1f | std dev: %6.1f = %5.1f%% )", 157 | abs_dev->num_dev_75p, v75p, abs_dev->avg_75p, abs_dev->abs_dev_75p, abs_dev->std_dev_75p, std_75pp); 158 | double vrest = 100 * 159 | (1 - (abs_dev->num_vals - abs_dev->num_dev_rst) / (double) abs_dev->num_vals); 160 | double std_rspp = 100 * (1 - (abs_dev->avg_rst - abs_dev->std_dev_rst) / abs_dev->avg_rst); 161 | PRINT("75-100%% : %-10u ( %5.1f%% | avg: %6.1f | abs dev: %6.1f | std dev: %6.1f = %5.1f%% )\n", 162 | abs_dev->num_dev_rst, vrest, abs_dev->avg_rst, abs_dev->abs_dev_rst, abs_dev->std_dev_rst, std_rspp); 163 | } 164 | 165 | #define PFD_VAL_UP_LIMIT 1500 /* do not consider values higher than this value */ 166 | 167 | void 168 | get_abs_deviation(volatile ticks* vals, const size_t num_vals, abs_deviation_t* abs_dev) 169 | { 170 | abs_dev->num_vals = num_vals; 171 | ticks sum_vals = 0; 172 | uint32_t i; 173 | for (i = 0; i < num_vals; i++) 174 | { 175 | if ((int64_t) vals[i] < 0 || vals[i] > PFD_VAL_UP_LIMIT) 176 | { 177 | vals[i] = 0; 178 | } 179 | sum_vals += vals[i]; 180 | } 181 | 182 | double avg = sum_vals / 
(double) num_vals; 183 | abs_dev->avg = avg; 184 | double max_val = 0; 185 | double min_val = DBL_MAX; 186 | uint64_t max_val_idx = 0, min_val_idx = 0; 187 | uint32_t num_dev_10p = 0; ticks sum_vals_10p = 0; double dev_10p = 0.1 * avg; 188 | uint32_t num_dev_25p = 0; ticks sum_vals_25p = 0; double dev_25p = 0.25 * avg; 189 | uint32_t num_dev_50p = 0; ticks sum_vals_50p = 0; double dev_50p = 0.5 * avg; 190 | uint32_t num_dev_75p = 0; ticks sum_vals_75p = 0; double dev_75p = 0.75 * avg; 191 | uint32_t num_dev_rst = 0; ticks sum_vals_rst = 0; 192 | 193 | double sum_adev = 0; /* abs deviation */ 194 | double sum_stdev = 0; /* std deviation */ 195 | for (i = 0; i < num_vals; i++) 196 | { 197 | double diff = vals[i] - avg; 198 | double ad = absd(diff); 199 | if (vals[i] > max_val) 200 | { 201 | max_val = vals[i]; 202 | max_val_idx = i; 203 | } 204 | else if (vals[i] < min_val) 205 | { 206 | min_val = vals[i]; 207 | min_val_idx = i; 208 | } 209 | 210 | if (ad <= dev_10p) 211 | { 212 | num_dev_10p++; 213 | sum_vals_10p += vals[i]; 214 | } 215 | else if (ad <= dev_25p) 216 | { 217 | num_dev_25p++; 218 | sum_vals_25p += vals[i]; 219 | } 220 | else if (ad <= dev_50p) 221 | { 222 | num_dev_50p++; 223 | sum_vals_50p += vals[i]; 224 | } 225 | else if (ad <= dev_75p) 226 | { 227 | num_dev_75p++; 228 | sum_vals_75p += vals[i]; 229 | } 230 | else 231 | { 232 | num_dev_rst++; 233 | sum_vals_rst += vals[i]; 234 | } 235 | 236 | sum_adev += ad; 237 | sum_stdev += ad*ad; 238 | } 239 | abs_dev->min_val = min_val; 240 | abs_dev->min_val_idx = min_val_idx; 241 | abs_dev->max_val = max_val; 242 | abs_dev->max_val_idx = max_val_idx; 243 | abs_dev->num_dev_10p = num_dev_10p; 244 | abs_dev->num_dev_25p = num_dev_25p; 245 | abs_dev->num_dev_50p = num_dev_50p; 246 | abs_dev->num_dev_75p = num_dev_75p; 247 | abs_dev->num_dev_rst = num_dev_rst; 248 | 249 | abs_dev->avg_10p = sum_vals_10p / (double) num_dev_10p; 250 | abs_dev->avg_25p = sum_vals_25p / (double) num_dev_25p; 251 | abs_dev->avg_50p = 
sum_vals_50p / (double) num_dev_50p; 252 | abs_dev->avg_75p = sum_vals_75p / (double) num_dev_75p; 253 | abs_dev->avg_rst = sum_vals_rst / (double) num_dev_rst; 254 | 255 | double sum_adev_10p = 0, sum_adev_25p = 0, sum_adev_50p = 0, sum_adev_75p = 0, sum_adev_rst = 0; 256 | double sum_stdev_10p = 0, sum_stdev_25p = 0, sum_stdev_50p = 0, sum_stdev_75p = 0, sum_stdev_rst = 0; 257 | 258 | /* pass again to calculate the deviations for the 10/25..p */ 259 | for (i = 0; i < num_vals; i++) 260 | { 261 | double diff = vals[i] - avg; 262 | double ad = absd(diff); 263 | if (ad <= dev_10p) 264 | { 265 | double diff = vals[i] - abs_dev->avg_10p; 266 | double ad = absd(diff); 267 | sum_adev_10p += ad; 268 | sum_stdev_10p += (ad*ad); 269 | } 270 | else if (ad <= dev_25p) 271 | { 272 | double diff = vals[i] - abs_dev->avg_25p; 273 | double ad = absd(diff); 274 | sum_adev_25p += ad; 275 | sum_stdev_25p += (ad*ad); 276 | } 277 | else if (ad <= dev_50p) 278 | { 279 | double diff = vals[i] - abs_dev->avg_50p; 280 | double ad = absd(diff); 281 | sum_adev_50p += ad; 282 | sum_stdev_50p += (ad*ad); 283 | } 284 | else if (ad <= dev_75p) 285 | { 286 | double diff = vals[i] - abs_dev->avg_75p; 287 | double ad = absd(diff); 288 | sum_adev_75p += ad; 289 | sum_stdev_75p += (ad*ad); 290 | } 291 | else 292 | { 293 | double diff = vals[i] - abs_dev->avg_rst; 294 | double ad = absd(diff); 295 | sum_adev_rst += ad; 296 | sum_stdev_rst += (ad*ad); 297 | } 298 | } 299 | 300 | abs_dev->abs_dev_10p = sum_adev_10p / num_dev_10p; 301 | abs_dev->abs_dev_25p = sum_adev_25p / num_dev_25p; 302 | abs_dev->abs_dev_50p = sum_adev_50p / num_dev_50p; 303 | abs_dev->abs_dev_75p = sum_adev_75p / num_dev_75p; 304 | abs_dev->abs_dev_rst = sum_adev_rst / num_dev_rst; 305 | 306 | abs_dev->std_dev_10p = sqrt(sum_stdev_10p / num_dev_10p); 307 | abs_dev->std_dev_25p = sqrt(sum_stdev_25p / num_dev_25p); 308 | abs_dev->std_dev_50p = sqrt(sum_stdev_50p / num_dev_50p); 309 | abs_dev->std_dev_75p = sqrt(sum_stdev_75p / 
num_dev_75p); 310 | abs_dev->std_dev_rst = sqrt(sum_stdev_rst / num_dev_rst); 311 | 312 | double adev = sum_adev / num_vals; 313 | abs_dev->abs_dev = adev; 314 | double stdev = sqrt(sum_stdev / num_vals); 315 | abs_dev->std_dev = stdev; 316 | } 317 | --------------------------------------------------------------------------------