├── .gitignore ├── LICENSE.txt ├── third-party-programs.txt ├── makefile ├── README.md ├── run.sh └── stream.c /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.bin 3 | *.log 4 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2021 Intel Corporation 2 | 3 | SPDX-License-Identifier: BSD-3-Clause 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
12 | -------------------------------------------------------------------------------- /third-party-programs.txt: -------------------------------------------------------------------------------- 1 | Third Party Programs File 2 | 3 | This file contains the list of third party software ("third party programs") 4 | contained in the Intel software and their required notices and/or license terms. 5 | This third party software, even if included with the distribution of the Intel 6 | software, may be governed by separate license terms, including without limitation, 7 | third party license terms, other Intel software license terms, and open source 8 | software license terms. These separate license terms govern your use of the third 9 | party programs as set forth in the "third-party-programs.txt" or other similarly named text file. 10 | 11 | 12 | Third party programs and their corresponding required notices and/or license terms are listed below. 13 | 14 | -------------------------------------------------------------------------------- 15 | 1. stream 16 | *----------------------------------------------------------------------- 17 | * Copyright 1991-2003: John D. McCalpin 18 | *----------------------------------------------------------------------- 19 | * License: 20 | * 1. You are free to use this program and/or to redistribute 21 | * this program. 22 | * 2. You are free to modify this program for your own use, 23 | * including commercial use, subject to the publication 24 | * restrictions in item 3. 25 | * 3. You are free to publish results obtained from running this 26 | * program, or from works that you derive from this program, 27 | * with the following limitations: 28 | * 3a. In order to be referred to as "STREAM benchmark results", 29 | * published results must be in conformance to the STREAM 30 | * Run Rules, (briefly reviewed below) published at 31 | * http://www.cs.virginia.edu/stream/ref.html 32 | * and incorporated herein by reference. 
33 | * As the copyright holder, John McCalpin retains the 34 | * right to determine conformity with the Run Rules. 35 | * 3b. Results based on modified source code or on runs not in 36 | * accordance with the STREAM Run Rules must be clearly 37 | * labelled whenever they are published. Examples of 38 | * proper labelling include: 39 | * "tuned STREAM benchmark results" 40 | * "based on a variant of the STREAM benchmark code" 41 | * Other comparable, clear and reasonable labelling is 42 | * acceptable. 43 | * 3c. Submission of results to the STREAM benchmark web site 44 | * is encouraged, but not required. 45 | * 4. Use of this program or creation of derived works based on this 46 | * program constitutes acceptance of these licensing restrictions. 47 | * 5. Absolutely no warranty is expressed or implied. 48 | *----------------------------------------------------------------------- 49 | ------------------------------------------------------------- 50 | 51 | *Other names and brands may be claimed as the property of others. 52 | 53 | ------------------------------------------------------------- 54 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2021 Intel Corporation 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | 4 | CC = icc 5 | 6 | # STREAM options: 7 | # -DNTIMES control the number of times each stream kernel is executed 8 | # -DOFFSET controls the number of bytes between each of the buffers 9 | # -DSTREAM_TYPE specifies the data-type of elements in the buffers 10 | # -DSTREAM_ARRAY_SIZE specifies the number of elements in each buffer 11 | STREAM_CPP_OPTS = -DNTIMES=100 -DOFFSET=0 -DSTREAM_TYPE=double 12 | # Size per array is approx. ~2GB. 
Deliberately using non-power of 2 elements 13 | # 256*1024*1024 elements = 268435456 elements = 2GiB with FP64 14 | STREAM_ARRAY_SIZE = 269000000 15 | 16 | ifdef size 17 | STREAM_ARRAY_SIZE = $(size) 18 | endif 19 | STREAM_CPP_OPTS += -DSTREAM_ARRAY_SIZE=$(STREAM_ARRAY_SIZE) 20 | 21 | # Intel Compiler options to control the generated ISA 22 | AVX_COPTS = -xAVX 23 | AVX2_COPTS = -xCORE-AVX2 24 | AVX512_COPTS = -xCORE-AVX512 -qopt-zmm-usage=high 25 | # Common Intel Compiler options that are independent of ISA 26 | COMMON_COPTS = -Wall -O3 -mcmodel=medium -qopenmp -shared-intel 27 | 28 | ifdef rfo 29 | COMMON_COPTS += -qopt-streaming-stores never -fno-builtin 30 | else 31 | COMMON_COPTS += -qopt-streaming-stores always 32 | endif 33 | 34 | AVX_OBJS = stream_avx.o 35 | AVX2_OBJS = stream_avx2.o 36 | AVX512_OBJS = stream_avx512.o 37 | 38 | ifdef cpu 39 | all: stream_$(cpu).bin 40 | else 41 | all: stream_avx.bin stream_avx2.bin stream_avx512.bin 42 | endif 43 | 44 | SRC = stream.c 45 | 46 | stream_avx.o: $(SRC) 47 | $(CC) $(COMMON_COPTS) $(AVX_COPTS) $(STREAM_CPP_OPTS) -c $(SRC) -o $@ 48 | stream_avx2.o: $(SRC) 49 | $(CC) $(COMMON_COPTS) $(AVX2_COPTS) $(STREAM_CPP_OPTS) -c $(SRC) -o $@ 50 | stream_avx512.o: $(SRC) 51 | $(CC) $(COMMON_COPTS) $(AVX512_COPTS) $(STREAM_CPP_OPTS) -c $(SRC) -o $@ 52 | 53 | 54 | stream_avx.bin: $(AVX_OBJS) 55 | $(CC) $(COMMON_COPTS) $(AVX_COPTS) $^ -o $@ 56 | stream_avx2.bin: $(AVX2_OBJS) 57 | $(CC) $(COMMON_COPTS) $(AVX2_COPTS) $^ -o $@ 58 | stream_avx512.bin: $(AVX512_OBJS) 59 | $(CC) $(COMMON_COPTS) $(AVX512_COPTS) $^ -o $@ 60 | 61 | help: 62 | @echo -e "Running 'make' with no options would compile the STREAM benchmark with $(STREAM_ARRAY_SIZE) FP64 elements per array for following Intel CPU's:\n" 63 | @echo -e "\tstream_avx.bin => Targeted for Intel CPU's that support AVX ISA" 64 | @echo -e "\tstream_avx2.bin => Targeted for Intel CPU's that support AVX2 ISA" 65 | @echo -e "\tstream_avx512.bin => Targeted for Intel CPU's that support AVX512 
ISA" 66 | @echo -e "\nThe following options are supported:" 67 | @echo -e "\tsize=" 68 | @echo "" 69 | @echo -e "\tcpu=" 70 | @echo "" 71 | @echo -e "\trfo=1 forces to use regular cached stores instead of non-temporal stores" 72 | @echo "" 73 | @echo -e "\nFew examples:" 74 | @echo -e "To compile STREAM benchmark only for Intel AVX512 CPU's, do:" 75 | @echo -e "\tmake cpu=avx512" 76 | @echo "" 77 | @echo -e "To compile STREAM benchmark for Intel AVX512 CPU's with each buffer containing 67108864 elements, do:" 78 | @echo -e "\tmake size=67108864 cpu=avx512" 79 | 80 | clean: 81 | rm -rf *.o *.bin 82 | 83 | .PHONY: all clean help 84 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DISCONTINUATION OF PROJECT # 2 | This project will no longer be maintained by Intel. 3 | Intel has ceased development and contributions including, but not limited to, maintenance, bug fixes, new releases, or updates, to this project. 4 | Intel no longer accepts patches to this project. 5 | If you have an ongoing need to use this project, are interested in independently developing it, or would like to maintain patches for the open source software community, please create your own fork of this project. 6 | 7 | # Memory Bandwidth Benchmarks 8 | 9 | ## Overview 10 | This repository intends to provide a set of benchmarks that can be used to measure the memory bandwidth performance of CPU's. In the initial release, we provide the de-facto memory bandwidth benchmark, [STREAM](https://www.cs.virginia.edu/stream/) [1] along with compilation and run scripts to obtain ideal performance on Intel(R) Processors. 11 | 12 | ## STREAM Overview 13 | STREAM is a simple, synthetic benchmark designed to measure sustainable memory bandwidth (in MB/s) for four simple vector kernels: Copy, Scale, Add and Triad. 
Its source code is freely available [here](https://www.cs.virginia.edu/stream/FTP/Code/) 14 | 15 | There are two categories created by STREAM benchmark author for citing memory bandwidth performance -- Standard and Tuned. Results obtained from the unmodified source code are considered as "Standard" whereas a "Tuned" category has been added to allow users or vendors to submit results based on modified source code. On Intel(R) processors, we don't need to modify the source code of the benchmark for optimal performance. We provide instructions to compile and run STREAM without any source code modifications. Hence, the performance results obtained would fall under the Standard category. 16 | 17 | The general rule for STREAM is that each array must be at least 4x the size of the sum of all the last-level caches used in the run, or 1 million elements, whichever is larger. 18 | 19 | ## Pre-requisites 20 | - Intel C Compiler: Performance of STREAM benchmark is dependent on the Compiler options used. Hence, we rely on the Intel C Compiler to generate the underlying non-temporal store instructions to achieve optimal performance on Intel CPU's. 21 | - Linux environment: Currently, the makefile assumes a Linux OS environment. 22 | 23 | ## Compilation 24 | - Ensure Intel C Compiler (icc) is available in your shell environment. 25 | - Run `make`. 
This would generate the following binaries: 26 | - stream_avx.bin => Targeted for Intel CPU's that support AVX ISA 27 | - stream_avx2.bin => Targeted for Intel CPU's that support AVX2 ISA 28 | - stream_avx512.bin => Targeted for Intel CPU's that support AVX512 ISA 29 | 30 | By default, the following STREAM configuration parameters are used in compiling the binaries: 31 | - STREAM_TYPE = double 32 | - STREAM_ARRAY_SIZE = 269000000 (this translates to about 2 GB per array of memory footprint) 33 | - NTIMES = 100 34 | - OFFSET = 0 35 | 36 | Makefile supports the following options: 37 | - size= 38 | - cpu= 39 | - rfo=1 forces to use regular cached stores instead of non-temporal stores 40 | - help 41 | 42 | Few examples: 43 | - To compile STREAM benchmark only for Intel AVX512 CPU's, do: `make cpu=avx512` 44 | - To compile STREAM benchmark for Intel AVX512 CPU's with each buffer containing 67108864 elements, do: `make size=67108864 cpu=avx512` 45 | - To explicitly use regular cached stores, do: `make rfo=1` 46 | 47 | ## Running STREAM 48 | We provide a run script (`run.sh`) that can be used for benchmarking purposes. The run script does the following -- 49 | 50 | 1. Binary: Use the appropriate STREAM binary produced from the compilation step, i.e., picks the highest supported ISA on your target system 51 | 2. OpenMP settings: Sets the OMP_NUM_THREADS to number of physical cores on the system. KMP_AFFINITY (thread affinity control variable of Intel OpenMP runtime) set to compact pinning. Ignores Hyper-threading cores even if enabled on system. 52 | 3. Store the results to a log file. Also, output relevant system info such as number of sockets, cores, threads, NUMA domains, memory sub-system etc. 
Running with sudo would result in more detailed info on memory subsystem as it parses output of `dmidecode` 53 | 54 | 55 | [1]: McCalpin, John D., 1995: "Memory Bandwidth and Machine Balance in Current High Performance Computers", IEEE Computer Society Technical Committee on Computer Architecture (TCCA) Newsletter, December 1995. 56 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (C) 2021 Intel Corporation 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | 6 | function mach_info() 7 | { 8 | num_socks=$(lscpu | grep "Socket(s):" | awk '{print $NF}') 9 | num_cores_per_sock=$(lscpu | grep "Core(s) per socket:" | awk '{print $NF}') 10 | num_threads_per_core=$(lscpu | grep "Thread(s) per core:" | awk '{print $NF}') 11 | ht_enabled=$( [ "$num_threads_per_core" -gt 1 ] && echo "true" || echo "false") 12 | num_cores_total=$(($num_socks*$num_cores_per_sock)) 13 | 14 | num_numa_domains=$(numactl -H | grep "available:" | awk '{print $2}') 15 | num_numa_domains_per_sock=$(($num_numa_domains/$num_socks)) 16 | num_cores_per_numa_domain=$(numactl -H | grep "node 0 cpus:" | awk -F ":" '{print $NF}' | awk '{print NF}') 17 | num_cores_per_numa_domain=$(($num_cores_per_numa_domain/$num_threads_per_core)) 18 | 19 | l1_cache_size=$(cat /sys/devices/system/cpu/cpu0/cache/index0/size) 20 | l1_cache_ways=$(cat /sys/devices/system/cpu/cpu0/cache/index0/ways_of_associativity) 21 | l2_cache_size=$([ -f "/sys/devices/system/cpu/cpu0/cache/index2/size" ] && cat /sys/devices/system/cpu/cpu0/cache/index2/size) 22 | l2_cache_ways=$([ -f "/sys/devices/system/cpu/cpu0/cache/index2/ways_of_associativity" ] && cat /sys/devices/system/cpu/cpu0/cache/index2/ways_of_associativity) 23 | if [ -f "/sys/devices/system/cpu/cpu0/cache/index3/size" ]; then 24 | l3_cache_size=$(cat /sys/devices/system/cpu/cpu0/cache/index3/size) 25 | l3_cache_ways=$(cat 
/sys/devices/system/cpu/cpu0/cache/index3/ways_of_associativity) 26 | # todo: make this robust 27 | l3_cache_shared_cpu_count=$(cat /sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list | awk -F "," '{print $1}' | awk -F "-" '{print $NF}') 28 | l3_cache_shared_cpu_count=$(($l3_cache_shared_cpu_count+1)) 29 | l3_cache_size_per_core=$(($(echo $l3_cache_size | tr -d "[:alpha:]")/$l3_cache_shared_cpu_count)) 30 | l3_cache_size_per_sock=$(($l3_cache_size_per_core*$num_cores_per_sock)) 31 | l3_cache_size_per_core+=" KB" 32 | l3_cache_size_per_sock+=" KB" 33 | fi 34 | 35 | if [ "$EUID" -eq 0 ]; then 36 | #todo: iterate over all md to verify homogenous config 37 | md_size=$(dmidecode -t memory | grep "Memory Device" -A21 | grep -E 'Size:[[:space:]]*[[:digit:]]+' -m1) 38 | md_type=$(dmidecode -t memory | grep "Memory Device" -A21 | grep -E 'Type:[[:space:]]*DDR' -m1 | awk '{print $NF}') 39 | md_speed=$(dmidecode -t memory | grep "Memory Device" -A21 | grep -v "Configured Memory Speed" | grep -E 'Speed:[[:space:]]*[[:digit:]]+' -m1 | awk -F ":" '{print $NF}') 40 | mem_vendor=$(dmidecode -t memory | grep "${md_size}" -m1 -A21 | grep -m1 "Manufacturer:" | awk '{print $NF}') 41 | # md_configured_speed=$(dmidecode -t memory | grep "Memory Device" -A21 -m1 | grep -E 'Configured Clock Speed:' | awk -F ":" '{print $NF}') 42 | # todo: add 'Locator' info and num_mem_channels_per_sock 43 | num_mem_channels=$(dmidecode -t memory | grep -c "${md_size}") 44 | size=$(echo $md_size | awk '{print $2}') 45 | mem_size_total=$(($size*$num_mem_channels)) 46 | mem_size_total+=$(echo " $(echo $md_size | awk '{print $NF}')") 47 | mem_speed=$md_speed 48 | mem_type=$md_type 49 | mem_size_per_dimm=$(echo $md_size | awk -F ":" '{print $NF}') 50 | peak_mem_bw_system=$(echo "scale=2; (8 * $num_mem_channels * $(echo $mem_speed | tr -d "[:alpha:]|[:punct:]" ) / 1000)" | bc -l) 51 | peak_mem_bw_per_sock=$(echo "scale=2; ($peak_mem_bw_system / $num_socks)" | bc -l) 52 | else 53 | memory_size_total=$(cat 
/proc/meminfo | grep "MemTotal" | awk '{printf ("%.2f GB",$2/1000/1000)}') 54 | fi 55 | 56 | 57 | model_name=$(lscpu | grep "Model name:" | awk -F ":" '{print $NF}' | tr -s "[:space:]") 58 | os_name=$(cat /etc/os-release | grep "PRETTY_NAME" | awk -F "=" '{print $NF}' | tr -d "\"") 59 | kernel_release=$(uname -r) 60 | hostname=$(hostname -f) 61 | thp=$( [ "$(grep -o "\[always\]" /sys/kernel/mm/*transparent_hugepage/enabled)" == "[always]" ] && echo "enabled" || echo "disabled") 62 | icc_version=$(icc --version | head -n1) 63 | 64 | if [ -f /sys/devices/system/cpu/intel_pstate/no_turbo ]; then 65 | cpu_turbo=$( [ "$(cat /sys/devices/system/cpu/intel_pstate/no_turbo)" == 1 ] && echo "disabled" || echo "enabled") 66 | elif [ -f /sys/devices/system/cpu/cpufreq/boost ]; then 67 | cpu_turbo=$( [ "$(cat /sys/devices/system/cpu/cpufreq/boost)" == 1 ] && echo "enabled" || echo "disabled") 68 | fi 69 | cpu_scaling_governor=$(cat /sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_governor | sort -u) 70 | cpu_scaling_driver=$(cat /sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_driver | sort -u) 71 | 72 | lscpu_flags=$(lscpu | grep "Flags:") 73 | if [ -z "${lscpu_flags}" ]; then 74 | lscpu_flags=$(cat /proc/cpuinfo | grep -m1 "flags[[:space:]]*:") 75 | fi 76 | 77 | for isa in avx512f avx2 avx 78 | do 79 | echo ${lscpu_flags} | grep -w ${isa} &> /dev/null 80 | if [ $? 
-eq 0 ]; then 81 | target_cpu=${isa} 82 | break 83 | fi 84 | done 85 | } 86 | 87 | function show_mach_info() 88 | { 89 | echo -e "\nCPU Model = $model_name" 90 | echo -e "\nSockets/Cores/Threads:" 91 | echo -e "\tnum_sockets = $num_socks" 92 | echo -e "\tnum_cores_total = $num_cores_total" 93 | echo -e "\tnum_cores_per_socket = $num_cores_per_sock" 94 | echo -e "\tnum_threads_per_core = $num_threads_per_core" 95 | echo -e "\tHyper-Threading = $ht_enabled" 96 | 97 | echo -e "\nNUMA:" 98 | echo -e "\tnum_numa_domains = $num_numa_domains" 99 | echo -e "\tnum_numa_domains_per_socket = $num_numa_domains_per_sock" 100 | echo -e "\tnum_cores_per_numa_domain = $num_cores_per_numa_domain" 101 | 102 | if [ "$EUID" -eq 0 ]; then 103 | echo -e "\nMemory:" 104 | echo -e "\tmem_vendor = $mem_vendor" 105 | echo -e "\tmem_speed = $mem_speed" 106 | echo -e "\tmem_type = $mem_type" 107 | echo -e "\tmem_size_total = $mem_size_total" 108 | echo -e "\tmem_size_per_dimm = $mem_size_per_dimm" 109 | echo -e "\tnum_mem_channels = $num_mem_channels" 110 | echo -e "\tpeak_mem_bw_system = $peak_mem_bw_system GB/sec" 111 | echo -e "\tpeak_mem_bw_per_sock = $peak_mem_bw_per_sock GB/sec" 112 | else 113 | echo -e "\nMemory = $memory_size_total" 114 | fi 115 | 116 | echo -e "\nCPU Caches:" 117 | echo -e "\tL1_cache = $l1_cache_size (${l1_cache_ways}-way)" 118 | echo -e "\tL2_cache = $l2_cache_size (${l2_cache_ways}-way)" 119 | echo -e "\tL3_cache = $l3_cache_size (${l3_cache_ways}-way)" 120 | echo -e "\tL3_cache_per_sock = $l3_cache_size_per_sock" 121 | echo -e "\tL3_cache_per_core = $l3_cache_size_per_core" 122 | 123 | echo -e "\nOS:" 124 | echo -e "Operating System = $os_name" 125 | echo -e "Kernel version = $kernel_release" 126 | echo -e "CPU Turbo Boost = $cpu_turbo" 127 | echo -e "CPU Scaling Governor = $cpu_scaling_governor" 128 | echo -e "CPU Scaling Driver = $cpu_scaling_driver" 129 | echo -e "Transparent Huge Pages = $thp" 130 | 131 | echo "" 132 | echo "ICC version = ${icc_version}" 133 
| echo "Target ISA = ${target_cpu}" 134 | echo "Hostname = $(hostname -f)" 135 | echo "Date = $(date)" 136 | echo "" 137 | } 138 | 139 | function check_binary() 140 | { 141 | if [ "${target_cpu}" == "avx512f" ]; then 142 | binary=stream_avx512.bin 143 | elif [ "${target_cpu}" == "avx2" ]; then 144 | binary=stream_avx2.bin 145 | elif [ "${target_cpu}" == "avx" ]; then 146 | binary=stream_avx.bin 147 | else 148 | echo "Unknown ISA, aborting.." 149 | exit 1 150 | fi 151 | 152 | if [ ! -f ${binary} ]; then 153 | echo "${binary} not found, aborting.." 154 | exit 1 155 | fi 156 | 157 | objdump -D ${binary} | grep vmovntpd &> /dev/null 158 | if [ $? -eq 0 ]; then 159 | nt_stores_status=exist 160 | stype=nt 161 | else 162 | nt_stores_status="does-not-exist" 163 | stype=rfo 164 | fi 165 | 166 | # objdump -D ${binary} | grep memcpy &> /dev/null 167 | # if [ $? -eq 0 ]; then 168 | # memcpy_status=exist 169 | # else 170 | # memcpy_status="does-not-exist" 171 | # fi 172 | 173 | # objdump -D ${binary} | grep memset &> /dev/null 174 | # if [ $? 
-eq 0 ]; then 175 | # memset_status=exist 176 | # else 177 | # memset_status="does-not-exist" 178 | # fi 179 | 180 | echo "${binary} disassembly:" 2>&1 | tee -a $$-runinfo.log 181 | echo "NT-Stores : ${nt_stores_status}" 2>&1 | tee -a $$-runinfo.log 182 | # echo "Memcpy() : ${memcpy_status}" 2>&1 | tee -a $$-runinfo.log 183 | # echo "Memset() : ${memset_status}" 2>&1 | tee -a $$-runinfo.log 184 | } 185 | 186 | 187 | function bench_simple() 188 | { 189 | l0_dir=$(echo ${model_name} | sed -E -e 's/ /-/g' -e 's/\(R\)|\@|\$|\%//g') 190 | res_dir=${l0_dir}/${stype} 191 | 192 | mkdir -p ${res_dir} 193 | 194 | if [ "${ht_enabled}" == "true" ]; then 195 | export KMP_AFFINITY=granularity=fine,compact,1,0 196 | else 197 | export KMP_AFFINITY=compact 198 | fi 199 | 200 | for t in ${num_cores_total} 201 | do 202 | export OMP_NUM_THREADS=$t 203 | res_file=$(basename ${binary} .bin)_${t}t.log 204 | echo "Running ${binary} with ${t} threads in compact affinity, output log will be saved in ${res_dir}/${res_file}" 205 | 206 | cat $$-runinfo.log > ${res_dir}/${res_file} 207 | ./${binary} &>> ${res_dir}/${res_file} 208 | done 209 | 210 | rm $$-runinfo.log 211 | } 212 | 213 | 214 | function bench_sweep() 215 | { 216 | l0_dir=$(echo ${model_name} | sed -E -e 's/ /-/g' -e 's/\(R\)|\@|\$|\%//g') 217 | l1_dir=nps-$num_numa_domains_per_sock 218 | 219 | #uma runs 220 | mkdir -p ${l0_dir}/${l1_dir}/uma 221 | 222 | for aff in compact distribute 223 | do 224 | res_dir=${l0_dir}/${l1_dir}/uma/${aff}/${stype} 225 | 226 | if [ "$aff" == "compact" ]; then 227 | mkdir -p ${res_dir} 228 | cp $$-runinfo.log ${res_dir}/runinfo.log 229 | if [ "${ht_enabled}" == "true" ]; then 230 | export KMP_AFFINITY=granularity=fine,verbose,compact,1,0 231 | else 232 | export KMP_AFFINITY=verbose,compact 233 | fi 234 | 235 | # for ((t=1;t<=${num_cores_total};t++)); 236 | for t in $(seq 1 ${num_cores_per_sock}) ${num_cores_total}; 237 | do 238 | export OMP_NUM_THREADS=$t 239 | res_file=$(basename ${binary} 
.bin)_${t}t.log 240 | echo "Running ${binary} with ${t} threads in $aff pinning, output log will be saved in ${res_dir}/${res_file}" 241 | 242 | cat $$-runinfo.log > ${res_dir}/${res_file} 243 | ./${binary} &>> ${res_dir}/${res_file} 244 | done 245 | 246 | elif [ "$aff" == "distribute" ]; then 247 | unset KMP_AFFINITY 248 | if [ "${num_numa_domains_per_sock}" == "1" ]; then 249 | continue 250 | fi 251 | mkdir -p ${res_dir} 252 | cp $$-runinfo.log ${res_dir}/runinfo.log 253 | 254 | export OMP_PROC_BIND=spread 255 | export KMP_HW_SUBSET=1s 256 | 257 | for ((t=1;t<=${num_cores_per_sock};t++)); 258 | do 259 | res_file=$(basename ${binary} .bin)_${t}t.log 260 | echo "Running ${binary} with ${t} threads in $aff pinning, output log will be saved in ${res_dir}/${res_file}" 261 | cat $$-runinfo.log > ${res_dir}/${res_file} 262 | export OMP_NUM_THREADS=$t 263 | ./${binary} &>> ${res_dir}/${res_file} 264 | done 265 | fi 266 | done 267 | 268 | #numa runs (with compact affinity only) 269 | res_dir=${l0_dir}/${l1_dir}/numa/compact/${stype} 270 | mkdir -p ${res_dir} 271 | 272 | if [ "${ht_enabled}" == "true" ]; then 273 | export KMP_AFFINITY=granularity=fine,compact,1,0 274 | else 275 | export KMP_AFFINITY=compact 276 | fi 277 | unset OMP_PROC_BIND 278 | unset KMP_HW_SUBSET 279 | 280 | thread_list=(1 ${num_cores_per_numa_domain}) 281 | if [ "${num_cores_per_numa_domain}" != "${num_cores_per_sock}" ]; then 282 | thread_list+=(${num_cores_per_sock}) 283 | fi 284 | 285 | cp $$-runinfo.log ${res_dir}/runinfo.log 286 | for t in ${thread_list[*]} 287 | do 288 | export OMP_NUM_THREADS=$t 289 | for ((id=0;id<${num_numa_domains};id++)); 290 | do 291 | res_file=$(basename ${binary} .bin)_${t}t_m${id}.log 292 | echo "Running ${binary} with ${t} threads from numa-$id, output log will be saved in ${res_dir}/${res_file}" 293 | 294 | cat $$-runinfo.log > ${res_dir}/${res_file} 295 | numactl -m$id ./${binary} &>> ${res_dir}/${res_file} 296 | done 297 | done 298 | 299 | rm $$-runinfo.log 300 | } 
301 | 302 | 303 | mach_info 304 | show_mach_info 2>&1 | tee $$-runinfo.log 305 | check_binary 306 | #bench_sweep 307 | bench_simple 308 | -------------------------------------------------------------------------------- /stream.c: -------------------------------------------------------------------------------- 1 | /*-----------------------------------------------------------------------*/ 2 | /* Program: STREAM */ 3 | /* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */ 4 | /* Original code developed by John D. McCalpin */ 5 | /* Programmers: John D. McCalpin */ 6 | /* Joe R. Zagar */ 7 | /* */ 8 | /* This program measures memory transfer rates in MB/s for simple */ 9 | /* computational kernels coded in C. */ 10 | /*-----------------------------------------------------------------------*/ 11 | /* Copyright 1991-2013: John D. McCalpin */ 12 | /*-----------------------------------------------------------------------*/ 13 | /* License: */ 14 | /* 1. You are free to use this program and/or to redistribute */ 15 | /* this program. */ 16 | /* 2. You are free to modify this program for your own use, */ 17 | /* including commercial use, subject to the publication */ 18 | /* restrictions in item 3. */ 19 | /* 3. You are free to publish results obtained from running this */ 20 | /* program, or from works that you derive from this program, */ 21 | /* with the following limitations: */ 22 | /* 3a. In order to be referred to as "STREAM benchmark results", */ 23 | /* published results must be in conformance to the STREAM */ 24 | /* Run Rules, (briefly reviewed below) published at */ 25 | /* http://www.cs.virginia.edu/stream/ref.html */ 26 | /* and incorporated herein by reference. */ 27 | /* As the copyright holder, John McCalpin retains the */ 28 | /* right to determine conformity with the Run Rules. */ 29 | /* 3b. 
Results based on modified source code or on runs not in */ 30 | /* accordance with the STREAM Run Rules must be clearly */ 31 | /* labelled whenever they are published. Examples of */ 32 | /* proper labelling include: */ 33 | /* "tuned STREAM benchmark results" */ 34 | /* "based on a variant of the STREAM benchmark code" */ 35 | /* Other comparable, clear, and reasonable labelling is */ 36 | /* acceptable. */ 37 | /* 3c. Submission of results to the STREAM benchmark web site */ 38 | /* is encouraged, but not required. */ 39 | /* 4. Use of this program or creation of derived works based on this */ 40 | /* program constitutes acceptance of these licensing restrictions. */ 41 | /* 5. Absolutely no warranty is expressed or implied. */ 42 | /*-----------------------------------------------------------------------*/ 43 | # include 44 | # include 45 | # include 46 | # include 47 | # include 48 | # include 49 | 50 | /*----------------------------------------------------------------------- 51 | * INSTRUCTIONS: 52 | * 53 | * 1) STREAM requires different amounts of memory to run on different 54 | * systems, depending on both the system cache size(s) and the 55 | * granularity of the system timer. 56 | * You should adjust the value of 'STREAM_ARRAY_SIZE' (below) 57 | * to meet *both* of the following criteria: 58 | * (a) Each array must be at least 4 times the size of the 59 | * available cache memory. I don't worry about the difference 60 | * between 10^6 and 2^20, so in practice the minimum array size 61 | * is about 3.8 times the cache size. 62 | * Example 1: One Xeon E3 with 8 MB L3 cache 63 | * STREAM_ARRAY_SIZE should be >= 4 million, giving 64 | * an array size of 30.5 MB and a total memory requirement 65 | * of 91.5 MB. 66 | * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) 67 | * STREAM_ARRAY_SIZE should be >= 20 million, giving 68 | * an array size of 153 MB and a total memory requirement 69 | * of 458 MB. 
70 | * (b) The size should be large enough so that the 'timing calibration' 71 | * output by the program is at least 20 clock-ticks. 72 | * Example: most versions of Windows have a 10 millisecond timer 73 | * granularity. 20 "ticks" at 10 ms/tic is 200 milliseconds. 74 | * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. 75 | * This means the each array must be at least 1 GB, or 128M elements. 76 | * 77 | * Version 5.10 increases the default array size from 2 million 78 | * elements to 10 million elements in response to the increasing 79 | * size of L3 caches. The new default size is large enough for caches 80 | * up to 20 MB. 81 | * Version 5.10 changes the loop index variables from "register int" 82 | * to "ssize_t", which allows array indices >2^32 (4 billion) 83 | * on properly configured 64-bit systems. Additional compiler options 84 | * (such as "-mcmodel=medium") may be required for large memory runs. 85 | * 86 | * Array size can be set at compile time without modifying the source 87 | * code for the (many) compilers that support preprocessor definitions 88 | * on the compile line. E.g., 89 | * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M 90 | * will override the default size of 10M with a new size of 100M elements 91 | * per array. 92 | */ 93 | #ifndef STREAM_ARRAY_SIZE 94 | # define STREAM_ARRAY_SIZE 10000000 95 | #endif 96 | 97 | /* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result 98 | * for any iteration after the first, therefore the minimum value 99 | * for NTIMES is 2. 100 | * There are no rules on maximum allowable values for NTIMES, but 101 | * values larger than the default are unlikely to noticeably 102 | * increase the reported performance. 103 | * NTIMES can also be set on the compile line without changing the source 104 | * code using, for example, "-DNTIMES=7". 
105 | */ 106 | #ifdef NTIMES 107 | #if NTIMES<=1 108 | # define NTIMES 10 109 | #endif 110 | #endif 111 | #ifndef NTIMES 112 | # define NTIMES 10 113 | #endif 114 | 115 | /* Users are allowed to modify the "OFFSET" variable, which *may* change the 116 | * relative alignment of the arrays (though compilers may change the 117 | * effective offset by making the arrays non-contiguous on some systems). 118 | * Use of non-zero values for OFFSET can be especially helpful if the 119 | * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. 120 | * OFFSET can also be set on the compile line without changing the source 121 | * code using, for example, "-DOFFSET=56". 122 | */ 123 | #ifndef OFFSET 124 | # define OFFSET 0 125 | #endif 126 | 127 | /* 128 | * 3) Compile the code with optimization. Many compilers generate 129 | * unreasonably bad code before the optimizer tightens things up. 130 | * If the results are unreasonably good, on the other hand, the 131 | * optimizer might be too smart for me! 132 | * 133 | * For a simple single-core version, try compiling with: 134 | * cc -O stream.c -o stream 135 | * This is known to work on many, many systems.... 136 | * 137 | * To use multiple cores, you need to tell the compiler to obey the OpenMP 138 | * directives in the code. This varies by compiler, but a common example is 139 | * gcc -O -fopenmp stream.c -o stream_omp 140 | * The environment variable OMP_NUM_THREADS allows runtime control of the 141 | * number of threads/cores used when the resulting "stream_omp" program 142 | * is executed. 143 | * 144 | * To run with single-precision variables and arithmetic, simply add 145 | * -DSTREAM_TYPE=float 146 | * to the compile line. 147 | * Note that this changes the minimum array sizes required --- see (1) above. 148 | * 149 | * The preprocessor directive "TUNED" does not do much -- it simply causes the 150 | * code to call separate functions to execute each kernel. 
Trivial versions 151 | * of these functions are provided, but they are *not* tuned -- they just 152 | * provide predefined interfaces to be replaced with tuned code. 153 | * 154 | * 155 | * 4) Optional: Mail the results to mccalpin@cs.virginia.edu 156 | * Be sure to include info that will help me understand: 157 | * a) the computer hardware configuration (e.g., processor model, memory type) 158 | * b) the compiler name/version and compilation flags 159 | * c) any run-time information (such as OMP_NUM_THREADS) 160 | * d) all of the output from the test case. 161 | * 162 | * Thanks! 163 | * 164 | *-----------------------------------------------------------------------*/ 165 | 166 | # define HLINE "-------------------------------------------------------------\n" 167 | 168 | # ifndef MIN 169 | # define MIN(x,y) ((x)<(y)?(x):(y)) 170 | # endif 171 | # ifndef MAX 172 | # define MAX(x,y) ((x)>(y)?(x):(y)) 173 | # endif 174 | 175 | #ifndef STREAM_TYPE 176 | #define STREAM_TYPE double 177 | #endif 178 | 179 | static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], 180 | b[STREAM_ARRAY_SIZE+OFFSET], 181 | c[STREAM_ARRAY_SIZE+OFFSET]; 182 | 183 | static double avgtime[4] = {0}, maxtime[4] = {0}, 184 | mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 185 | 186 | static char *label[4] = {"Copy: ", "Scale: ", 187 | "Add: ", "Triad: "}; 188 | 189 | static double bytes[4] = { 190 | 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, 191 | 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, 192 | 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, 193 | 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE 194 | }; 195 | 196 | extern double mysecond(); 197 | extern void checkSTREAMresults(); 198 | #ifdef TUNED 199 | extern void tuned_STREAM_Copy(); 200 | extern void tuned_STREAM_Scale(STREAM_TYPE scalar); 201 | extern void tuned_STREAM_Add(); 202 | extern void tuned_STREAM_Triad(STREAM_TYPE scalar); 203 | #endif 204 | #ifdef _OPENMP 205 | extern int omp_get_num_threads(); 206 | #endif 207 | int 208 | main() 209 | { 210 | 
int quantum, checktick(); 211 | int BytesPerWord; 212 | int k; 213 | ssize_t j; 214 | STREAM_TYPE scalar; 215 | double t, times[4][NTIMES]; 216 | 217 | /* --- SETUP --- determine precision and check timing --- */ 218 | 219 | printf(HLINE); 220 | printf("STREAM version $Revision: 5.10 $\n"); 221 | printf(HLINE); 222 | BytesPerWord = sizeof(STREAM_TYPE); 223 | printf("This system uses %d bytes per array element.\n", 224 | BytesPerWord); 225 | 226 | printf(HLINE); 227 | #ifdef N 228 | printf("***** WARNING: ******\n"); 229 | printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); 230 | printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); 231 | printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); 232 | printf("***** WARNING: ******\n"); 233 | #endif 234 | 235 | printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); 236 | printf("Memory per array = %.1f MiB (= %.1f GiB).\n", 237 | BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), 238 | BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); 239 | printf("Total memory required = %.1f MiB (= %.1f GiB).\n", 240 | (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), 241 | (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); 242 | printf("Each kernel will be executed %d times.\n", NTIMES); 243 | printf(" The *best* time for each kernel (excluding the first iteration)\n"); 244 | printf(" will be used to compute the reported bandwidth.\n"); 245 | 246 | #ifdef _OPENMP 247 | printf(HLINE); 248 | #pragma omp parallel 249 | { 250 | #pragma omp master 251 | { 252 | k = omp_get_num_threads(); 253 | printf ("Number of Threads requested = %i\n",k); 254 | } 255 | } 256 | #endif 257 | 258 | #ifdef _OPENMP 259 | k = 0; 260 | #pragma omp parallel 261 | #pragma omp 
atomic
262 | k++;
263 | printf ("Number of Threads counted = %i\n",k);
264 | #endif
265 | 
266 | /* Get initial value for system clock. */
267 | #pragma omp parallel for
268 | for (j=0; j<STREAM_ARRAY_SIZE; j++) {
269 | a[j] = 1.0;
270 | b[j] = 2.0;
271 | c[j] = 0.0;
272 | }
273 | 
274 | printf(HLINE);
275 | 
276 | if ( (quantum = checktick()) >= 1)
277 | printf("Your clock granularity/precision appears to be "
278 | "%d microseconds.\n", quantum);
279 | else {
280 | printf("Your clock granularity appears to be "
281 | "less than one microsecond.\n");
282 | quantum = 1;
283 | }
284 | 
285 | t = mysecond();
286 | #pragma omp parallel for
287 | for (j = 0; j < STREAM_ARRAY_SIZE; j++)
288 | a[j] = 2.0E0 * a[j];
289 | t = 1.0E6 * (mysecond() - t);
290 | 
291 | printf("Each test below will take on the order"
292 | " of %d microseconds.\n", (int) t );
293 | printf(" (= %d clock ticks)\n", (int) (t/quantum) );
294 | printf("Increase the size of the arrays if this shows that\n");
295 | printf("you are not getting at least 20 clock ticks per test.\n");
296 | 
297 | printf(HLINE);
298 | 
299 | printf("WARNING -- The above is only a rough guideline.\n");
300 | printf("For best results, please be sure you know the\n");
301 | printf("precision of your system timer.\n");
302 | printf(HLINE);
303 | 
304 | /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
305 | 
306 | scalar = 3.0;
307 | for (k=0; k<NTIMES; k++)
	{
	times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];

	times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];

	times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];

	times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /* --- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
	for (j=0; j<4; j++)
	    {
	    avgtime[j] = avgtime[j] + times[j][k];
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}

    printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
	avgtime[j] = avgtime[j]/(double)(NTIMES-1);

	printf("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       avgtime[j],
	       mintime[j],
	       maxtime[j]);
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

    return 0;
}

# define	M	20

int
checktick()
    {
    int		i, minDelta, Delta;
    double	t1, t2, timesfound[M];

/*  Collect a sequence of M unique time values from the system. */

    for (i = 0; i < M; i++) {
	t1 = mysecond();
	while( ((t2=mysecond()) - t1) < 1.0E-6 )
	    ;
	timesfound[i] = t1 = t2;
	}

/*
 * Determine the minimum difference between these M values.
 * This result will be an estimate of the clock granularity.
 */

    minDelta = 1000000;
    for (i = 1; i < M; i++) {
	Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
	minDelta = MIN(minDelta, Delta);
	}

   return(minDelta);
    }

/* A gettimeofday routine to give access to the wall
   clock timer on most UNIX-like systems.  */

#include <sys/time.h>
419 | 
420 | double mysecond()
421 | {
422 | struct timeval tp;
423 | struct timezone tzp;
424 | int i;
425 | 
426 | i = gettimeofday(&tp,&tzp);
427 | return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
428 | }
429 | 
430 | #ifndef abs
431 | #define abs(a) ((a) >= 0 ?
(a) : -(a))
432 | #endif
433 | void checkSTREAMresults ()
434 | {
435 | STREAM_TYPE aj,bj,cj,scalar;
436 | STREAM_TYPE aSumErr,bSumErr,cSumErr;
437 | STREAM_TYPE aAvgErr,bAvgErr,cAvgErr;
438 | double epsilon;
439 | ssize_t j;
440 | int k,ierr,err;
441 | 
442 | /* reproduce initialization */
443 | aj = 1.0;
444 | bj = 2.0;
445 | cj = 0.0;
446 | /* a[] is modified during timing check */
447 | aj = 2.0E0 * aj;
448 | /* now execute timing loop */
449 | scalar = 3.0;
450 | for (k=0; k<NTIMES; k++)
        {
            cj = aj;
            bj = scalar*cj;
            cj = aj+bj;
            aj = bj+scalar*cj;
        }

    /* accumulate deltas between observed and expected results */
    aSumErr = 0.0;
    bSumErr = 0.0;
    cSumErr = 0.0;
    for (j=0; j<STREAM_ARRAY_SIZE; j++) {
	aSumErr += abs(a[j] - aj);
	bSumErr += abs(b[j] - bj);
	cSumErr += abs(c[j] - cj);
    }
    aAvgErr = aSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
    bAvgErr = bSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
    cAvgErr = cSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;

    if (sizeof(STREAM_TYPE) == 4) {
	epsilon = 1.e-6;
    }
    else if (sizeof(STREAM_TYPE) == 8) {
	epsilon = 1.e-13;
    }
    else {
	printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(STREAM_TYPE));
	epsilon = 1.e-6;
    }

    err = 0;
484 | if (abs(aAvgErr/aj) > epsilon) {
485 | err++;
486 | printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
487 | printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
488 | ierr = 0;
489 | for (j=0; j<STREAM_ARRAY_SIZE; j++) {
490 | if (abs(aj-a[j])/aj > epsilon) {
491 | ierr++;
492 | #ifdef VERBOSE
493 | if (ierr < 10) {
494 | printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n",
495 | j,aj,a[j],abs((aj-a[j])/aAvgErr));
496 | }
497 | #endif
498 | }
499 | }
500 | printf(" For array a[], %d errors were found.\n",ierr);
501 | }
502 | if (abs(bAvgErr/bj) > epsilon) {
503 | err++;
504 | printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
505 | printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
506 | printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon);
507 | ierr = 0;
508 | for (j=0; j<STREAM_ARRAY_SIZE; j++) {
509 | if (abs(bj-b[j])/bj > epsilon) {
510 | ierr++;
511 | #ifdef VERBOSE
512 | if (ierr < 10) {
513 | printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n",
514 | j,bj,b[j],abs((bj-b[j])/bAvgErr));
515 | }
516 | #endif
517 | }
518 | }
519 | printf(" For array b[], %d errors were found.\n",ierr);
520 | }
521 | if (abs(cAvgErr/cj) > epsilon) {
522 | err++;
523 | printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
524 | printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
525 | printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon);
526 | ierr = 0;
527 | for (j=0; j<STREAM_ARRAY_SIZE; j++) {
528 | if (abs(cj-c[j])/cj > epsilon) {
529 | ierr++;
530 | #ifdef VERBOSE
531 | if (ierr < 10) {
532 | printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n",
533 | j,cj,c[j],abs((cj-c[j])/cAvgErr));
534 | }
535 | #endif
536 | }
537 | }
538 | printf(" For array c[], %d errors were found.\n",ierr);
539 | }
540 | if (err == 0) {
541 | printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
542 | }
543 | #ifdef VERBOSE
544 | printf ("Results Validation Verbose Results: \n");
545 | printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj);
546 | printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]);
547 | printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj));
548 | #endif
549 | }
550 | 
551 | #ifdef TUNED
552 | /* stubs for "tuned" versions of the kernels */
553 | void tuned_STREAM_Copy()
554 | {
555 | ssize_t j;
556 | #pragma omp parallel for
557 | for (j=0; j