├── .gitignore ├── LICENSE.txt ├── third-party-programs.txt ├── makefile ├── README.md ├── run.sh └── stream.c /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.bin 3 | *.log 4 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2021 Intel Corporation 2 | 3 | SPDX-License-Identifier: BSD-3-Clause 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
12 | -------------------------------------------------------------------------------- /third-party-programs.txt: -------------------------------------------------------------------------------- 1 | Third Party Programs File 2 | 3 | This file contains the list of third party software ("third party programs") 4 | contained in the Intel software and their required notices and/or license terms. 5 | This third party software, even if included with the distribution of the Intel 6 | software, may be governed by separate license terms, including without limitation, 7 | third party license terms, other Intel software license terms, and open source 8 | software license terms. These separate license terms govern your use of the third 9 | party programs as set forth in the "third-party-programs.txt" or other similarly named text file. 10 | 11 | 12 | Third party programs and their corresponding required notices and/or license terms are listed below. 13 | 14 | -------------------------------------------------------------------------------- 15 | 1. stream 16 | *----------------------------------------------------------------------- 17 | * Copyright 1991-2003: John D. McCalpin 18 | *----------------------------------------------------------------------- 19 | * License: 20 | * 1. You are free to use this program and/or to redistribute 21 | * this program. 22 | * 2. You are free to modify this program for your own use, 23 | * including commercial use, subject to the publication 24 | * restrictions in item 3. 25 | * 3. You are free to publish results obtained from running this 26 | * program, or from works that you derive from this program, 27 | * with the following limitations: 28 | * 3a. In order to be referred to as "STREAM benchmark results", 29 | * published results must be in conformance to the STREAM 30 | * Run Rules, (briefly reviewed below) published at 31 | * http://www.cs.virginia.edu/stream/ref.html 32 | * and incorporated herein by reference. 
33 | * As the copyright holder, John McCalpin retains the 34 | * right to determine conformity with the Run Rules. 35 | * 3b. Results based on modified source code or on runs not in 36 | * accordance with the STREAM Run Rules must be clearly 37 | * labelled whenever they are published. Examples of 38 | * proper labelling include: 39 | * "tuned STREAM benchmark results" 40 | * "based on a variant of the STREAM benchmark code" 41 | * Other comparable, clear and reasonable labelling is 42 | * acceptable. 43 | * 3c. Submission of results to the STREAM benchmark web site 44 | * is encouraged, but not required. 45 | * 4. Use of this program or creation of derived works based on this 46 | * program constitutes acceptance of these licensing restrictions. 47 | * 5. Absolutely no warranty is expressed or implied. 48 | *----------------------------------------------------------------------- 49 | ------------------------------------------------------------- 50 | 51 | *Other names and brands may be claimed as the property of others. 52 | 53 | ------------------------------------------------------------- 54 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2021 Intel Corporation 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | 4 | CC = icc 5 | 6 | # STREAM options: 7 | # -DNTIMES control the number of times each stream kernel is executed 8 | # -DOFFSET controls the number of bytes between each of the buffers 9 | # -DSTREAM_TYPE specifies the data-type of elements in the buffers 10 | # -DSTREAM_ARRAY_SIZE specifies the number of elements in each buffer 11 | STREAM_CPP_OPTS = -DNTIMES=100 -DOFFSET=0 -DSTREAM_TYPE=double 12 | # Size per array is approx. ~2GB. 
Deliberately using non-power of 2 elements 13 | # 256*1024*1024 elements = 268435456 elements = 2GiB with FP64 14 | STREAM_ARRAY_SIZE = 269000000 15 | 16 | ifdef size 17 | STREAM_ARRAY_SIZE = $(size) 18 | endif 19 | STREAM_CPP_OPTS += -DSTREAM_ARRAY_SIZE=$(STREAM_ARRAY_SIZE) 20 | 21 | # Intel Compiler options to control the generated ISA 22 | AVX_COPTS = -xAVX 23 | AVX2_COPTS = -xCORE-AVX2 24 | AVX512_COPTS = -xCORE-AVX512 -qopt-zmm-usage=high 25 | # Common Intel Compiler options that are independent of ISA 26 | COMMON_COPTS = -Wall -O3 -mcmodel=medium -qopenmp -shared-intel 27 | 28 | ifdef rfo 29 | COMMON_COPTS += -qopt-streaming-stores never -fno-builtin 30 | else 31 | COMMON_COPTS += -qopt-streaming-stores always 32 | endif 33 | 34 | AVX_OBJS = stream_avx.o 35 | AVX2_OBJS = stream_avx2.o 36 | AVX512_OBJS = stream_avx512.o 37 | 38 | ifdef cpu 39 | all: stream_$(cpu).bin 40 | else 41 | all: stream_avx.bin stream_avx2.bin stream_avx512.bin 42 | endif 43 | 44 | SRC = stream.c 45 | 46 | stream_avx.o: $(SRC) 47 | $(CC) $(COMMON_COPTS) $(AVX_COPTS) $(STREAM_CPP_OPTS) -c $(SRC) -o $@ 48 | stream_avx2.o: $(SRC) 49 | $(CC) $(COMMON_COPTS) $(AVX2_COPTS) $(STREAM_CPP_OPTS) -c $(SRC) -o $@ 50 | stream_avx512.o: $(SRC) 51 | $(CC) $(COMMON_COPTS) $(AVX512_COPTS) $(STREAM_CPP_OPTS) -c $(SRC) -o $@ 52 | 53 | 54 | stream_avx.bin: $(AVX_OBJS) 55 | $(CC) $(COMMON_COPTS) $(AVX_COPTS) $^ -o $@ 56 | stream_avx2.bin: $(AVX2_OBJS) 57 | $(CC) $(COMMON_COPTS) $(AVX2_COPTS) $^ -o $@ 58 | stream_avx512.bin: $(AVX512_OBJS) 59 | $(CC) $(COMMON_COPTS) $(AVX512_COPTS) $^ -o $@ 60 | 61 | help: 62 | @echo -e "Running 'make' with no options would compile the STREAM benchmark with $(STREAM_ARRAY_SIZE) FP64 elements per array for following Intel CPU's:\n" 63 | @echo -e "\tstream_avx.bin => Targeted for Intel CPU's that support AVX ISA" 64 | @echo -e "\tstream_avx2.bin => Targeted for Intel CPU's that support AVX2 ISA" 65 | @echo -e "\tstream_avx512.bin => Targeted for Intel CPU's that support AVX512 
ISA" 66 | @echo -e "\nThe following options are supported:" 67 | @echo -e "\tsize=" 68 | @echo "" 69 | @echo -e "\tcpu=" 70 | @echo "" 71 | @echo -e "\trfo=1 forces to use regular cached stores instead of non-temporal stores" 72 | @echo "" 73 | @echo -e "\nFew examples:" 74 | @echo -e "To compile STREAM benchmark only for Intel AVX512 CPU's, do:" 75 | @echo -e "\tmake cpu=avx512" 76 | @echo "" 77 | @echo -e "To compile STREAM benchmark for Intel AVX512 CPU's with each buffer containing 67108864 elements, do:" 78 | @echo -e "\tmake size=67108864 cpu=avx512" 79 | 80 | clean: 81 | rm -rf *.o *.bin 82 | 83 | .PHONY: all clean help 84 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DISCONTINUATION OF PROJECT # 2 | This project will no longer be maintained by Intel. 3 | Intel has ceased development and contributions including, but not limited to, maintenance, bug fixes, new releases, or updates, to this project. 4 | Intel no longer accepts patches to this project. 5 | If you have an ongoing need to use this project, are interested in independently developing it, or would like to maintain patches for the open source software community, please create your own fork of this project. 6 | 7 | # Memory Bandwidth Benchmarks 8 | 9 | ## Overview 10 | This repository intends to provide a set of benchmarks that can be used to measure the memory bandwidth performance of CPU's. In the initial release, we provide the de-facto memory bandwidth benchmark, [STREAM](https://www.cs.virginia.edu/stream/) [1] along with compilation and run scripts to obtain ideal performance on Intel(R) Processors. 11 | 12 | ## STREAM Overview 13 | STREAM is a simple, synthetic benchmark designed to measure sustainable memory bandwidth (in MB/s) for four simple vector kernels: Copy, Scale, Add and Triad. 
Its source code is freely available [here](https://www.cs.virginia.edu/stream/FTP/Code/) 14 | 15 | There are two categories created by STREAM benchmark author for citing memory bandwidth performance -- Standard and Tuned. Results obtained from the unmodified source code are considered as "Standard" whereas a "Tuned" category has been added to allow users or vendors to submit results based on modified source code. On Intel(R) processors, we don't need to modify the source code of the benchmark for optimal performance. We provide instructions to compile and run STREAM without any source code modifications. Hence, the performance results obtained would fall under the Standard category. 16 | 17 | The general rule for STREAM is that each array must be at least 4x the size of the sum of all the last-level caches used in the run, or 1 million elements, whichever is larger. 18 | 19 | ## Pre-requisites 20 | - Intel C Compiler: Performance of STREAM benchmark is dependent on the Compiler options used. Hence, we rely on the Intel C Compiler to generate the underlying non-temporal store instructions to achieve optimal performance on Intel CPU's. 21 | - Linux environment: Currently, the makefile assumes a Linux OS environment. 22 | 23 | ## Compilation 24 | - Ensure Intel C Compiler (icc) is available in your shell environment. 25 | - Run `make`. 
This would generate the following binaries: 26 | - stream_avx.bin => Targeted for Intel CPU's that support AVX ISA 27 | - stream_avx2.bin => Targeted for Intel CPU's that support AVX2 ISA 28 | - stream_avx512.bin => Targeted for Intel CPU's that support AVX512 ISA 29 | 30 | By default, the following STREAM configuration parameters are used in compiling the binaries: 31 | - STREAM_TYPE = double 32 | - STREAM_ARRAY_SIZE = 269000000 (this translates to about 2 GB per array of memory footprint) 33 | - NTIMES = 100 34 | - OFFSET = 0 35 | 36 | Makefile supports the following options: 37 | - size= 38 | - cpu= 39 | - rfo=1 forces to use regular cached stores instead of non-temporal stores 40 | - help 41 | 42 | Few examples: 43 | - To compile STREAM benchmark only for Intel AVX512 CPU's, do: `make cpu=avx512` 44 | - To compile STREAM benchmark for Intel AVX512 CPU's with each buffer containing 67108864 elements, do: `make size=67108864 cpu=avx512` 45 | - To explicitly use regular cached stores, do: `make rfo=1` 46 | 47 | ## Running STREAM 48 | We provide a run script (`run.sh`) that can be used for benchmarking purposes. The run script does the following -- 49 | 50 | 1. Binary: Use the appropriate STREAM binary produced from the compilation step, i.e., picks the highest supported ISA on your target system 51 | 2. OpenMP settings: Sets the OMP_NUM_THREADS to number of physical cores on the system. KMP_AFFINITY (thread affinity control variable of Intel OpenMP runtime) set to compact pinning. Ignores Hyper-threading cores even if enabled on system. 52 | 3. Store the results to a log file. Also, output relevant system info such as number of sockets, cores, threads, NUMA domains, memory sub-system etc. 
Running with sudo would result in more detailed info on memory subsystem as it parses output of `dmidecode` 53 | 54 | 55 | [1]: McCalpin, John D., 1995: "Memory Bandwidth and Machine Balance in Current High Performance Computers", IEEE Computer Society Technical Committee on Computer Architecture (TCCA) Newsletter, December 1995. 56 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (C) 2021 Intel Corporation 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | 6 | function mach_info() 7 | { 8 | num_socks=$(lscpu | grep "Socket(s):" | awk '{print $NF}') 9 | num_cores_per_sock=$(lscpu | grep "Core(s) per socket:" | awk '{print $NF}') 10 | num_threads_per_core=$(lscpu | grep "Thread(s) per core:" | awk '{print $NF}') 11 | ht_enabled=$( [ "$num_threads_per_core" -gt 1 ] && echo "true" || echo "false") 12 | num_cores_total=$(($num_socks*$num_cores_per_sock)) 13 | 14 | num_numa_domains=$(numactl -H | grep "available:" | awk '{print $2}') 15 | num_numa_domains_per_sock=$(($num_numa_domains/$num_socks)) 16 | num_cores_per_numa_domain=$(numactl -H | grep "node 0 cpus:" | awk -F ":" '{print $NF}' | awk '{print NF}') 17 | num_cores_per_numa_domain=$(($num_cores_per_numa_domain/$num_threads_per_core)) 18 | 19 | l1_cache_size=$(cat /sys/devices/system/cpu/cpu0/cache/index0/size) 20 | l1_cache_ways=$(cat /sys/devices/system/cpu/cpu0/cache/index0/ways_of_associativity) 21 | l2_cache_size=$([ -f "/sys/devices/system/cpu/cpu0/cache/index2/size" ] && cat /sys/devices/system/cpu/cpu0/cache/index2/size) 22 | l2_cache_ways=$([ -f "/sys/devices/system/cpu/cpu0/cache/index2/ways_of_associativity" ] && cat /sys/devices/system/cpu/cpu0/cache/index2/ways_of_associativity) 23 | if [ -f "/sys/devices/system/cpu/cpu0/cache/index3/size" ]; then 24 | l3_cache_size=$(cat /sys/devices/system/cpu/cpu0/cache/index3/size) 25 | l3_cache_ways=$(cat 
/sys/devices/system/cpu/cpu0/cache/index3/ways_of_associativity) 26 | # todo: make this robust 27 | l3_cache_shared_cpu_count=$(cat /sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list | awk -F "," '{print $1}' | awk -F "-" '{print $NF}') 28 | l3_cache_shared_cpu_count=$(($l3_cache_shared_cpu_count+1)) 29 | l3_cache_size_per_core=$(($(echo $l3_cache_size | tr -d "[:alpha:]")/$l3_cache_shared_cpu_count)) 30 | l3_cache_size_per_sock=$(($l3_cache_size_per_core*$num_cores_per_sock)) 31 | l3_cache_size_per_core+=" KB" 32 | l3_cache_size_per_sock+=" KB" 33 | fi 34 | 35 | if [ "$EUID" -eq 0 ]; then 36 | #todo: iterate over all md to verify homogenous config 37 | md_size=$(dmidecode -t memory | grep "Memory Device" -A21 | grep -E 'Size:[[:space:]]*[[:digit:]]+' -m1) 38 | md_type=$(dmidecode -t memory | grep "Memory Device" -A21 | grep -E 'Type:[[:space:]]*DDR' -m1 | awk '{print $NF}') 39 | md_speed=$(dmidecode -t memory | grep "Memory Device" -A21 | grep -v "Configured Memory Speed" | grep -E 'Speed:[[:space:]]*[[:digit:]]+' -m1 | awk -F ":" '{print $NF}') 40 | mem_vendor=$(dmidecode -t memory | grep "${md_size}" -m1 -A21 | grep -m1 "Manufacturer:" | awk '{print $NF}') 41 | # md_configured_speed=$(dmidecode -t memory | grep "Memory Device" -A21 -m1 | grep -E 'Configured Clock Speed:' | awk -F ":" '{print $NF}') 42 | # todo: add 'Locator' info and num_mem_channels_per_sock 43 | num_mem_channels=$(dmidecode -t memory | grep -c "${md_size}") 44 | size=$(echo $md_size | awk '{print $2}') 45 | mem_size_total=$(($size*$num_mem_channels)) 46 | mem_size_total+=$(echo " $(echo $md_size | awk '{print $NF}')") 47 | mem_speed=$md_speed 48 | mem_type=$md_type 49 | mem_size_per_dimm=$(echo $md_size | awk -F ":" '{print $NF}') 50 | peak_mem_bw_system=$(echo "scale=2; (8 * $num_mem_channels * $(echo $mem_speed | tr -d "[:alpha:]|[:punct:]" ) / 1000)" | bc -l) 51 | peak_mem_bw_per_sock=$(echo "scale=2; ($peak_mem_bw_system / $num_socks)" | bc -l) 52 | else 53 | memory_size_total=$(cat 
/proc/meminfo | grep "MemTotal" | awk '{printf ("%.2f GB",$2/1000/1000)}') 54 | fi 55 | 56 | 57 | model_name=$(lscpu | grep "Model name:" | awk -F ":" '{print $NF}' | tr -s "[:space:]") 58 | os_name=$(cat /etc/os-release | grep "PRETTY_NAME" | awk -F "=" '{print $NF}' | tr -d "\"") 59 | kernel_release=$(uname -r) 60 | hostname=$(hostname -f) 61 | thp=$( [ "$(grep -o "\[always\]" /sys/kernel/mm/*transparent_hugepage/enabled)" == "[always]" ] && echo "enabled" || echo "disabled") 62 | icc_version=$(icc --version | head -n1) 63 | 64 | if [ -f /sys/devices/system/cpu/intel_pstate/no_turbo ]; then 65 | cpu_turbo=$( [ "$(cat /sys/devices/system/cpu/intel_pstate/no_turbo)" == 1 ] && echo "disabled" || echo "enabled") 66 | elif [ -f /sys/devices/system/cpu/cpufreq/boost ]; then 67 | cpu_turbo=$( [ "$(cat /sys/devices/system/cpu/cpufreq/boost)" == 1 ] && echo "enabled" || echo "disabled") 68 | fi 69 | cpu_scaling_governor=$(cat /sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_governor | sort -u) 70 | cpu_scaling_driver=$(cat /sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_driver | sort -u) 71 | 72 | lscpu_flags=$(lscpu | grep "Flags:") 73 | if [ -z "${lscpu_flags}" ]; then 74 | lscpu_flags=$(cat /proc/cpuinfo | grep -m1 "flags[[:space:]]*:") 75 | fi 76 | 77 | for isa in avx512f avx2 avx 78 | do 79 | echo ${lscpu_flags} | grep -w ${isa} &> /dev/null 80 | if [ $? 
-eq 0 ]; then 81 | target_cpu=${isa} 82 | break 83 | fi 84 | done 85 | } 86 | 87 | function show_mach_info() 88 | { 89 | echo -e "\nCPU Model = $model_name" 90 | echo -e "\nSockets/Cores/Threads:" 91 | echo -e "\tnum_sockets = $num_socks" 92 | echo -e "\tnum_cores_total = $num_cores_total" 93 | echo -e "\tnum_cores_per_socket = $num_cores_per_sock" 94 | echo -e "\tnum_threads_per_core = $num_threads_per_core" 95 | echo -e "\tHyper-Threading = $ht_enabled" 96 | 97 | echo -e "\nNUMA:" 98 | echo -e "\tnum_numa_domains = $num_numa_domains" 99 | echo -e "\tnum_numa_domains_per_socket = $num_numa_domains_per_sock" 100 | echo -e "\tnum_cores_per_numa_domain = $num_cores_per_numa_domain" 101 | 102 | if [ "$EUID" -eq 0 ]; then 103 | echo -e "\nMemory:" 104 | echo -e "\tmem_vendor = $mem_vendor" 105 | echo -e "\tmem_speed = $mem_speed" 106 | echo -e "\tmem_type = $mem_type" 107 | echo -e "\tmem_size_total = $mem_size_total" 108 | echo -e "\tmem_size_per_dimm = $mem_size_per_dimm" 109 | echo -e "\tnum_mem_channels = $num_mem_channels" 110 | echo -e "\tpeak_mem_bw_system = $peak_mem_bw_system GB/sec" 111 | echo -e "\tpeak_mem_bw_per_sock = $peak_mem_bw_per_sock GB/sec" 112 | else 113 | echo -e "\nMemory = $memory_size_total" 114 | fi 115 | 116 | echo -e "\nCPU Caches:" 117 | echo -e "\tL1_cache = $l1_cache_size (${l1_cache_ways}-way)" 118 | echo -e "\tL2_cache = $l2_cache_size (${l2_cache_ways}-way)" 119 | echo -e "\tL3_cache = $l3_cache_size (${l3_cache_ways}-way)" 120 | echo -e "\tL3_cache_per_sock = $l3_cache_size_per_sock" 121 | echo -e "\tL3_cache_per_core = $l3_cache_size_per_core" 122 | 123 | echo -e "\nOS:" 124 | echo -e "Operating System = $os_name" 125 | echo -e "Kernel version = $kernel_release" 126 | echo -e "CPU Turbo Boost = $cpu_turbo" 127 | echo -e "CPU Scaling Governor = $cpu_scaling_governor" 128 | echo -e "CPU Scaling Driver = $cpu_scaling_driver" 129 | echo -e "Transparent Huge Pages = $thp" 130 | 131 | echo "" 132 | echo "ICC version = ${icc_version}" 133 
| echo "Target ISA = ${target_cpu}" 134 | echo "Hostname = $(hostname -f)" 135 | echo "Date = $(date)" 136 | echo "" 137 | } 138 | 139 | function check_binary() 140 | { 141 | if [ "${target_cpu}" == "avx512f" ]; then 142 | binary=stream_avx512.bin 143 | elif [ "${target_cpu}" == "avx2" ]; then 144 | binary=stream_avx2.bin 145 | elif [ "${target_cpu}" == "avx" ]; then 146 | binary=stream_avx.bin 147 | else 148 | echo "Unknown ISA, aborting.." 149 | exit 1 150 | fi 151 | 152 | if [ ! -f ${binary} ]; then 153 | echo "${binary} not found, aborting.." 154 | exit 1 155 | fi 156 | 157 | objdump -D ${binary} | grep vmovntpd &> /dev/null 158 | if [ $? -eq 0 ]; then 159 | nt_stores_status=exist 160 | stype=nt 161 | else 162 | nt_stores_status="does-not-exist" 163 | stype=rfo 164 | fi 165 | 166 | # objdump -D ${binary} | grep memcpy &> /dev/null 167 | # if [ $? -eq 0 ]; then 168 | # memcpy_status=exist 169 | # else 170 | # memcpy_status="does-not-exist" 171 | # fi 172 | 173 | # objdump -D ${binary} | grep memset &> /dev/null 174 | # if [ $? 
-eq 0 ]; then 175 | # memset_status=exist 176 | # else 177 | # memset_status="does-not-exist" 178 | # fi 179 | 180 | echo "${binary} disassembly:" 2>&1 | tee -a $$-runinfo.log 181 | echo "NT-Stores : ${nt_stores_status}" 2>&1 | tee -a $$-runinfo.log 182 | # echo "Memcpy() : ${memcpy_status}" 2>&1 | tee -a $$-runinfo.log 183 | # echo "Memset() : ${memset_status}" 2>&1 | tee -a $$-runinfo.log 184 | } 185 | 186 | 187 | function bench_simple() 188 | { 189 | l0_dir=$(echo ${model_name} | sed -E -e 's/ /-/g' -e 's/\(R\)|\@|\$|\%//g') 190 | res_dir=${l0_dir}/${stype} 191 | 192 | mkdir -p ${res_dir} 193 | 194 | if [ "${ht_enabled}" == "true" ]; then 195 | export KMP_AFFINITY=granularity=fine,compact,1,0 196 | else 197 | export KMP_AFFINITY=compact 198 | fi 199 | 200 | for t in ${num_cores_total} 201 | do 202 | export OMP_NUM_THREADS=$t 203 | res_file=$(basename ${binary} .bin)_${t}t.log 204 | echo "Running ${binary} with ${t} threads in compact affinity, output log will be saved in ${res_dir}/${res_file}" 205 | 206 | cat $$-runinfo.log > ${res_dir}/${res_file} 207 | ./${binary} &>> ${res_dir}/${res_file} 208 | done 209 | 210 | rm $$-runinfo.log 211 | } 212 | 213 | 214 | function bench_sweep() 215 | { 216 | l0_dir=$(echo ${model_name} | sed -E -e 's/ /-/g' -e 's/\(R\)|\@|\$|\%//g') 217 | l1_dir=nps-$num_numa_domains_per_sock 218 | 219 | #uma runs 220 | mkdir -p ${l0_dir}/${l1_dir}/uma 221 | 222 | for aff in compact distribute 223 | do 224 | res_dir=${l0_dir}/${l1_dir}/uma/${aff}/${stype} 225 | 226 | if [ "$aff" == "compact" ]; then 227 | mkdir -p ${res_dir} 228 | cp $$-runinfo.log ${res_dir}/runinfo.log 229 | if [ "${ht_enabled}" == "true" ]; then 230 | export KMP_AFFINITY=granularity=fine,verbose,compact,1,0 231 | else 232 | export KMP_AFFINITY=verbose,compact 233 | fi 234 | 235 | # for ((t=1;t<=${num_cores_total};t++)); 236 | for t in $(seq 1 ${num_cores_per_sock}) ${num_cores_total}; 237 | do 238 | export OMP_NUM_THREADS=$t 239 | res_file=$(basename ${binary} 
.bin)_${t}t.log 240 | echo "Running ${binary} with ${t} threads in $aff pinning, output log will be saved in ${res_dir}/${res_file}" 241 | 242 | cat $$-runinfo.log > ${res_dir}/${res_file} 243 | ./${binary} &>> ${res_dir}/${res_file} 244 | done 245 | 246 | elif [ "$aff" == "distribute" ]; then 247 | unset KMP_AFFINITY 248 | if [ "${num_numa_domains_per_sock}" == "1" ]; then 249 | continue 250 | fi 251 | mkdir -p ${res_dir} 252 | cp $$-runinfo.log ${res_dir}/runinfo.log 253 | 254 | export OMP_PROC_BIND=spread 255 | export KMP_HW_SUBSET=1s 256 | 257 | for ((t=1;t<=${num_cores_per_sock};t++)); 258 | do 259 | res_file=$(basename ${binary} .bin)_${t}t.log 260 | echo "Running ${binary} with ${t} threads in $aff pinning, output log will be saved in ${res_dir}/${res_file}" 261 | cat $$-runinfo.log > ${res_dir}/${res_file} 262 | export OMP_NUM_THREADS=$t 263 | ./${binary} &>> ${res_dir}/${res_file} 264 | done 265 | fi 266 | done 267 | 268 | #numa runs (with compact affinity only) 269 | res_dir=${l0_dir}/${l1_dir}/numa/compact/${stype} 270 | mkdir -p ${res_dir} 271 | 272 | if [ "${ht_enabled}" == "true" ]; then 273 | export KMP_AFFINITY=granularity=fine,compact,1,0 274 | else 275 | export KMP_AFFINITY=compact 276 | fi 277 | unset OMP_PROC_BIND 278 | unset KMP_HW_SUBSET 279 | 280 | thread_list=(1 ${num_cores_per_numa_domain}) 281 | if [ "${num_cores_per_numa_domain}" != "${num_cores_per_sock}" ]; then 282 | thread_list+=(${num_cores_per_sock}) 283 | fi 284 | 285 | cp $$-runinfo.log ${res_dir}/runinfo.log 286 | for t in ${thread_list[*]} 287 | do 288 | export OMP_NUM_THREADS=$t 289 | for ((id=0;id<${num_numa_domains};id++)); 290 | do 291 | res_file=$(basename ${binary} .bin)_${t}t_m${id}.log 292 | echo "Running ${binary} with ${t} threads from numa-$id, output log will be saved in ${res_dir}/${res_file}" 293 | 294 | cat $$-runinfo.log > ${res_dir}/${res_file} 295 | numactl -m$id ./${binary} &>> ${res_dir}/${res_file} 296 | done 297 | done 298 | 299 | rm $$-runinfo.log 300 | } 
301 | 302 | 303 | mach_info 304 | show_mach_info 2>&1 | tee $$-runinfo.log 305 | check_binary 306 | #bench_sweep 307 | bench_simple 308 | -------------------------------------------------------------------------------- /stream.c: -------------------------------------------------------------------------------- 1 | /*-----------------------------------------------------------------------*/ 2 | /* Program: STREAM */ 3 | /* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */ 4 | /* Original code developed by John D. McCalpin */ 5 | /* Programmers: John D. McCalpin */ 6 | /* Joe R. Zagar */ 7 | /* */ 8 | /* This program measures memory transfer rates in MB/s for simple */ 9 | /* computational kernels coded in C. */ 10 | /*-----------------------------------------------------------------------*/ 11 | /* Copyright 1991-2013: John D. McCalpin */ 12 | /*-----------------------------------------------------------------------*/ 13 | /* License: */ 14 | /* 1. You are free to use this program and/or to redistribute */ 15 | /* this program. */ 16 | /* 2. You are free to modify this program for your own use, */ 17 | /* including commercial use, subject to the publication */ 18 | /* restrictions in item 3. */ 19 | /* 3. You are free to publish results obtained from running this */ 20 | /* program, or from works that you derive from this program, */ 21 | /* with the following limitations: */ 22 | /* 3a. In order to be referred to as "STREAM benchmark results", */ 23 | /* published results must be in conformance to the STREAM */ 24 | /* Run Rules, (briefly reviewed below) published at */ 25 | /* http://www.cs.virginia.edu/stream/ref.html */ 26 | /* and incorporated herein by reference. */ 27 | /* As the copyright holder, John McCalpin retains the */ 28 | /* right to determine conformity with the Run Rules. */ 29 | /* 3b. 
Results based on modified source code or on runs not in */ 30 | /* accordance with the STREAM Run Rules must be clearly */ 31 | /* labelled whenever they are published. Examples of */ 32 | /* proper labelling include: */ 33 | /* "tuned STREAM benchmark results" */ 34 | /* "based on a variant of the STREAM benchmark code" */ 35 | /* Other comparable, clear, and reasonable labelling is */ 36 | /* acceptable. */ 37 | /* 3c. Submission of results to the STREAM benchmark web site */ 38 | /* is encouraged, but not required. */ 39 | /* 4. Use of this program or creation of derived works based on this */ 40 | /* program constitutes acceptance of these licensing restrictions. */ 41 | /* 5. Absolutely no warranty is expressed or implied. */ 42 | /*-----------------------------------------------------------------------*/ 43 | # include 44 | # include 45 | # include 46 | # include 47 | # include 48 | # include 49 | 50 | /*----------------------------------------------------------------------- 51 | * INSTRUCTIONS: 52 | * 53 | * 1) STREAM requires different amounts of memory to run on different 54 | * systems, depending on both the system cache size(s) and the 55 | * granularity of the system timer. 56 | * You should adjust the value of 'STREAM_ARRAY_SIZE' (below) 57 | * to meet *both* of the following criteria: 58 | * (a) Each array must be at least 4 times the size of the 59 | * available cache memory. I don't worry about the difference 60 | * between 10^6 and 2^20, so in practice the minimum array size 61 | * is about 3.8 times the cache size. 62 | * Example 1: One Xeon E3 with 8 MB L3 cache 63 | * STREAM_ARRAY_SIZE should be >= 4 million, giving 64 | * an array size of 30.5 MB and a total memory requirement 65 | * of 91.5 MB. 66 | * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) 67 | * STREAM_ARRAY_SIZE should be >= 20 million, giving 68 | * an array size of 153 MB and a total memory requirement 69 | * of 458 MB. 
70 | * (b) The size should be large enough so that the 'timing calibration' 71 | * output by the program is at least 20 clock-ticks. 72 | * Example: most versions of Windows have a 10 millisecond timer 73 | * granularity. 20 "ticks" at 10 ms/tic is 200 milliseconds. 74 | * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. 75 | * This means the each array must be at least 1 GB, or 128M elements. 76 | * 77 | * Version 5.10 increases the default array size from 2 million 78 | * elements to 10 million elements in response to the increasing 79 | * size of L3 caches. The new default size is large enough for caches 80 | * up to 20 MB. 81 | * Version 5.10 changes the loop index variables from "register int" 82 | * to "ssize_t", which allows array indices >2^32 (4 billion) 83 | * on properly configured 64-bit systems. Additional compiler options 84 | * (such as "-mcmodel=medium") may be required for large memory runs. 85 | * 86 | * Array size can be set at compile time without modifying the source 87 | * code for the (many) compilers that support preprocessor definitions 88 | * on the compile line. E.g., 89 | * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M 90 | * will override the default size of 10M with a new size of 100M elements 91 | * per array. 92 | */ 93 | #ifndef STREAM_ARRAY_SIZE 94 | # define STREAM_ARRAY_SIZE 10000000 95 | #endif 96 | 97 | /* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result 98 | * for any iteration after the first, therefore the minimum value 99 | * for NTIMES is 2. 100 | * There are no rules on maximum allowable values for NTIMES, but 101 | * values larger than the default are unlikely to noticeably 102 | * increase the reported performance. 103 | * NTIMES can also be set on the compile line without changing the source 104 | * code using, for example, "-DNTIMES=7". 
105 | */ 106 | #ifdef NTIMES 107 | #if NTIMES<=1 108 | # define NTIMES 10 109 | #endif 110 | #endif 111 | #ifndef NTIMES 112 | # define NTIMES 10 113 | #endif 114 | 115 | /* Users are allowed to modify the "OFFSET" variable, which *may* change the 116 | * relative alignment of the arrays (though compilers may change the 117 | * effective offset by making the arrays non-contiguous on some systems). 118 | * Use of non-zero values for OFFSET can be especially helpful if the 119 | * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. 120 | * OFFSET can also be set on the compile line without changing the source 121 | * code using, for example, "-DOFFSET=56". 122 | */ 123 | #ifndef OFFSET 124 | # define OFFSET 0 125 | #endif 126 | 127 | /* 128 | * 3) Compile the code with optimization. Many compilers generate 129 | * unreasonably bad code before the optimizer tightens things up. 130 | * If the results are unreasonably good, on the other hand, the 131 | * optimizer might be too smart for me! 132 | * 133 | * For a simple single-core version, try compiling with: 134 | * cc -O stream.c -o stream 135 | * This is known to work on many, many systems.... 136 | * 137 | * To use multiple cores, you need to tell the compiler to obey the OpenMP 138 | * directives in the code. This varies by compiler, but a common example is 139 | * gcc -O -fopenmp stream.c -o stream_omp 140 | * The environment variable OMP_NUM_THREADS allows runtime control of the 141 | * number of threads/cores used when the resulting "stream_omp" program 142 | * is executed. 143 | * 144 | * To run with single-precision variables and arithmetic, simply add 145 | * -DSTREAM_TYPE=float 146 | * to the compile line. 147 | * Note that this changes the minimum array sizes required --- see (1) above. 148 | * 149 | * The preprocessor directive "TUNED" does not do much -- it simply causes the 150 | * code to call separate functions to execute each kernel. 
Trivial versions 151 | * of these functions are provided, but they are *not* tuned -- they just 152 | * provide predefined interfaces to be replaced with tuned code. 153 | * 154 | * 155 | * 4) Optional: Mail the results to mccalpin@cs.virginia.edu 156 | * Be sure to include info that will help me understand: 157 | * a) the computer hardware configuration (e.g., processor model, memory type) 158 | * b) the compiler name/version and compilation flags 159 | * c) any run-time information (such as OMP_NUM_THREADS) 160 | * d) all of the output from the test case. 161 | * 162 | * Thanks! 163 | * 164 | *-----------------------------------------------------------------------*/ 165 | 166 | # define HLINE "-------------------------------------------------------------\n" 167 | 168 | # ifndef MIN 169 | # define MIN(x,y) ((x)<(y)?(x):(y)) 170 | # endif 171 | # ifndef MAX 172 | # define MAX(x,y) ((x)>(y)?(x):(y)) 173 | # endif 174 | 175 | #ifndef STREAM_TYPE 176 | #define STREAM_TYPE double 177 | #endif 178 | 179 | static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], 180 | b[STREAM_ARRAY_SIZE+OFFSET], 181 | c[STREAM_ARRAY_SIZE+OFFSET]; 182 | 183 | static double avgtime[4] = {0}, maxtime[4] = {0}, 184 | mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 185 | 186 | static char *label[4] = {"Copy: ", "Scale: ", 187 | "Add: ", "Triad: "}; 188 | 189 | static double bytes[4] = { 190 | 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, 191 | 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, 192 | 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, 193 | 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE 194 | }; 195 | 196 | extern double mysecond(); 197 | extern void checkSTREAMresults(); 198 | #ifdef TUNED 199 | extern void tuned_STREAM_Copy(); 200 | extern void tuned_STREAM_Scale(STREAM_TYPE scalar); 201 | extern void tuned_STREAM_Add(); 202 | extern void tuned_STREAM_Triad(STREAM_TYPE scalar); 203 | #endif 204 | #ifdef _OPENMP 205 | extern int omp_get_num_threads(); 206 | #endif 207 | int 208 | main() 209 | { 210 | 
int quantum, checktick(); 211 | int BytesPerWord; 212 | int k; 213 | ssize_t j; 214 | STREAM_TYPE scalar; 215 | double t, times[4][NTIMES]; 216 | 217 | /* --- SETUP --- determine precision and check timing --- */ 218 | 219 | printf(HLINE); 220 | printf("STREAM version $Revision: 5.10 $\n"); 221 | printf(HLINE); 222 | BytesPerWord = sizeof(STREAM_TYPE); 223 | printf("This system uses %d bytes per array element.\n", 224 | BytesPerWord); 225 | 226 | printf(HLINE); 227 | #ifdef N 228 | printf("***** WARNING: ******\n"); 229 | printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); 230 | printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); 231 | printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); 232 | printf("***** WARNING: ******\n"); 233 | #endif 234 | 235 | printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); 236 | printf("Memory per array = %.1f MiB (= %.1f GiB).\n", 237 | BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), 238 | BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); 239 | printf("Total memory required = %.1f MiB (= %.1f GiB).\n", 240 | (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), 241 | (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); 242 | printf("Each kernel will be executed %d times.\n", NTIMES); 243 | printf(" The *best* time for each kernel (excluding the first iteration)\n"); 244 | printf(" will be used to compute the reported bandwidth.\n"); 245 | 246 | #ifdef _OPENMP 247 | printf(HLINE); 248 | #pragma omp parallel 249 | { 250 | #pragma omp master 251 | { 252 | k = omp_get_num_threads(); 253 | printf ("Number of Threads requested = %i\n",k); 254 | } 255 | } 256 | #endif 257 | 258 | #ifdef _OPENMP 259 | k = 0; 260 | #pragma omp parallel 261 | #pragma omp 
atomic
262 | k++;
263 | printf ("Number of Threads counted = %i\n",k);
264 | #endif
265 | 
266 | /* Get initial value for system clock. */
267 | #pragma omp parallel for
268 | for (j=0; j<STREAM_ARRAY_SIZE; j++) {
269 | a[j] = 1.0;
270 | b[j] = 2.0;
271 | c[j] = 0.0;
272 | }
273 | 
274 | printf(HLINE);
275 | 
276 | if ( (quantum = checktick()) >= 1)
277 | printf("Your clock granularity/precision appears to be "
278 | "%d microseconds.\n", quantum);
279 | else {
280 | printf("Your clock granularity appears to be "
281 | "less than one microsecond.\n");
282 | quantum = 1;
283 | }
284 | 
285 | t = mysecond();
286 | #pragma omp parallel for
287 | for (j = 0; j < STREAM_ARRAY_SIZE; j++)
288 | a[j] = 2.0E0 * a[j];
289 | t = 1.0E6 * (mysecond() - t);
290 | 
291 | printf("Each test below will take on the order"
292 | " of %d microseconds.\n", (int) t );
293 | printf(" (= %d clock ticks)\n", (int) (t/quantum) );
294 | printf("Increase the size of the arrays if this shows that\n");
295 | printf("you are not getting at least 20 clock ticks per test.\n");
296 | 
297 | printf(HLINE);
298 | 
299 | printf("WARNING -- The above is only a rough guideline.\n");
300 | printf("For best results, please be sure you know the\n");
301 | printf("precision of your system timer.\n");
302 | printf(HLINE);
303 | 
304 | /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
305 | 
306 | scalar = 3.0;
307 | for (k=0; k<NTIMES; k++)
	{
	times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];

	times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];

	times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];

	times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /* --- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
	for (j=0; j<4; j++)
	    {
	    avgtime[j] = avgtime[j] + times[j][k];
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}

    printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
	avgtime[j] = avgtime[j]/(double)(NTIMES-1);

	printf("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       avgtime[j],
	       mintime[j],
	       maxtime[j]);
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

    return 0;
}

# define	M	20

int
checktick()
    {
    int		i, minDelta, Delta;
    double	t1, t2, timesfound[M];

/*  Collect a sequence of M unique time values from the system. */

    for (i = 0; i < M; i++) {
	t1 = mysecond();
	while( ((t2=mysecond()) - t1) < 1.0E-6 )
	    ;
	timesfound[i] = t1 = t2;
	}

/*
 * Determine the minimum difference between these M values.
 * This result will be an estimate of the clock granularity.
 */

    minDelta = 1000000;
    for (i = 1; i < M; i++) {
	Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
	minDelta = MIN(minDelta, Delta);
	}

   return(minDelta);
    }

/* A gettimeofday routine to give access to the wall
   clock timer on most UNIX-like systems.  */

#include <sys/time.h>
419 | 
420 | double mysecond()
421 | {
422 | struct timeval tp;
423 | struct timezone tzp;
424 | int i;
425 | 
426 | i = gettimeofday(&tp,&tzp);
427 | return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
428 | }
429 | 
430 | #ifndef abs
431 | #define abs(a) ((a) >= 0 ?
(a) : -(a))
432 | #endif
433 | void checkSTREAMresults ()
434 | {
435 | STREAM_TYPE aj,bj,cj,scalar;
436 | STREAM_TYPE aSumErr,bSumErr,cSumErr;
437 | STREAM_TYPE aAvgErr,bAvgErr,cAvgErr;
438 | double epsilon;
439 | ssize_t j;
440 | int k,ierr,err;
441 | 
442 | /* reproduce initialization */
443 | aj = 1.0;
444 | bj = 2.0;
445 | cj = 0.0;
446 | /* a[] is modified during timing check */
447 | aj = 2.0E0 * aj;
448 | /* now execute timing loop */
449 | scalar = 3.0;
450 | for (k=0; k<NTIMES; k++)
        {
            cj = aj;
            bj = scalar*cj;
            cj = aj+bj;
            aj = bj+scalar*cj;
        }

    /* accumulate deltas between observed and expected results */
    aSumErr = 0.0;
    bSumErr = 0.0;
    cSumErr = 0.0;
    for (j=0; j<STREAM_ARRAY_SIZE; j++) {
	aSumErr += abs(a[j] - aj);
	bSumErr += abs(b[j] - bj);
	cSumErr += abs(c[j] - cj);
    }
    aAvgErr = aSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
    bAvgErr = bSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
    cAvgErr = cSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;

    if (sizeof(STREAM_TYPE) == 4) {
	epsilon = 1.e-6;
    }
    else if (sizeof(STREAM_TYPE) == 8) {
	epsilon = 1.e-13;
    }
    else {
	printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(STREAM_TYPE));
	epsilon = 1.e-6;
    }

    err = 0;
484 | if (abs(aAvgErr/aj) > epsilon) {
485 | err++;
486 | printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
487 | printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
488 | ierr = 0;
489 | for (j=0; j<STREAM_ARRAY_SIZE; j++) {
490 | if (abs(aj-a[j])/aj > epsilon) {
491 | ierr++;
492 | #ifdef VERBOSE
493 | if (ierr < 10) {
494 | printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n",
495 | j,aj,a[j],abs((aj-a[j])/aAvgErr));
496 | }
497 | #endif
498 | }
499 | }
500 | printf(" For array a[], %d errors were found.\n",ierr);
501 | }
502 | if (abs(bAvgErr/bj) > epsilon) {
503 | err++;
504 | printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
505 | printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
506 | printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon);
507 | ierr = 0;
508 | for (j=0; j<STREAM_ARRAY_SIZE; j++) {
509 | if (abs(bj-b[j])/bj > epsilon) {
510 | ierr++;
511 | #ifdef VERBOSE
512 | if (ierr < 10) {
513 | printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n",
514 | j,bj,b[j],abs((bj-b[j])/bAvgErr));
515 | }
516 | #endif
517 | }
518 | }
519 | printf(" For array b[], %d errors were found.\n",ierr);
520 | }
521 | if (abs(cAvgErr/cj) > epsilon) {
522 | err++;
523 | printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
524 | printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
525 | printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon);
526 | ierr = 0;
527 | for (j=0; j<STREAM_ARRAY_SIZE; j++) {
528 | if (abs(cj-c[j])/cj > epsilon) {
529 | ierr++;
530 | #ifdef VERBOSE
531 | if (ierr < 10) {
532 | printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n",
533 | j,cj,c[j],abs((cj-c[j])/cAvgErr));
534 | }
535 | #endif
536 | }
537 | }
538 | printf(" For array c[], %d errors were found.\n",ierr);
539 | }
540 | if (err == 0) {
541 | printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
542 | }
543 | #ifdef VERBOSE
544 | printf ("Results Validation Verbose Results: \n");
545 | printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj);
546 | printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]);
547 | printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj));
548 | #endif
549 | }
550 | 
551 | #ifdef TUNED
552 | /* stubs for "tuned" versions of the kernels */
553 | void tuned_STREAM_Copy()
554 | {
555 | ssize_t j;
556 | #pragma omp parallel for
557 | for (j=0; j