├── README.md
├── RUNner.py
├── STREAMer.py
├── local_PMDK
    ├── local_PMDK_Add_plot.svg
    ├── local_PMDK_Copy_plot.svg
    ├── local_PMDK_Scale_plot.svg
    ├── local_PMDK_Triad_plot.svg
    └── noHT_Socket0_Socket0DDR5DAX_Close_noFT_@mnt@pmem0_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.2.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
├── noHT_Socket0Socket1_Socket1DDR5DAX_Spread_noFT_@mnt@pmem1_Arrays100000000_Cores10
    ├── output.1.txt
    ├── output.10.txt
    ├── output.11.txt
    ├── output.12.txt
    ├── output.13.txt
    ├── output.14.txt
    ├── output.15.txt
    ├── output.16.txt
    ├── output.17.txt
    ├── output.18.txt
    ├── output.19.txt
    ├── output.2.txt
    ├── output.20.txt
    ├── output.3.txt
    ├── output.4.txt
    ├── output.5.txt
    ├── output.6.txt
    ├── output.7.txt
    ├── output.8.txt
    ├── output.9.txt
    └── output_data.csv
├── plot_groups.py
├── plot_results.py
├── remote_NUMA
    ├── noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.2.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.2.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.2.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.2.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── remote_NUMA_Add_plot.svg
    ├── remote_NUMA_Copy_plot.svg
    ├── remote_NUMA_Scale_plot.svg
    └── remote_NUMA_Triad_plot.svg
├── remote_NUMA_allcores
    ├── noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.11.txt
    │   ├── output.12.txt
    │   ├── output.13.txt
    │   ├── output.14.txt
    │   ├── output.15.txt
    │   ├── output.16.txt
    │   ├── output.17.txt
    │   ├── output.18.txt
    │   ├── output.19.txt
    │   ├── output.2.txt
    │   ├── output.20.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.11.txt
    │   ├── output.12.txt
    │   ├── output.13.txt
    │   ├── output.14.txt
    │   ├── output.15.txt
    │   ├── output.16.txt
    │   ├── output.17.txt
    │   ├── output.18.txt
    │   ├── output.19.txt
    │   ├── output.2.txt
    │   ├── output.20.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.11.txt
    │   ├── output.12.txt
    │   ├── output.13.txt
    │   ├── output.14.txt
    │   ├── output.15.txt
    │   ├── output.16.txt
    │   ├── output.17.txt
    │   ├── output.18.txt
    │   ├── output.19.txt
    │   ├── output.2.txt
    │   ├── output.20.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.11.txt
    │   ├── output.12.txt
    │   ├── output.13.txt
    │   ├── output.14.txt
    │   ├── output.15.txt
    │   ├── output.16.txt
    │   ├── output.17.txt
    │   ├── output.18.txt
    │   ├── output.19.txt
    │   ├── output.2.txt
    │   ├── output.20.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── noHT_Socket0Socket1_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.11.txt
    │   ├── output.12.txt
    │   ├── output.13.txt
    │   ├── output.14.txt
    │   ├── output.15.txt
    │   ├── output.16.txt
    │   ├── output.17.txt
    │   ├── output.18.txt
    │   ├── output.19.txt
    │   ├── output.2.txt
    │   ├── output.20.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── remote_NUMA_allcores_Add_plot.svg
    ├── remote_NUMA_allcores_Copy_plot.svg
    ├── remote_NUMA_allcores_Scale_plot.svg
    └── remote_NUMA_allcores_Triad_plot.svg
├── remote_PMDK
    ├── noHT_Socket0_CXLDAX_Close_noFT_@mnt@pmem2_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.2.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── noHT_Socket0_Socket1DDR5DAX_Close_noFT_@mnt@pmem1_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.2.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── noHT_Socket1_CXLDAX_Close_noFT_@mnt@pmem2_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.2.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── remote_PMDK_Add_plot.svg
    ├── remote_PMDK_Copy_plot.svg
    ├── remote_PMDK_Scale_plot.svg
    └── remote_PMDK_Triad_plot.svg
├── remote_PMDK_affinity
    ├── noHT_Socket0Socket1_CXLDAX_Close_noFT_@mnt@pmem2_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.11.txt
    │   ├── output.12.txt
    │   ├── output.13.txt
    │   ├── output.14.txt
    │   ├── output.15.txt
    │   ├── output.16.txt
    │   ├── output.17.txt
    │   ├── output.18.txt
    │   ├── output.19.txt
    │   ├── output.2.txt
    │   ├── output.20.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── noHT_Socket0Socket1_CXLDAX_Spread_noFT_@mnt@pmem2_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.11.txt
    │   ├── output.12.txt
    │   ├── output.13.txt
    │   ├── output.14.txt
    │   ├── output.15.txt
    │   ├── output.16.txt
    │   ├── output.17.txt
    │   ├── output.18.txt
    │   ├── output.19.txt
    │   ├── output.2.txt
    │   ├── output.20.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── noHT_Socket0Socket1_Socket0DDR5DAX_Close_noFT_@mnt@pmem0_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.11.txt
    │   ├── output.12.txt
    │   ├── output.13.txt
    │   ├── output.14.txt
    │   ├── output.15.txt
    │   ├── output.16.txt
    │   ├── output.17.txt
    │   ├── output.18.txt
    │   ├── output.19.txt
    │   ├── output.2.txt
    │   ├── output.20.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── noHT_Socket0Socket1_Socket0DDR5DAX_Spread_noFT_@mnt@pmem0_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.11.txt
    │   ├── output.12.txt
    │   ├── output.13.txt
    │   ├── output.14.txt
    │   ├── output.15.txt
    │   ├── output.16.txt
    │   ├── output.17.txt
    │   ├── output.18.txt
    │   ├── output.19.txt
    │   ├── output.2.txt
    │   ├── output.20.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── noHT_Socket0Socket1_Socket1DDR5DAX_Close_noFT_@mnt@pmem1_Arrays100000000_Cores10
    │   ├── output.1.txt
    │   ├── output.10.txt
    │   ├── output.11.txt
    │   ├── output.12.txt
    │   ├── output.13.txt
    │   ├── output.14.txt
    │   ├── output.15.txt
    │   ├── output.16.txt
    │   ├── output.17.txt
    │   ├── output.18.txt
    │   ├── output.19.txt
    │   ├── output.2.txt
    │   ├── output.20.txt
    │   ├── output.3.txt
    │   ├── output.4.txt
    │   ├── output.5.txt
    │   ├── output.6.txt
    │   ├── output.7.txt
    │   ├── output.8.txt
    │   ├── output.9.txt
    │   └── output_data.csv
    ├── remote_PMDK_affinity_Add_plot.svg
    ├── remote_PMDK_affinity_Copy_plot.svg
    ├── remote_PMDK_affinity_Scale_plot.svg
    └── remote_PMDK_affinity_Triad_plot.svg
├── script_stream_run.sh
├── stream.c
└── stream_pmemobj.c


/RUNner.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | import subprocess
 3 | 
 4 | # Define mutually exclusive sublists of options
 5 | exclusive_options = [
 6 |     ["--noHT", "--HT"],
 7 |     ["--Socket0", "--Socket1", "--Socket0Socket1"],
 8 |     ["--Socket0DDR4", "--Socket1DDR4", "--CXLDDR4", "--Socket0DDR5", "--Socket1DDR5", "--CXLDAX", "--Socket0DDR4DAX", "--Socket1DDR4DAX", "--Socket0DDR5DAX", "--Socket1DDR5DAX", "--Socket0OptaneDAX", "--Socket1OptaneDAX"],
 9 |     ["--Close", "--Spread"],
10 |     ["--noFT", "--FT"]
11 | ]
12 | 
13 | # DAX options and flag
14 | dax_options = ["--CXLDAX", "--Socket0DDR4DAX", "--Socket1DDR4DAX", "--Socket0DDR5DAX", "--Socket1DDR5DAX", "--Socket0OptaneDAX", "--Socket1OptaneDAX"]
15 | dax_flag = "--DAX_Path"
16 | 
17 | # DAX_Path options
18 | dax_path_options = ["/mnt/pmem0", "/mnt/pmem1", "/mnt/pmem2"]
19 | 
20 | # Generate all possible permutations of mutually exclusive sublists
21 | permutations = list(itertools.product(*exclusive_options))
22 | 
23 | # Iterate over each permutation and execute the STREAMer.py script
24 | for perm in permutations:
25 |     if any(option in perm for option in dax_options):
26 |         # Add DAX_Path flag when DAX-related options are present
27 |         for dax_path_option in dax_path_options:
28 |             command = ["python3", "STREAMer.py"] + list(perm) + [dax_flag, dax_path_option]
29 |             subprocess.run(command)
30 |     else:
31 |         command = ["python3", "STREAMer.py"] + list(perm)
32 |         subprocess.run(command)
33 | 


--------------------------------------------------------------------------------
/local_PMDK/noHT_Socket0_Socket0DDR5DAX_Close_noFT_@mnt@pmem0_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,8140.2,6716.8,6521.1,6852.5
 3 | 2,14915.5,12815.3,12892.1,13532.9
 4 | 3,18794.8,16889.2,18332.1,19110.7
 5 | 4,20983.6,19371.2,22218.3,22664.2
 6 | 5,19789.2,18180.4,21874.1,21589.5
 7 | 6,20115.4,18963.6,22291.4,21911.0
 8 | 7,20865.1,19722.4,22806.3,22587.4
 9 | 8,21261.6,20310.2,23367.9,23111.4
10 | 9,21320.8,20406.1,23399.6,23140.6
11 | 10,21356.9,20464.0,23407.1,23183.5
12 | 


--------------------------------------------------------------------------------
/noHT_Socket0Socket1_Socket1DDR5DAX_Spread_noFT_@mnt@pmem1_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,7208.4,6118.5,5971.5,6317.1
 3 | 2,12931.4,11357.7,11787.8,12397.1
 4 | 3,15356.7,13953.2,15963.0,16731.1
 5 | 4,17427.1,16382.5,19065.2,19593.0
 6 | 5,17111.9,16802.0,19584.8,20058.8
 7 | 6,17846.4,17700.3,20764.7,21041.9
 8 | 7,16260.1,15610.7,18711.2,19033.9
 9 | 8,16938.8,16465.4,19569.5,19843.9
10 | 9,17600.6,17175.9,20260.9,20559.0
11 | 10,17057.2,16702.8,19803.0,20253.2
12 | 11,17647.2,17284.2,20196.7,20634.7
13 | 12,18107.8,17765.5,20600.7,20962.3
14 | 13,18444.5,18121.1,20941.1,21242.5
15 | 14,18821.6,18463.4,21235.5,21503.6
16 | 15,19104.0,18577.9,21509.0,21702.8
17 | 16,18735.5,18324.2,21180.5,21453.1
18 | 17,18258.2,17902.1,20740.1,21177.1
19 | 18,17882.5,17792.8,20484.3,20945.3
20 | 19,17652.1,17522.0,20267.0,20712.7
21 | 20,17525.2,17324.9,20108.2,20595.2
22 | 


--------------------------------------------------------------------------------
/plot_results.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | import matplotlib.pyplot as plt
 3 | import sys
 4 | 
 5 | 
 6 | def plot(path):
 7 | 	# Initialize empty lists to store data
 8 | 	number_of_threads = []
 9 | 	copy_rate = []
10 | 	scale_rate = []
11 | 	add_rate = []
12 | 	triad_rate = []
13 | 
14 | 	# Read data from CSV file
15 | 	with open(path+'/output_data.csv', 'r') as file:
16 | 	    reader = csv.reader(file)
17 | 	    next(reader)  # Skip header row
18 | 	    for row in reader:
19 | 	        # Extract data from each row
20 | 	        thread_count = int(row[0])
21 |         	copy = float(row[1])
22 |         	scale = float(row[2])
23 |        		add = float(row[3])
24 |         	triad = float(row[4])
25 |         	# Append data to respective lists
26 |         	number_of_threads.append(thread_count)
27 |         	copy_rate.append(copy)
28 |         	scale_rate.append(scale)
29 |         	add_rate.append(add)
30 |         	triad_rate.append(triad)
31 | 
32 |     	# Create plot
33 | 	plt.plot(number_of_threads, copy_rate, label='Copy Rate')
34 | 	plt.plot(number_of_threads, scale_rate, label='Scale Rate')
35 | 	plt.plot(number_of_threads, add_rate, label='Add Rate')
36 | 	plt.plot(number_of_threads, triad_rate, label='Triad Rate')
37 |        
38 |     	# Set plot labels and title
39 | 	plt.xlabel('Number of Threads')
40 | 	plt.ylabel('Rate (MB/s)')
41 | 	#plt.title('STREAM Performance Rates')
42 | 	# Add legend
43 | 	plt.legend()
44 | 	plt.tight_layout()
45 | 	# Save the plot as an SVG image
46 | 	plt.savefig(path+'/graph_results.svg', format='svg')
47 | 	print("plot saved in " + path + "/graph_results.svg")
48 |  	# Display the plot
49 | 	plt.show()
50 | 
51 | if __name__ == "__main__":
52 | 	if len(sys.argv) !=2:
53 | 		print("Need to provide relative path to directory with csv file")
54 | 		sys.exit(1)
55 | 
56 | 	path=sys.argv[1]
57 | 	plot(path)
58 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 1
14 | Number of Threads counted = 1
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 227669 microseconds.
18 |    (= 227669 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            4177.7     0.383159     0.382985     0.383744
28 | Scale:           4453.8     0.359574     0.359247     0.360168
29 | Add:             4638.5     0.517660     0.517408     0.517927
30 | Triad:           4678.9     0.513146     0.512946     0.513394
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 10
14 | Number of Threads counted = 10
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 118813 microseconds.
18 |    (= 118813 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9743.6     0.165786     0.164210     0.168108
28 | Scale:           9647.6     0.166856     0.165844     0.168576
29 | Add:            11230.2     0.214807     0.213710     0.217668
30 | Triad:          11130.2     0.217182     0.215629     0.217890
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 2
14 | Number of Threads counted = 2
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 160424 microseconds.
18 |    (= 160424 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            7367.8     0.217990     0.217162     0.218728
28 | Scale:           7741.0     0.207786     0.206692     0.209875
29 | Add:             8155.2     0.295132     0.294291     0.296964
30 | Triad:           8135.7     0.295597     0.294995     0.296586
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 3
14 | Number of Threads counted = 3
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 131028 microseconds.
18 |    (= 131028 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9006.1     0.179122     0.177658     0.180951
28 | Scale:           9311.2     0.172805     0.171836     0.174209
29 | Add:            10237.7     0.235379     0.234428     0.237341
30 | Triad:          10253.5     0.234449     0.234067     0.234923
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 4
14 | Number of Threads counted = 4
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 119729 microseconds.
18 |    (= 119729 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9638.0     0.168352     0.166010     0.170339
28 | Scale:           9771.8     0.164892     0.163737     0.166000
29 | Add:            11000.1     0.219277     0.218179     0.221557
30 | Triad:          11093.6     0.217202     0.216340     0.217835
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 5
14 | Number of Threads counted = 5
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 121135 microseconds.
18 |    (= 121135 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9687.0     0.167617     0.165169     0.171813
28 | Scale:           9851.2     0.163980     0.162417     0.165658
29 | Add:            11304.2     0.213760     0.212310     0.216574
30 | Triad:          11297.5     0.213347     0.212436     0.214055
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 6
14 | Number of Threads counted = 6
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 118418 microseconds.
18 |    (= 118418 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9630.7     0.167710     0.166135     0.169061
28 | Scale:           9676.3     0.166171     0.165353     0.168232
29 | Add:            11261.0     0.214420     0.213125     0.216631
30 | Triad:          11229.0     0.215100     0.213733     0.215776
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 7
14 | Number of Threads counted = 7
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 122730 microseconds.
18 |    (= 122730 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9543.9     0.169305     0.167646     0.171809
28 | Scale:           9496.2     0.169338     0.168489     0.170688
29 | Add:            11143.8     0.216745     0.215366     0.219450
30 | Triad:          11069.2     0.218164     0.216817     0.219454
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 8
14 | Number of Threads counted = 8
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 119171 microseconds.
18 |    (= 119171 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9576.4     0.168928     0.167078     0.171079
28 | Scale:           9610.1     0.167281     0.166492     0.168647
29 | Add:            11169.2     0.216210     0.214877     0.218697
30 | Triad:          11054.9     0.218647     0.217098     0.219202
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 9
14 | Number of Threads counted = 9
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 119756 microseconds.
18 |    (= 119756 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9764.8     0.166471     0.163854     0.169869
28 | Scale:           9585.1     0.168336     0.166925     0.170970
29 | Add:            11196.5     0.216214     0.214352     0.218981
30 | Triad:          11063.4     0.218335     0.216931     0.219295
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,4177.7,4453.8,4638.5,4678.9
 3 | 2,7367.8,7741.0,8155.2,8135.7
 4 | 3,9006.1,9311.2,10237.7,10253.5
 5 | 4,9638.0,9771.8,11000.1,11093.6
 6 | 5,9687.0,9851.2,11304.2,11297.5
 7 | 6,9630.7,9676.3,11261.0,11229.0
 8 | 7,9543.9,9496.2,11143.8,11069.2
 9 | 8,9576.4,9610.1,11169.2,11054.9
10 | 9,9764.8,9585.1,11196.5,11063.4
11 | 10,9743.6,9647.6,11230.2,11130.2
12 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 1
14 | Number of Threads counted = 1
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 142261 microseconds.
18 |    (= 142261 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            7649.2     0.209402     0.209173     0.209594
28 | Scale:           7111.8     0.225200     0.224978     0.225500
29 | Add:             7886.9     0.304502     0.304302     0.304624
30 | Triad:           7170.9     0.334785     0.334685     0.335058
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 10
14 | Number of Threads counted = 10
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 147850 microseconds.
18 |    (= 147850 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            8938.4     0.179859     0.179002     0.181012
28 | Scale:           6706.9     0.240479     0.238561     0.242180
29 | Add:             8953.6     0.269101     0.268050     0.270377
30 | Triad:           7165.7     0.336927     0.334930     0.339037
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 2
14 | Number of Threads counted = 2
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 135459 microseconds.
18 |    (= 135459 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9145.3     0.175240     0.174954     0.175798
28 | Scale:           7527.0     0.214455     0.212568     0.215430
29 | Add:             9131.0     0.263540     0.262842     0.264041
30 | Triad:           7965.5     0.303535     0.301298     0.307011
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 3
14 | Number of Threads counted = 3
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 133390 microseconds.
18 |    (= 133390 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9456.7     0.169369     0.169193     0.169611
28 | Scale:           7430.5     0.216405     0.215329     0.217398
29 | Add:             9226.2     0.260533     0.260129     0.260752
30 | Triad:           7730.6     0.314205     0.310453     0.317681
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 4
14 | Number of Threads counted = 4
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 135694 microseconds.
18 |    (= 135694 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9273.9     0.172736     0.172527     0.172923
28 | Scale:           7443.6     0.217441     0.214949     0.219167
29 | Add:             9200.9     0.261867     0.260844     0.263139
30 | Triad:           7643.4     0.315713     0.313995     0.318028
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 5
14 | Number of Threads counted = 5
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 138097 microseconds.
18 |    (= 138097 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9235.1     0.173603     0.173252     0.174022
28 | Scale:           7339.2     0.218982     0.218007     0.220910
29 | Add:             9121.8     0.263477     0.263107     0.264479
30 | Triad:           7643.1     0.315631     0.314008     0.317526
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 6
14 | Number of Threads counted = 6
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 138486 microseconds.
18 |    (= 138486 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9196.8     0.174216     0.173974     0.174672
28 | Scale:           7246.1     0.223006     0.220809     0.224585
29 | Add:             9149.7     0.262754     0.262305     0.264823
30 | Triad:           7540.7     0.319968     0.318274     0.321880
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 7
14 | Number of Threads counted = 7
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 144020 microseconds.
18 |    (= 144020 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9114.2     0.176091     0.175550     0.177001
28 | Scale:           6978.2     0.234265     0.229286     0.237691
29 | Add:             8585.9     0.285256     0.279529     0.289700
30 | Triad:           6907.6     0.355230     0.347442     0.362309
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 8
14 | Number of Threads counted = 8
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 145522 microseconds.
18 |    (= 145522 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9050.8     0.177671     0.176780     0.179603
28 | Scale:           6931.6     0.233437     0.230828     0.236525
29 | Add:             8958.8     0.268808     0.267893     0.270326
30 | Triad:           7357.1     0.326778     0.326217     0.327710
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 9
14 | Number of Threads counted = 9
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 150119 microseconds.
18 |    (= 150119 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9035.2     0.177318     0.177085     0.177999
28 | Scale:           6813.4     0.236120     0.234831     0.238476
29 | Add:             8982.7     0.268134     0.267179     0.268993
30 | Triad:           7297.4     0.331060     0.328883     0.332905
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,7649.2,7111.8,7886.9,7170.9
 3 | 2,9145.3,7527.0,9131.0,7965.5
 4 | 3,9456.7,7430.5,9226.2,7730.6
 5 | 4,9273.9,7443.6,9200.9,7643.4
 6 | 5,9235.1,7339.2,9121.8,7643.1
 7 | 6,9196.8,7246.1,9149.7,7540.7
 8 | 7,9114.2,6978.2,8585.9,6907.6
 9 | 8,9050.8,6931.6,8958.8,7357.1
10 | 9,9035.2,6813.4,8982.7,7297.4
11 | 10,8938.4,6706.9,8953.6,7165.7
12 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 1
14 | Number of Threads counted = 1
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 147510 microseconds.
18 |    (= 147510 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            8122.2     0.197648     0.196991     0.198985
28 | Scale:           8258.0     0.194080     0.193752     0.194539
29 | Add:             9051.4     0.266801     0.265151     0.274405
30 | Triad:           8948.6     0.269025     0.268199     0.270529
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 10
14 | Number of Threads counted = 10
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 84199 microseconds.
18 |    (= 84199 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           14735.2     0.108968     0.108583     0.109307
28 | Scale:          14633.4     0.110203     0.109339     0.111848
29 | Add:            17564.1     0.137254     0.136642     0.138523
30 | Triad:          17594.5     0.136560     0.136406     0.136818
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 2
14 | Number of Threads counted = 2
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 90823 microseconds.
18 |    (= 90823 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           12673.4     0.126774     0.126249     0.128944
28 | Scale:          12767.9     0.125896     0.125314     0.127201
29 | Add:            14735.1     0.163475     0.162876     0.164560
30 | Triad:          14830.3     0.162153     0.161831     0.162724
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 3
14 | Number of Threads counted = 3
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 82517 microseconds.
18 |    (= 82517 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           14437.9     0.110902     0.110819     0.110989
28 | Scale:          14506.4     0.110808     0.110296     0.111780
29 | Add:            17170.3     0.140238     0.139776     0.141177
30 | Triad:          17322.0     0.138728     0.138552     0.139019
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 4
14 | Number of Threads counted = 4
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 80292 microseconds.
18 |    (= 80292 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           15012.9     0.106614     0.106575     0.106645
28 | Scale:          14980.4     0.107572     0.106806     0.109009
29 | Add:            17824.5     0.135150     0.134646     0.136118
30 | Triad:          17974.8     0.133713     0.133520     0.133924
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 5
14 | Number of Threads counted = 5
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 84152 microseconds.
18 |    (= 84152 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           15036.0     0.106631     0.106411     0.106852
28 | Scale:          14909.5     0.107977     0.107314     0.109335
29 | Add:            17837.0     0.135107     0.134552     0.136241
30 | Triad:          17866.3     0.134406     0.134331     0.134471
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 6
14 | Number of Threads counted = 6
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 85071 microseconds.
18 |    (= 85071 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           14928.4     0.107416     0.107178     0.107723
28 | Scale:          14797.5     0.108833     0.108126     0.110281
29 | Add:            17739.3     0.135854     0.135293     0.137124
30 | Triad:          17783.4     0.135152     0.134957     0.135424
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 7
14 | Number of Threads counted = 7
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 83728 microseconds.
18 |    (= 83728 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           14848.8     0.108092     0.107753     0.109187
28 | Scale:          14691.9     0.109851     0.108904     0.113299
29 | Add:            17698.3     0.136251     0.135606     0.137606
30 | Triad:          17723.0     0.135560     0.135417     0.135799
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 8
14 | Number of Threads counted = 8
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 84120 microseconds.
18 |    (= 84120 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           14778.1     0.108549     0.108268     0.108815
28 | Scale:          14688.9     0.109721     0.108926     0.111354
29 | Add:            17645.1     0.136675     0.136015     0.137922
30 | Triad:          17677.3     0.136146     0.135767     0.137189
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 9
14 | Number of Threads counted = 9
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 84061 microseconds.
18 |    (= 84061 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           14765.1     0.108689     0.108364     0.109062
28 | Scale:          14669.5     0.109852     0.109070     0.111517
29 | Add:            17589.3     0.137128     0.136447     0.138518
30 | Triad:          17636.8     0.136229     0.136079     0.136503
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,8122.2,8258.0,9051.4,8948.6
 3 | 2,12673.4,12767.9,14735.1,14830.3
 4 | 3,14437.9,14506.4,17170.3,17322.0
 5 | 4,15012.9,14980.4,17824.5,17974.8
 6 | 5,15036.0,14909.5,17837.0,17866.3
 7 | 6,14928.4,14797.5,17739.3,17783.4
 8 | 7,14848.8,14691.9,17698.3,17723.0
 9 | 8,14778.1,14688.9,17645.1,17677.3
10 | 9,14765.1,14669.5,17589.3,17636.8
11 | 10,14735.2,14633.4,17564.1,17594.5
12 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 1
14 | Number of Threads counted = 1
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 434396 microseconds.
18 |    (= 434396 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            2450.4     0.654293     0.652952     0.655779
28 | Scale:           2602.0     0.615969     0.614914     0.617592
29 | Add:             2848.5     0.843352     0.842535     0.844672
30 | Triad:           2864.9     0.838663     0.837733     0.839957
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 10
14 | Number of Threads counted = 10
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 136446 microseconds.
18 |    (= 136446 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            8626.4     0.186404     0.185478     0.188040
28 | Scale:           8785.9     0.182599     0.182109     0.183500
29 | Add:            10370.5     0.231867     0.231425     0.232534
30 | Triad:          10348.9     0.232330     0.231909     0.232666
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 2
14 | Number of Threads counted = 2
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 238327 microseconds.
18 |    (= 238327 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            4528.1     0.357132     0.353352     0.358273
28 | Scale:           4746.5     0.340201     0.337094     0.341245
29 | Add:             5218.3     0.463050     0.459918     0.464174
30 | Triad:           5222.0     0.461689     0.459593     0.462247
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 3
14 | Number of Threads counted = 3
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 210314 microseconds.
18 |    (= 210314 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            5961.1     0.269586     0.268406     0.271293
28 | Scale:           6167.7     0.260234     0.259418     0.261446
29 | Add:             6966.9     0.345178     0.344487     0.346143
30 | Triad:           6925.5     0.347255     0.346547     0.348450
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 4
14 | Number of Threads counted = 4
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 176455 microseconds.
18 |    (= 176455 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            6949.1     0.231458     0.230245     0.232467
28 | Scale:           7138.8     0.224621     0.224126     0.225022
29 | Add:             8244.9     0.291539     0.291089     0.291957
30 | Triad:           8198.2     0.293055     0.292747     0.293226
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 5
14 | Number of Threads counted = 5
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 160072 microseconds.
18 |    (= 160072 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            7646.4     0.210942     0.209248     0.211725
28 | Scale:           7804.5     0.205275     0.205010     0.205561
29 | Add:             9089.8     0.264554     0.264032     0.265015
30 | Triad:           9037.3     0.265860     0.265567     0.266212
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 6
14 | Number of Threads counted = 6
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 148901 microseconds.
18 |    (= 148901 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            8192.3     0.196060     0.195306     0.197153
28 | Scale:           8439.6     0.190439     0.189583     0.191185
29 | Add:             9769.0     0.246163     0.245674     0.246740
30 | Triad:           9730.3     0.247081     0.246652     0.247457
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 7
14 | Number of Threads counted = 7
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 143800 microseconds.
18 |    (= 143800 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            8389.2     0.191643     0.190722     0.192348
28 | Scale:           8582.5     0.187288     0.186427     0.188239
29 | Add:            10042.0     0.239348     0.238995     0.239802
30 | Triad:          10030.9     0.239512     0.239260     0.239790
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 8
14 | Number of Threads counted = 8
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 140364 microseconds.
18 |    (= 140364 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            8508.6     0.188757     0.188045     0.189753
28 | Scale:           8704.8     0.184343     0.183806     0.185229
29 | Add:            10262.2     0.234329     0.233869     0.234844
30 | Triad:          10245.3     0.234708     0.234253     0.234987
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 9
14 | Number of Threads counted = 9
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 138659 microseconds.
18 |    (= 138659 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            8593.0     0.186830     0.186198     0.188966
28 | Scale:           8747.7     0.183420     0.182906     0.184697
29 | Add:            10309.7     0.233122     0.232790     0.233643
30 | Triad:          10300.9     0.233343     0.232989     0.233806
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,2450.4,2602.0,2848.5,2864.9
 3 | 2,4528.1,4746.5,5218.3,5222.0
 4 | 3,5961.1,6167.7,6966.9,6925.5
 5 | 4,6949.1,7138.8,8244.9,8198.2
 6 | 5,7646.4,7804.5,9089.8,9037.3
 7 | 6,8192.3,8439.6,9769.0,9730.3
 8 | 7,8389.2,8582.5,10042.0,10030.9
 9 | 8,8508.6,8704.8,10262.2,10245.3
10 | 9,8593.0,8747.7,10309.7,10300.9
11 | 10,8626.4,8785.9,10370.5,10348.9
12 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 1
14 | Number of Threads counted = 1
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 227902 microseconds.
18 |    (= 227902 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            4179.1     0.383256     0.382854     0.383994
28 | Scale:           4453.5     0.359587     0.359269     0.360131
29 | Add:             4636.2     0.517811     0.517666     0.517979
30 | Triad:           4678.6     0.513289     0.512969     0.513564
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 10
14 | Number of Threads counted = 10
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 118496 microseconds.
18 |    (= 118496 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9695.3     0.166336     0.165028     0.169559
28 | Scale:           9664.7     0.166638     0.165551     0.169249
29 | Add:            11234.8     0.215182     0.213621     0.219362
30 | Triad:          11107.6     0.217278     0.216069     0.217959
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.11.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 11
14 | Number of Threads counted = 11
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 122984 microseconds.
18 |    (= 122984 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9320.9     0.173304     0.171657     0.175774
28 | Scale:           9269.1     0.173222     0.172616     0.175255
29 | Add:            10717.8     0.225097     0.223926     0.227463
30 | Triad:          10655.4     0.225992     0.225238     0.226534
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.12.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 12
14 | Number of Threads counted = 12
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 131266 microseconds.
18 |    (= 131266 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9421.1     0.171242     0.169831     0.173345
28 | Scale:           9324.4     0.172572     0.171593     0.173529
29 | Add:            10832.3     0.222469     0.221560     0.223770
30 | Triad:          10770.4     0.223738     0.222833     0.224532
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.13.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 13
14 | Number of Threads counted = 13
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 131184 microseconds.
18 |    (= 131184 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9499.9     0.169635     0.168423     0.170948
28 | Scale:           9405.7     0.171128     0.170109     0.171827
29 | Add:            10906.8     0.220886     0.220047     0.222127
30 | Triad:          10830.5     0.222424     0.221596     0.222960
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.14.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 14
14 | Number of Threads counted = 14
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 132295 microseconds.
18 |    (= 132295 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9531.6     0.168828     0.167863     0.170388
28 | Scale:           9394.1     0.170825     0.170319     0.171582
29 | Add:            10961.4     0.219918     0.218951     0.221417
30 | Triad:          10859.9     0.221567     0.220996     0.221906
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.15.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 15
14 | Number of Threads counted = 15
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 130723 microseconds.
18 |    (= 130723 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9558.3     0.169207     0.167394     0.170829
28 | Scale:           9405.4     0.170808     0.170115     0.171596
29 | Add:            10944.5     0.220128     0.219288     0.221848
30 | Triad:          10887.3     0.221051     0.220440     0.221366
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.16.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 16
14 | Number of Threads counted = 16
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 128455 microseconds.
18 |    (= 128455 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9542.9     0.168918     0.167664     0.170478
28 | Scale:           9377.1     0.171206     0.170629     0.172111
29 | Add:            10946.4     0.220042     0.219250     0.221368
30 | Triad:          10876.5     0.221576     0.220659     0.221982
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.17.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 17
14 | Number of Threads counted = 17
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 126279 microseconds.
18 |    (= 126279 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9524.3     0.169623     0.167992     0.172267
28 | Scale:           9313.5     0.172264     0.171793     0.173948
29 | Add:            10886.2     0.221329     0.220463     0.222398
30 | Triad:          10796.1     0.223514     0.222303     0.226614
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.18.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 18
14 | Number of Threads counted = 18
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 127269 microseconds.
18 |    (= 127269 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9452.3     0.170355     0.169271     0.171998
28 | Scale:           9269.8     0.173278     0.172604     0.174098
29 | Add:            10821.1     0.222471     0.221789     0.223816
30 | Triad:          10733.0     0.224401     0.223609     0.224870
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.19.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 19
14 | Number of Threads counted = 19
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 127732 microseconds.
18 |    (= 127732 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9406.7     0.171181     0.170092     0.173099
28 | Scale:           9234.8     0.174006     0.173257     0.174614
29 | Add:            10785.6     0.223619     0.222518     0.225288
30 | Triad:          10686.8     0.225419     0.224576     0.225824
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 2
14 | Number of Threads counted = 2
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 135847 microseconds.
18 |    (= 135847 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            7337.1     0.219060     0.218070     0.220588
28 | Scale:           7716.4     0.208026     0.207351     0.209754
29 | Add:             8193.7     0.294265     0.292908     0.296555
30 | Triad:           8241.2     0.291917     0.291219     0.293191
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.20.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 20
14 | Number of Threads counted = 20
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 127912 microseconds.
18 |    (= 127912 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9351.1     0.171879     0.171103     0.173050
28 | Scale:           9205.1     0.174853     0.173817     0.176800
29 | Add:            10743.7     0.224536     0.223386     0.225779
30 | Triad:          10626.6     0.226989     0.225848     0.232955
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 3
14 | Number of Threads counted = 3
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 127053 microseconds.
18 |    (= 127053 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            8998.0     0.178954     0.177817     0.181178
28 | Scale:           9308.1     0.172936     0.171894     0.174343
29 | Add:            10254.5     0.235032     0.234044     0.237108
30 | Triad:          10238.3     0.234687     0.234414     0.235237
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 4
14 | Number of Threads counted = 4
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 123038 microseconds.
18 |    (= 123038 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9718.8     0.167448     0.164630     0.169944
28 | Scale:           9809.6     0.164312     0.163105     0.166159
29 | Add:            11022.9     0.218774     0.217728     0.220746
30 | Triad:          11092.4     0.217360     0.216364     0.217960
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 5
14 | Number of Threads counted = 5
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 119091 microseconds.
18 |    (= 119091 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9679.7     0.167780     0.165294     0.170541
28 | Scale:           9803.7     0.164740     0.163203     0.165928
29 | Add:            11268.0     0.214475     0.212993     0.216835
30 | Triad:          11315.9     0.213512     0.212091     0.214203
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 6
14 | Number of Threads counted = 6
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 119348 microseconds.
18 |    (= 119348 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9765.9     0.167010     0.163836     0.170884
28 | Scale:           9746.1     0.165846     0.164169     0.168201
29 | Add:            11322.1     0.213940     0.211974     0.217361
30 | Triad:          11224.1     0.214637     0.213825     0.215391
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 7
14 | Number of Threads counted = 7
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 120600 microseconds.
18 |    (= 120600 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9634.0     0.169093     0.166079     0.171605
28 | Scale:           9531.0     0.169752     0.167873     0.172459
29 | Add:            11203.6     0.216979     0.214217     0.220045
30 | Triad:          11060.4     0.218029     0.216990     0.218821
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 8
14 | Number of Threads counted = 8
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 119760 microseconds.
18 |    (= 119760 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9608.7     0.168858     0.166515     0.171386
28 | Scale:           9575.8     0.168100     0.167088     0.169256
29 | Add:            11156.7     0.216447     0.215117     0.218926
30 | Triad:          11037.6     0.218711     0.217439     0.219366
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 9
14 | Number of Threads counted = 9
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 118345 microseconds.
18 |    (= 118345 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9747.4     0.166639     0.164147     0.169889
28 | Scale:           9630.1     0.167524     0.166145     0.169930
29 | Add:            11201.7     0.215795     0.214253     0.219663
30 | Triad:          11047.2     0.218182     0.217249     0.218931
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,4179.1,4453.5,4636.2,4678.6
 3 | 2,7337.1,7716.4,8193.7,8241.2
 4 | 3,8998.0,9308.1,10254.5,10238.3
 5 | 4,9718.8,9809.6,11022.9,11092.4
 6 | 5,9679.7,9803.7,11268.0,11315.9
 7 | 6,9765.9,9746.1,11322.1,11224.1
 8 | 7,9634.0,9531.0,11203.6,11060.4
 9 | 8,9608.7,9575.8,11156.7,11037.6
10 | 9,9747.4,9630.1,11201.7,11047.2
11 | 10,9695.3,9664.7,11234.8,11107.6
12 | 11,9320.9,9269.1,10717.8,10655.4
13 | 12,9421.1,9324.4,10832.3,10770.4
14 | 13,9499.9,9405.7,10906.8,10830.5
15 | 14,9531.6,9394.1,10961.4,10859.9
16 | 15,9558.3,9405.4,10944.5,10887.3
17 | 16,9542.9,9377.1,10946.4,10876.5
18 | 17,9524.3,9313.5,10886.2,10796.1
19 | 18,9452.3,9269.8,10821.1,10733.0
20 | 19,9406.7,9234.8,10785.6,10686.8
21 | 20,9351.1,9205.1,10743.7,10626.6
22 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 1
14 | Number of Threads counted = 1
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 103253 microseconds.
18 |    (= 103253 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           10749.3     0.149110     0.148847     0.149554
28 | Scale:          10998.2     0.145643     0.145479     0.145896
29 | Add:            12198.7     0.197189     0.196743     0.197743
30 | Triad:          12131.6     0.198040     0.197830     0.198258
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 10
14 | Number of Threads counted = 10
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 94912 microseconds.
18 |    (= 94912 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           12011.3     0.134133     0.133208     0.135429
28 | Scale:          11928.5     0.135436     0.134132     0.137006
29 | Add:            13701.9     0.176716     0.175158     0.178243
30 | Triad:          13666.3     0.177067     0.175614     0.179717
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.11.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 11
14 | Number of Threads counted = 11
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 98126 microseconds.
18 |    (= 98126 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           11599.8     0.138732     0.137933     0.140258
28 | Scale:          11328.1     0.141536     0.141242     0.142320
29 | Add:            12939.4     0.186310     0.185480     0.187470
30 | Triad:          12679.4     0.189788     0.189284     0.190346
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 2
14 | Number of Threads counted = 2
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 89407 microseconds.
18 |    (= 89407 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           12267.9     0.131068     0.130422     0.132556
28 | Scale:          11891.3     0.136629     0.134552     0.138995
29 | Add:            13979.9     0.172135     0.171675     0.174251
30 | Triad:          13872.4     0.173677     0.173005     0.174913
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 3
14 | Number of Threads counted = 3
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 92433 microseconds.
18 |    (= 92433 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           12557.1     0.133354     0.127418     0.144179
28 | Scale:          12090.8     0.133383     0.132332     0.134496
29 | Add:            14114.6     0.171665     0.170037     0.175098
30 | Triad:          14105.0     0.171175     0.170152     0.171659
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 4
14 | Number of Threads counted = 4
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 93810 microseconds.
18 |    (= 93810 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           12478.3     0.134921     0.128223     0.147896
28 | Scale:          12243.4     0.132095     0.130683     0.134728
29 | Add:            14192.3     0.171367     0.169106     0.178828
30 | Triad:          14093.7     0.171056     0.170289     0.174184
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 5
14 | Number of Threads counted = 5
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 94319 microseconds.
18 |    (= 94319 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           12063.9     0.141477     0.132627     0.148486
28 | Scale:          12235.0     0.132526     0.130773     0.135351
29 | Add:            13954.7     0.173991     0.171985     0.176238
30 | Triad:          14057.4     0.171101     0.170729     0.171554
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 6
14 | Number of Threads counted = 6
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 95283 microseconds.
18 |    (= 95283 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           12179.6     0.134699     0.131367     0.143013
28 | Scale:          12159.1     0.133358     0.131589     0.135637
29 | Add:            14167.5     0.171946     0.169402     0.173677
30 | Triad:          14136.1     0.170780     0.169778     0.171887
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 7
14 | Number of Threads counted = 7
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 95813 microseconds.
18 |    (= 95813 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           11922.5     0.138979     0.134200     0.145591
28 | Scale:          11945.6     0.137373     0.133940     0.140009
29 | Add:            13139.3     0.188596     0.182658     0.196630
30 | Triad:          13505.8     0.181404     0.177702     0.186060
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 8
14 | Number of Threads counted = 8
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 94635 microseconds.
18 |    (= 94635 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           12008.3     0.134107     0.133241     0.135240
28 | Scale:          12079.2     0.133946     0.132459     0.135378
29 | Add:            13769.0     0.175073     0.174304     0.175722
30 | Triad:          13815.6     0.175299     0.173717     0.176703
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 9
14 | Number of Threads counted = 9
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 93999 microseconds.
18 |    (= 93999 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           12138.0     0.133188     0.131817     0.136354
28 | Scale:          12113.9     0.132852     0.132080     0.133873
29 | Add:            13834.8     0.174206     0.173476     0.174980
30 | Triad:          13842.5     0.173928     0.173379     0.174637
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,10749.3,10998.2,12198.7,12131.6
 3 | 2,12267.9,11891.3,13979.9,13872.4
 4 | 3,12557.1,12090.8,14114.6,14105.0
 5 | 4,12478.3,12243.4,14192.3,14093.7
 6 | 5,12063.9,12235.0,13954.7,14057.4
 7 | 6,12179.6,12159.1,14167.5,14136.1
 8 | 7,11922.5,11945.6,13139.3,13505.8
 9 | 8,12008.3,12079.2,13769.0,13815.6
10 | 9,12138.0,12113.9,13834.8,13842.5
11 | 10,12011.3,11928.5,13701.9,13666.3
12 | 11,11599.8,11328.1,12939.4,12679.4
13 | 12,11417.1,10814.3,12569.0,12105.1
14 | 13,11176.7,10430.0,12112.5,11548.8
15 | 14,11015.1,10011.5,11763.5,11083.7
16 | 15,10817.9,9736.0,11517.5,10775.8
17 | 16,10652.2,9458.2,11244.8,10438.2
18 | 17,10491.7,9212.1,11053.2,10189.1
19 | 18,10389.9,9063.8,10902.7,9938.3
20 | 19,10166.7,8889.2,10700.2,9687.7
21 | 20,10074.4,8561.7,10556.9,9444.8
22 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 1
14 | Number of Threads counted = 1
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 99098 microseconds.
18 |    (= 99098 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           11169.0     0.143379     0.143254     0.143599
28 | Scale:          11427.9     0.140131     0.140008     0.140532
29 | Add:            12515.9     0.191923     0.191756     0.192138
30 | Triad:          12386.9     0.193884     0.193753     0.194148
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 10
14 | Number of Threads counted = 10
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 51015 microseconds.
18 |    (= 51015 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           21824.8     0.073816     0.073311     0.074746
28 | Scale:          21271.2     0.075652     0.075219     0.076559
29 | Add:            24219.7     0.099511     0.099093     0.100451
30 | Triad:          23949.9     0.100611     0.100209     0.100764
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.11.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 11
14 | Number of Threads counted = 11
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 53475 microseconds.
18 |    (= 53475 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           22379.8     0.072495     0.071493     0.074542
28 | Scale:          21944.3     0.073600     0.072912     0.075188
29 | Add:            24884.1     0.097026     0.096447     0.098490
30 | Triad:          24561.5     0.097853     0.097714     0.098232
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.12.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 12
14 | Number of Threads counted = 12
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 53128 microseconds.
18 |    (= 53128 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           21753.6     0.073842     0.073551     0.074189
28 | Scale:          20644.9     0.078236     0.077501     0.079786
29 | Add:            23987.3     0.100812     0.100053     0.102518
30 | Triad:          24006.7     0.100223     0.099972     0.100374
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.13.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 13
14 | Number of Threads counted = 13
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 56228 microseconds.
18 |    (= 56228 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           21052.6     0.076175     0.076000     0.076391
28 | Scale:          20288.6     0.079909     0.078862     0.082120
29 | Add:            23470.5     0.103159     0.102256     0.105094
30 | Triad:          23451.5     0.102529     0.102339     0.102742
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.14.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 14
14 | Number of Threads counted = 14
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 58840 microseconds.
18 |    (= 58840 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           20363.5     0.078672     0.078572     0.078875
28 | Scale:          19825.5     0.081920     0.080704     0.084184
29 | Add:            22906.0     0.105705     0.104776     0.107502
30 | Triad:          22975.1     0.104746     0.104461     0.105137
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.15.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 15
14 | Number of Threads counted = 15
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 61054 microseconds.
18 |    (= 61054 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           19848.4     0.080768     0.080611     0.080894
28 | Scale:          19424.1     0.083487     0.082372     0.085731
29 | Add:            22472.5     0.107705     0.106797     0.109424
30 | Triad:          22595.3     0.106420     0.106217     0.107017
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.16.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 16
14 | Number of Threads counted = 16
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 62754 microseconds.
18 |    (= 62754 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           19342.8     0.082981     0.082718     0.083202
28 | Scale:          19020.7     0.085309     0.084119     0.088074
29 | Add:            22007.0     0.109913     0.109056     0.111735
30 | Triad:          22189.5     0.108260     0.108159     0.108453
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.17.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 17
14 | Number of Threads counted = 17
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 63843 microseconds.
18 |    (= 63843 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           18954.9     0.084897     0.084411     0.085132
28 | Scale:          18570.9     0.087470     0.086156     0.090096
29 | Add:            21632.2     0.111818     0.110946     0.113704
30 | Triad:          21798.8     0.110186     0.110098     0.110343
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.18.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 18
14 | Number of Threads counted = 18
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 64707 microseconds.
18 |    (= 64707 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           18559.1     0.086891     0.086211     0.087141
28 | Scale:          18280.1     0.088951     0.087527     0.091739
29 | Add:            21321.0     0.113434     0.112565     0.115459
30 | Triad:          21457.7     0.111982     0.111848     0.112349
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.19.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 19
14 | Number of Threads counted = 19
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 67893 microseconds.
18 |    (= 67893 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           17922.9     0.089976     0.089271     0.092796
28 | Scale:          17826.3     0.090803     0.089755     0.092935
29 | Add:            20920.1     0.115709     0.114722     0.118971
30 | Triad:          21061.3     0.114280     0.113953     0.114938
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 2
14 | Number of Threads counted = 2
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 65845 microseconds.
18 |    (= 65845 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           16891.9     0.095052     0.094720     0.095707
28 | Scale:          16953.4     0.094644     0.094376     0.095121
29 | Add:            19359.7     0.124284     0.123969     0.124973
30 | Triad:          19376.4     0.123982     0.123862     0.124171
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.20.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 20
14 | Number of Threads counted = 20
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 67439 microseconds.
18 |    (= 67439 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           17896.7     0.089699     0.089402     0.090050
28 | Scale:          17707.7     0.091581     0.090356     0.094052
29 | Add:            20873.9     0.115722     0.114976     0.117304
30 | Triad:          21031.4     0.114377     0.114115     0.114735
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 3
14 | Number of Threads counted = 3
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 58531 microseconds.
18 |    (= 58531 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           19575.4     0.082119     0.081735     0.082829
28 | Scale:          19450.3     0.082538     0.082261     0.083065
29 | Add:            22747.1     0.105959     0.105508     0.106768
30 | Triad:          22657.1     0.106208     0.105927     0.106470
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 4
14 | Number of Threads counted = 4
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 54024 microseconds.
18 |    (= 54024 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           21233.6     0.075713     0.075352     0.076412
28 | Scale:          21018.1     0.076475     0.076125     0.077094
29 | Add:            24262.1     0.099321     0.098920     0.100081
30 | Triad:          24100.8     0.099769     0.099582     0.099914
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 5
14 | Number of Threads counted = 5
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 52400 microseconds.
18 |    (= 52400 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           21982.6     0.073319     0.072785     0.074226
28 | Scale:          21471.9     0.074806     0.074516     0.075342
29 | Add:            24503.3     0.098370     0.097946     0.099153
30 | Triad:          24266.7     0.099380     0.098901     0.099614
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 6
14 | Number of Threads counted = 6
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 51758 microseconds.
18 |    (= 51758 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           21980.8     0.073274     0.072791     0.074173
28 | Scale:          21403.5     0.075070     0.074754     0.075791
29 | Add:            24401.2     0.098825     0.098356     0.099682
30 | Triad:          24088.2     0.099961     0.099634     0.100286
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 7
14 | Number of Threads counted = 7
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 51126 microseconds.
18 |    (= 51126 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           21943.0     0.073408     0.072916     0.074311
28 | Scale:          21351.2     0.075296     0.074937     0.076113
29 | Add:            24367.2     0.098915     0.098493     0.099845
30 | Triad:          24097.3     0.100070     0.099596     0.100271
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 8
14 | Number of Threads counted = 8
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 51128 microseconds.
18 |    (= 51128 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           21883.4     0.073652     0.073115     0.074568
28 | Scale:          21306.6     0.075492     0.075094     0.076318
29 | Add:            24289.3     0.099231     0.098809     0.100140
30 | Triad:          24031.2     0.100315     0.099870     0.100499
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 9
14 | Number of Threads counted = 9
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 51070 microseconds.
18 |    (= 51070 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           21839.7     0.073804     0.073261     0.074766
28 | Scale:          21287.6     0.075542     0.075161     0.076381
29 | Add:            24247.6     0.099440     0.098979     0.100322
30 | Triad:          23999.8     0.100416     0.100001     0.100628
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,11169.0,11427.9,12515.9,12386.9
 3 | 2,16891.9,16953.4,19359.7,19376.4
 4 | 3,19575.4,19450.3,22747.1,22657.1
 5 | 4,21233.6,21018.1,24262.1,24100.8
 6 | 5,21982.6,21471.9,24503.3,24266.7
 7 | 6,21980.8,21403.5,24401.2,24088.2
 8 | 7,21943.0,21351.2,24367.2,24097.3
 9 | 8,21883.4,21306.6,24289.3,24031.2
10 | 9,21839.7,21287.6,24247.6,23999.8
11 | 10,21824.8,21271.2,24219.7,23949.9
12 | 11,22379.8,21944.3,24884.1,24561.5
13 | 12,21753.6,20644.9,23987.3,24006.7
14 | 13,21052.6,20288.6,23470.5,23451.5
15 | 14,20363.5,19825.5,22906.0,22975.1
16 | 15,19848.4,19424.1,22472.5,22595.3
17 | 16,19342.8,19020.7,22007.0,22189.5
18 | 17,18954.9,18570.9,21632.2,21798.8
19 | 18,18559.1,18280.1,21321.0,21457.7
20 | 19,17922.9,17826.3,20920.1,21061.3
21 | 20,17896.7,17707.7,20873.9,21031.4
22 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 1
14 | Number of Threads counted = 1
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 142498 microseconds.
18 |    (= 142498 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            7650.3     0.209291     0.209142     0.209377
28 | Scale:           7114.0     0.225146     0.224908     0.225412
29 | Add:             7885.2     0.304556     0.304367     0.304696
30 | Triad:           7174.2     0.334648     0.334533     0.334807
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 2
14 | Number of Threads counted = 2
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 133343 microseconds.
18 |    (= 133343 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9165.3     0.175186     0.174571     0.175849
28 | Scale:           7481.1     0.214358     0.213872     0.215060
29 | Add:             9108.9     0.263958     0.263480     0.264399
30 | Triad:           7945.9     0.305224     0.302041     0.307478
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 3
14 | Number of Threads counted = 3
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 132772 microseconds.
18 |    (= 132772 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9456.8     0.169338     0.169191     0.169775
28 | Scale:           7429.1     0.216375     0.215370     0.217333
29 | Add:             9230.5     0.260784     0.260008     0.261224
30 | Triad:           7657.9     0.314127     0.313404     0.315308
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 4
14 | Number of Threads counted = 4
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 136607 microseconds.
18 |    (= 136607 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9292.2     0.173044     0.172188     0.173473
28 | Scale:           7423.6     0.217133     0.215530     0.218579
29 | Add:             9213.9     0.261356     0.260477     0.262803
30 | Triad:           7693.4     0.314494     0.311955     0.318190
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 5
14 | Number of Threads counted = 5
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 137934 microseconds.
18 |    (= 137934 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9238.6     0.173540     0.173186     0.173787
28 | Scale:           7345.0     0.219550     0.217836     0.222778
29 | Add:             9132.4     0.262974     0.262800     0.263200
30 | Triad:           7638.0     0.316099     0.314217     0.318165
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 6
14 | Number of Threads counted = 6
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 139503 microseconds.
18 |    (= 139503 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9196.7     0.174072     0.173975     0.174149
28 | Scale:           7236.5     0.222046     0.221100     0.223674
29 | Add:             9142.6     0.263525     0.262507     0.264566
30 | Triad:           7524.5     0.322067     0.318958     0.325934
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 7
14 | Number of Threads counted = 7
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 144078 microseconds.
18 |    (= 144078 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9110.8     0.176944     0.175615     0.178452
28 | Scale:           6968.3     0.233828     0.229611     0.236156
29 | Add:             8608.4     0.284117     0.278797     0.293368
30 | Triad:           6842.4     0.360628     0.350755     0.368618
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 8
14 | Number of Threads counted = 8
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 146898 microseconds.
18 |    (= 146898 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9071.3     0.176979     0.176380     0.177868
28 | Scale:           6952.8     0.231661     0.230122     0.233326
29 | Add:             8957.1     0.268989     0.267943     0.270572
30 | Triad:           7327.4     0.330048     0.327537     0.336869
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 9
14 | Number of Threads counted = 9
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 148554 microseconds.
18 |    (= 148554 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            9011.8     0.178404     0.177544     0.179654
28 | Scale:           6835.7     0.235976     0.234066     0.238939
29 | Add:             8954.4     0.269527     0.268024     0.270976
30 | Triad:           7274.3     0.331591     0.329930     0.334091
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,7650.3,7114.0,7885.2,7174.2
 3 | 2,9165.3,7481.1,9108.9,7945.9
 4 | 3,9456.8,7429.1,9230.5,7657.9
 5 | 4,9292.2,7423.6,9213.9,7693.4
 6 | 5,9238.6,7345.0,9132.4,7638.0
 7 | 6,9196.7,7236.5,9142.6,7524.5
 8 | 7,9110.8,6968.3,8608.4,6842.4
 9 | 8,9071.3,6952.8,8957.1,7327.4
10 | 9,9011.8,6835.7,8954.4,7274.3
11 | 10,8922.9,6711.6,8970.8,7155.6
12 | 11,9236.1,6966.2,9227.6,7609.6
13 | 12,9467.1,7269.0,9553.9,7960.3
14 | 13,9584.4,7505.9,9682.8,8239.9
15 | 14,9702.0,7557.0,9890.8,8375.4
16 | 15,9755.0,7916.4,10070.1,8652.9
17 | 16,9953.2,8089.6,10279.3,8878.9
18 | 17,10021.0,8268.8,10385.6,9064.2
19 | 18,10073.3,8370.9,10503.7,9224.0
20 | 19,10035.9,8494.6,10497.7,9335.8
21 | 20,9999.4,8548.2,10594.2,9463.4
22 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 1
14 | Number of Threads counted = 1
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 147413 microseconds.
18 |    (= 147413 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:            8121.6     0.197196     0.197005     0.197409
28 | Scale:           8256.7     0.193908     0.193782     0.194059
29 | Add:             9056.7     0.265123     0.264997     0.265290
30 | Triad:           8951.4     0.268308     0.268113     0.268659
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 10
14 | Number of Threads counted = 10
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 84671 microseconds.
18 |    (= 84671 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           14703.4     0.109125     0.108818     0.109462
28 | Scale:          14629.5     0.110176     0.109368     0.111873
29 | Add:            17544.8     0.137415     0.136793     0.138812
30 | Triad:          17579.9     0.136705     0.136519     0.136982
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.11.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------
 2 | STREAM version $Revision: 5.10 $
 3 | -------------------------------------------------------------
 4 | This system uses 8 bytes per array element.
 5 | -------------------------------------------------------------
 6 | Array size = 100000000 (elements), Offset = 0 (elements)
 7 | Memory per array = 762.9 MiB (= 0.7 GiB).
 8 | Total memory required = 2288.8 MiB (= 2.2 GiB).
 9 | Each kernel will be executed 10 times.
10 |  The *best* time for each kernel (excluding the first iteration)
11 |  will be used to compute the reported bandwidth.
12 | -------------------------------------------------------------
13 | Number of Threads requested = 11
14 | Number of Threads counted = 11
15 | -------------------------------------------------------------
16 | Your clock granularity/precision appears to be 1 microseconds.
17 | Each test below will take on the order of 85628 microseconds.
18 |    (= 85628 clock ticks)
19 | Increase the size of the arrays if this shows that
20 | you are not getting at least 20 clock ticks per test.
21 | -------------------------------------------------------------
22 | WARNING -- The above is only a rough guideline.
23 | For best results, please be sure you know the
24 | precision of your system timer.
25 | -------------------------------------------------------------
26 | Function    Best Rate MB/s  Avg time     Min time     Max time
27 | Copy:           15726.4     0.102540     0.101740     0.104150
28 | Scale:          15441.2     0.104464     0.103619     0.106550
29 | Add:            18612.0     0.129761     0.128949     0.131864
30 | Triad:          18493.8     0.130050     0.129773     0.130910
31 | -------------------------------------------------------------
32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays
33 | -------------------------------------------------------------
34 | 


--------------------------------------------------------------------------------
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,8121.6,8256.7,9056.7,8951.4
 3 | 2,12699.6,12775.8,14846.7,14763.8
 4 | 3,14504.3,14494.4,17258.4,17266.3
 5 | 4,15047.4,14908.5,17845.5,17862.3
 6 | 5,15049.6,14888.3,17836.7,17846.1
 7 | 6,14926.3,14780.9,17727.5,17764.4
 8 | 7,14831.1,14718.2,17684.9,17720.4
 9 | 8,14777.1,14687.6,17628.9,17660.2
10 | 9,14768.0,14666.9,17593.5,17632.0
11 | 10,14703.4,14629.5,17544.8,17579.9
12 | 11,15726.4,15441.2,18612.0,18493.8
13 | 12,15835.9,15453.0,18748.7,18971.3
14 | 13,16166.5,15805.3,19052.6,19136.6
15 | 14,16450.6,16086.7,19352.8,19401.8
16 | 15,16519.4,16270.6,19455.6,19568.0
17 | 16,16800.9,16654.8,19831.4,20036.1
18 | 17,17032.1,16997.6,20110.8,20366.9
19 | 18,17292.0,17295.8,20386.1,20643.2
20 | 19,17561.4,17589.2,20615.3,20900.6
21 | 20,17870.3,17844.4,20895.0,21125.1
22 | 


--------------------------------------------------------------------------------
/remote_PMDK/noHT_Socket0_CXLDAX_Close_noFT_@mnt@pmem2_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,3929.9,3787.4,3863.1,3881.9
 3 | 2,6767.7,6691.4,6840.1,6858.8
 4 | 3,8102.7,7986.7,8420.1,8326.7
 5 | 4,8545.3,8371.0,9151.6,9019.2
 6 | 5,8130.1,7471.5,8852.3,8723.4
 7 | 6,8305.4,7667.6,9060.9,8837.9
 8 | 7,8557.8,8051.5,9187.6,9023.0
 9 | 8,8770.0,8472.1,9414.1,9287.4
10 | 9,8827.4,8403.6,9529.9,9345.6
11 | 10,8817.3,8453.2,9507.3,9335.8
12 | 


--------------------------------------------------------------------------------
/remote_PMDK/noHT_Socket0_Socket1DDR5DAX_Close_noFT_@mnt@pmem1_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,7201.7,6109.9,5952.7,6304.4
 3 | 2,11963.1,10631.6,11463.0,12043.5
 4 | 3,14026.3,12967.4,15141.2,15769.1
 5 | 4,14826.5,14297.1,16887.2,17305.6
 6 | 5,13933.5,13237.6,16095.4,16129.2
 7 | 6,14040.4,13570.4,16396.8,16377.8
 8 | 7,14475.7,14002.9,16855.3,16973.7
 9 | 8,14481.9,13997.3,16971.6,17117.9
10 | 9,14468.1,14044.4,16981.6,17110.5
11 | 10,14500.2,14093.8,16981.9,17087.9
12 | 


--------------------------------------------------------------------------------
/remote_PMDK/noHT_Socket1_CXLDAX_Close_noFT_@mnt@pmem2_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,2449.3,2326.4,2174.6,2260.7
 3 | 2,4154.7,3995.4,3700.7,3902.9
 4 | 3,5589.5,5360.6,4829.4,5170.1
 5 | 4,6514.4,6214.5,5897.5,6224.2
 6 | 5,7174.2,6882.2,6811.2,7167.6
 7 | 6,7662.5,7345.1,7453.8,7768.2
 8 | 7,7946.9,7723.7,7944.8,8199.6
 9 | 8,8098.0,7933.2,8404.8,8625.4
10 | 9,8232.6,7979.2,8627.5,8822.1
11 | 10,8273.8,8148.9,8814.3,8994.2
12 | 


--------------------------------------------------------------------------------
/remote_PMDK_affinity/noHT_Socket0Socket1_CXLDAX_Close_noFT_@mnt@pmem2_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,3949.1,3812.0,3881.6,3916.9
 3 | 2,6793.4,6622.6,6873.9,6817.3
 4 | 3,8195.0,8109.0,8442.1,8368.0
 5 | 4,8568.4,8348.4,9234.9,9013.4
 6 | 5,8051.0,7390.4,8867.4,8753.9
 7 | 6,8287.3,7672.1,9037.1,8893.2
 8 | 7,8618.3,8045.4,9160.7,9011.4
 9 | 8,8744.9,8392.2,9390.7,9298.7
10 | 9,8827.5,8419.8,9504.1,9349.9
11 | 10,8773.6,8430.1,9529.1,9328.8
12 | 11,8870.3,8558.1,8607.5,8844.9
13 | 12,8888.4,8570.4,8885.8,8980.1
14 | 13,8914.8,8627.0,9037.2,9075.3
15 | 14,8894.0,8627.1,9109.9,9093.0
16 | 15,8961.7,8672.6,9285.9,9255.9
17 | 16,8994.3,8700.6,9400.4,9325.8
18 | 17,8958.2,8697.3,9389.1,9352.4
19 | 18,8994.5,8729.2,9553.8,9458.8
20 | 19,8996.5,8751.4,9535.3,9463.8
21 | 20,9024.1,8737.2,9596.8,9513.8
22 | 


--------------------------------------------------------------------------------
/remote_PMDK_affinity/noHT_Socket0Socket1_CXLDAX_Spread_noFT_@mnt@pmem2_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,3983.3,3812.2,3897.5,3931.7
 3 | 2,4512.6,4314.8,3695.1,4040.5
 4 | 3,5946.5,5700.5,4616.5,5285.8
 5 | 4,6982.0,6725.3,6000.1,6411.6
 6 | 5,7757.5,7577.7,7016.4,7390.5
 7 | 6,8221.7,7949.7,7708.1,8027.0
 8 | 7,8413.1,8161.6,8097.1,8335.5
 9 | 8,8557.5,8312.3,8472.5,8678.0
10 | 9,8681.2,8441.4,8760.0,8913.0
11 | 10,8817.1,8582.4,8932.7,9035.7
12 | 11,8875.4,8585.2,9137.0,9171.2
13 | 12,8935.9,8619.6,9239.4,9292.6
14 | 13,8941.1,8614.7,9282.3,9342.0
15 | 14,8788.8,8668.0,9325.2,9334.4
16 | 15,8840.6,8692.5,9440.2,9475.0
17 | 16,8975.4,8685.9,9492.5,9437.1
18 | 17,8985.6,8689.9,9495.4,9443.8
19 | 18,9010.4,8672.3,9558.7,9467.0
20 | 19,9005.5,8687.5,9588.5,9511.7
21 | 20,9046.5,8705.3,9619.0,9496.7
22 | 


--------------------------------------------------------------------------------
/remote_PMDK_affinity/noHT_Socket0Socket1_Socket0DDR5DAX_Close_noFT_@mnt@pmem0_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,8181.5,6714.7,6517.6,6876.5
 3 | 2,14955.0,12813.4,12899.0,13511.0
 4 | 3,18813.1,16876.4,18345.7,19094.7
 5 | 4,20870.0,19344.2,22251.9,22646.0
 6 | 5,19639.9,18214.1,21896.2,21598.3
 7 | 6,20079.1,18945.6,22303.6,21892.8
 8 | 7,20867.6,19733.1,22795.1,22583.4
 9 | 8,20940.2,19863.9,22863.0,22645.8
10 | 9,20983.1,20095.5,22989.2,22720.2
11 | 10,21141.1,20142.7,23095.3,22888.5
12 | 11,21777.3,20798.7,23856.9,23675.4
13 | 12,21118.0,20085.4,23262.6,23080.7
14 | 13,20523.8,19681.2,22812.8,22649.2
15 | 14,19967.8,19294.3,22381.6,22294.3
16 | 15,19446.3,18904.3,21940.3,21968.1
17 | 16,18962.6,18570.5,21549.2,21648.9
18 | 17,18500.7,18209.1,21149.0,21371.5
19 | 18,18115.7,17873.3,20854.3,21133.8
20 | 19,17836.4,17627.5,20590.6,20920.9
21 | 20,17630.6,17376.8,20396.3,20710.4
22 | 


--------------------------------------------------------------------------------
/remote_PMDK_affinity/noHT_Socket0Socket1_Socket0DDR5DAX_Spread_noFT_@mnt@pmem0_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,8207.1,6718.1,6526.3,6876.9
 3 | 2,12790.8,11254.0,11726.3,12348.9
 4 | 3,16493.1,15207.8,17011.6,17323.6
 5 | 4,17405.9,16263.5,18894.2,19453.0
 6 | 5,18152.7,16616.6,20438.0,20608.1
 7 | 6,17408.1,16344.9,20129.7,20296.0
 8 | 7,18124.7,17116.2,20715.9,20951.5
 9 | 8,17298.8,16788.4,20065.2,20330.7
10 | 9,16892.6,16285.3,19795.3,19946.6
11 | 10,17508.1,17027.4,20359.5,20591.0
12 | 11,17069.4,16621.3,19930.9,20121.9
13 | 12,16967.7,16383.7,19681.8,19780.8
14 | 13,16699.5,16166.0,19435.6,19544.0
15 | 14,16625.6,16069.1,19259.3,19343.0
16 | 15,16518.2,15968.7,19137.5,19224.8
17 | 16,16693.1,16212.1,19373.1,19515.8
18 | 17,16792.6,16412.9,19508.5,19754.9
19 | 18,17138.5,16800.9,19870.5,20139.1
20 | 19,17393.6,17119.7,20140.5,20434.7
21 | 20,17660.0,17386.0,20434.1,20697.9
22 | 


--------------------------------------------------------------------------------
/remote_PMDK_affinity/noHT_Socket0Socket1_Socket1DDR5DAX_Close_noFT_@mnt@pmem1_Arrays100000000_Cores10/output_data.csv:
--------------------------------------------------------------------------------
 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
 2 | 1,7238.1,6113.1,5958.8,6219.7
 3 | 2,11978.1,10644.7,11509.1,12046.7
 4 | 3,14044.8,12981.4,15139.5,15782.6
 5 | 4,14839.6,14301.3,16896.4,17310.3
 6 | 5,14997.1,14722.0,17343.2,17737.5
 7 | 6,14940.0,14795.8,17513.4,17735.1
 8 | 7,14139.3,13780.6,16594.4,16579.6
 9 | 8,14181.0,13783.1,16812.4,16797.0
10 | 9,14229.2,13822.5,16768.6,16802.8
11 | 10,14295.0,13806.7,16721.0,16808.7
12 | 11,15245.8,14545.9,17698.1,17680.2
13 | 12,15475.0,14822.8,17973.5,18062.5
14 | 13,15717.9,15088.8,18226.9,18393.9
15 | 14,15960.4,15356.9,18482.8,18681.4
16 | 15,16221.1,15631.7,18797.3,19034.5
17 | 16,16524.1,15946.6,19088.4,19347.5
18 | 17,16685.8,16281.7,19310.3,19661.2
19 | 18,16816.3,16459.4,19435.2,19928.9
20 | 19,17164.3,16916.0,19811.4,20264.8
21 | 20,17396.8,17102.0,19949.1,20324.4
22 | 


--------------------------------------------------------------------------------