├── README.md ├── RUNner.py ├── STREAMer.py ├── local_PMDK ├── local_PMDK_Add_plot.svg ├── local_PMDK_Copy_plot.svg ├── local_PMDK_Scale_plot.svg ├── local_PMDK_Triad_plot.svg └── noHT_Socket0_Socket0DDR5DAX_Close_noFT_@mnt@pmem0_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.2.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── noHT_Socket0Socket1_Socket1DDR5DAX_Spread_noFT_@mnt@pmem1_Arrays100000000_Cores10 ├── output.1.txt ├── output.10.txt ├── output.11.txt ├── output.12.txt ├── output.13.txt ├── output.14.txt ├── output.15.txt ├── output.16.txt ├── output.17.txt ├── output.18.txt ├── output.19.txt ├── output.2.txt ├── output.20.txt ├── output.3.txt ├── output.4.txt ├── output.5.txt ├── output.6.txt ├── output.7.txt ├── output.8.txt ├── output.9.txt └── output_data.csv ├── plot_groups.py ├── plot_results.py ├── remote_NUMA ├── noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.2.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.2.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.2.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.2.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── 
output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── remote_NUMA_Add_plot.svg ├── remote_NUMA_Copy_plot.svg ├── remote_NUMA_Scale_plot.svg └── remote_NUMA_Triad_plot.svg ├── remote_NUMA_allcores ├── noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.11.txt │ ├── output.12.txt │ ├── output.13.txt │ ├── output.14.txt │ ├── output.15.txt │ ├── output.16.txt │ ├── output.17.txt │ ├── output.18.txt │ ├── output.19.txt │ ├── output.2.txt │ ├── output.20.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.11.txt │ ├── output.12.txt │ ├── output.13.txt │ ├── output.14.txt │ ├── output.15.txt │ ├── output.16.txt │ ├── output.17.txt │ ├── output.18.txt │ ├── output.19.txt │ ├── output.2.txt │ ├── output.20.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.11.txt │ ├── output.12.txt │ ├── output.13.txt │ ├── output.14.txt │ ├── output.15.txt │ ├── output.16.txt │ ├── output.17.txt │ ├── output.18.txt │ ├── output.19.txt │ ├── output.2.txt │ ├── output.20.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.11.txt │ ├── output.12.txt │ ├── output.13.txt │ ├── output.14.txt │ ├── output.15.txt │ ├── output.16.txt │ ├── output.17.txt │ ├── 
output.18.txt │ ├── output.19.txt │ ├── output.2.txt │ ├── output.20.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── noHT_Socket0Socket1_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.11.txt │ ├── output.12.txt │ ├── output.13.txt │ ├── output.14.txt │ ├── output.15.txt │ ├── output.16.txt │ ├── output.17.txt │ ├── output.18.txt │ ├── output.19.txt │ ├── output.2.txt │ ├── output.20.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── remote_NUMA_allcores_Add_plot.svg ├── remote_NUMA_allcores_Copy_plot.svg ├── remote_NUMA_allcores_Scale_plot.svg └── remote_NUMA_allcores_Triad_plot.svg ├── remote_PMDK ├── noHT_Socket0_CXLDAX_Close_noFT_@mnt@pmem2_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.2.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── noHT_Socket0_Socket1DDR5DAX_Close_noFT_@mnt@pmem1_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.2.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── noHT_Socket1_CXLDAX_Close_noFT_@mnt@pmem2_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.2.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── remote_PMDK_Add_plot.svg ├── remote_PMDK_Copy_plot.svg ├── remote_PMDK_Scale_plot.svg └── remote_PMDK_Triad_plot.svg ├── remote_PMDK_affinity ├── noHT_Socket0Socket1_CXLDAX_Close_noFT_@mnt@pmem2_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── 
output.10.txt │ ├── output.11.txt │ ├── output.12.txt │ ├── output.13.txt │ ├── output.14.txt │ ├── output.15.txt │ ├── output.16.txt │ ├── output.17.txt │ ├── output.18.txt │ ├── output.19.txt │ ├── output.2.txt │ ├── output.20.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── noHT_Socket0Socket1_CXLDAX_Spread_noFT_@mnt@pmem2_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.11.txt │ ├── output.12.txt │ ├── output.13.txt │ ├── output.14.txt │ ├── output.15.txt │ ├── output.16.txt │ ├── output.17.txt │ ├── output.18.txt │ ├── output.19.txt │ ├── output.2.txt │ ├── output.20.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── noHT_Socket0Socket1_Socket0DDR5DAX_Close_noFT_@mnt@pmem0_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.11.txt │ ├── output.12.txt │ ├── output.13.txt │ ├── output.14.txt │ ├── output.15.txt │ ├── output.16.txt │ ├── output.17.txt │ ├── output.18.txt │ ├── output.19.txt │ ├── output.2.txt │ ├── output.20.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── noHT_Socket0Socket1_Socket0DDR5DAX_Spread_noFT_@mnt@pmem0_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt │ ├── output.11.txt │ ├── output.12.txt │ ├── output.13.txt │ ├── output.14.txt │ ├── output.15.txt │ ├── output.16.txt │ ├── output.17.txt │ ├── output.18.txt │ ├── output.19.txt │ ├── output.2.txt │ ├── output.20.txt │ ├── output.3.txt │ ├── output.4.txt │ ├── output.5.txt │ ├── output.6.txt │ ├── output.7.txt │ ├── output.8.txt │ ├── output.9.txt │ └── output_data.csv ├── noHT_Socket0Socket1_Socket1DDR5DAX_Close_noFT_@mnt@pmem1_Arrays100000000_Cores10 │ ├── output.1.txt │ ├── output.10.txt 
# ---- repository dump: tail of the directory-tree listing (kept verbatim) ----
# │ ├── output.11.txt
# │ ├── output.12.txt
# │ ├── output.13.txt
# │ ├── output.14.txt
# │ ├── output.15.txt
# │ ├── output.16.txt
# │ ├── output.17.txt
# │ ├── output.18.txt
# │ ├── output.19.txt
# │ ├── output.2.txt
# │ ├── output.20.txt
# │ ├── output.3.txt
# │ ├── output.4.txt
# │ ├── output.5.txt
# │ ├── output.6.txt
# │ ├── output.7.txt
# │ ├── output.8.txt
# │ ├── output.9.txt
# │ └── output_data.csv
# ├── remote_PMDK_affinity_Add_plot.svg
# ├── remote_PMDK_affinity_Copy_plot.svg
# ├── remote_PMDK_affinity_Scale_plot.svg
# └── remote_PMDK_affinity_Triad_plot.svg
# ├── script_stream_run.sh
# ├── stream.c
# └── stream_pmemobj.c
#
# ---- /RUNner.py ----
"""Launch STREAMer.py once for every combination of its option groups.

One flag is taken from each group in ``exclusive_options`` (a Cartesian
product).  Whenever the chosen memory flag is one of ``dax_options``, the
combination is launched once per entry of ``dax_path_options`` with
``--DAX_Path <path>`` appended; otherwise it is launched once without it.
"""
import itertools
import subprocess
import sys

# Groups of mutually exclusive flags: exactly one flag per group is used in a run.
exclusive_options = [
    ["--noHT", "--HT"],
    ["--Socket0", "--Socket1", "--Socket0Socket1"],
    ["--Socket0DDR4", "--Socket1DDR4", "--CXLDDR4", "--Socket0DDR5", "--Socket1DDR5",
     "--CXLDAX", "--Socket0DDR4DAX", "--Socket1DDR4DAX", "--Socket0DDR5DAX",
     "--Socket1DDR5DAX", "--Socket0OptaneDAX", "--Socket1OptaneDAX"],
    ["--Close", "--Spread"],
    ["--noFT", "--FT"],
]

# Memory flags that additionally require the DAX path flag below.
dax_options = ["--CXLDAX", "--Socket0DDR4DAX", "--Socket1DDR4DAX", "--Socket0DDR5DAX",
               "--Socket1DDR5DAX", "--Socket0OptaneDAX", "--Socket1OptaneDAX"]
dax_flag = "--DAX_Path"

# Every DAX-capable combination is tried against each of these paths.
dax_path_options = ["/mnt/pmem0", "/mnt/pmem1", "/mnt/pmem2"]

# Despite the historical name this is the Cartesian product of the groups
# (one flag per group), not a set of permutations.
permutations = list(itertools.product(*exclusive_options))


def build_commands(interpreter=None, script="STREAMer.py"):
    """Return the argv lists for every STREAMer.py invocation, in launch order.

    Args:
        interpreter: Python executable used to run ``script``.  Defaults to
            the interpreter running this file (``sys.executable``), falling
            back to ``"python3"`` if that is unavailable.
        script: Path of the script to launch.

    Returns:
        A list of argv lists.  Combinations are ordered as produced by
        ``itertools.product``; a DAX combination expands in place into one
        command per entry of ``dax_path_options``.
    """
    launcher = interpreter or sys.executable or "python3"
    commands = []
    for perm in permutations:
        base = [launcher, script] + list(perm)
        if any(option in perm for option in dax_options):
            # DAX-related flag present: one run per DAX path.
            for dax_path_option in dax_path_options:
                commands.append(base + [dax_flag, dax_path_option])
        else:
            commands.append(base)
    return commands


def main():
    """Run every command sequentially and return the number of failed runs.

    A run that exits with a non-zero status is reported on stderr and the
    sweep continues with the next command; nothing is raised.
    """
    failures = 0
    for command in build_commands():
        completed = subprocess.run(command)
        if completed.returncode != 0:
            failures += 1
            print("warning: exit status {} from: {}".format(
                completed.returncode, " ".join(command)), file=sys.stderr)
    if failures:
        print("warning: {} run(s) exited with a non-zero status".format(failures),
              file=sys.stderr)
    return failures


# Guarded so that importing this module does not start the whole sweep.
if __name__ == "__main__":
    main()

# ---- /local_PMDK/noHT_Socket0_Socket0DDR5DAX_Close_noFT_@mnt@pmem0_Arrays100000000_Cores10/output_data.csv (data kept verbatim) ----
# Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
# 1,8140.2,6716.8,6521.1,6852.5
# 2,14915.5,12815.3,12892.1,13532.9
# 3,18794.8,16889.2,18332.1,19110.7
# 4,20983.6,19371.2,22218.3,22664.2
# 5,19789.2,18180.4,21874.1,21589.5
# 6,20115.4,18963.6,22291.4,21911.0
# 7,20865.1,19722.4,22806.3,22587.4
# 8,21261.6,20310.2,23367.9,23111.4
# 9,21320.8,20406.1,23399.6,23140.6
# 10,21356.9,20464.0,23407.1,23183.5
#
# ---- /noHT_Socket0Socket1_Socket1DDR5DAX_Spread_noFT_@mnt@pmem1_Arrays100000000_Cores10/output_data.csv (data kept verbatim) ----
# Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s)
# 1,7208.4,6118.5,5971.5,6317.1
# 2,12931.4,11357.7,11787.8,12397.1
# 3,15356.7,13953.2,15963.0,16731.1
# 4,17427.1,16382.5,19065.2,19593.0
# 5,17111.9,16802.0,19584.8,20058.8
# 6,17846.4,17700.3,20764.7,21041.9
# 7,16260.1,15610.7,18711.2,19033.9
# 8,16938.8,16465.4,19569.5,19843.9
# 9,17600.6,17175.9,20260.9,20559.0
# 10,17057.2,16702.8,19803.0,20253.2
# 11,17647.2,17284.2,20196.7,20634.7
# 12,18107.8,17765.5,20600.7,20962.3
# 13,18444.5,18121.1,20941.1,21242.5
# 14,18821.6,18463.4,21235.5,21503.6
# 15,19104.0,18577.9,21509.0,21702.8
# 16,18735.5,18324.2,21180.5,21453.1
# 17,18258.2,17902.1,20740.1,21177.1
# 18,17882.5,17792.8,20484.3,20945.3
# 19,17652.1,17522.0,20267.0,20712.7
# 20,17525.2,17324.9,20108.2,20595.2
# ---- /plot_results.py ----
import csv
import matplotlib.pyplot as plt
import sys


def plot(path):
    """Plot the four rate columns of ``<path>/output_data.csv`` against thread count.

    The CSV is expected to have a header row followed by rows of
    ``threads, copy, scale, add, triad``.  The figure is written to
    ``<path>/graph_results.svg`` and then displayed with ``plt.show()``.

    Args:
        path: Directory containing ``output_data.csv``; the SVG is written
            into the same directory.

    Raises:
        OSError: If the CSV cannot be opened or the SVG cannot be written.
        ValueError, IndexError: If a non-empty data row is malformed.
    """
    # One list per CSV column, filled in file order.
    number_of_threads = []
    copy_rate = []
    scale_rate = []
    add_rate = []
    triad_rate = []

    # newline='' is what the csv module expects for files it reads.
    with open(path+'/output_data.csv', 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip header row (tolerates an empty file)
        for row in reader:
            if not row:
                # Blank line (e.g. trailing newline at end of file): ignore.
                continue
            number_of_threads.append(int(row[0]))
            copy_rate.append(float(row[1]))
            scale_rate.append(float(row[2]))
            add_rate.append(float(row[3]))
            triad_rate.append(float(row[4]))

    # Use a dedicated figure so repeated calls in one process do not draw on
    # top of each other, and always release it afterwards.
    fig, ax = plt.subplots()
    try:
        ax.plot(number_of_threads, copy_rate, label='Copy Rate')
        ax.plot(number_of_threads, scale_rate, label='Scale Rate')
        ax.plot(number_of_threads, add_rate, label='Add Rate')
        ax.plot(number_of_threads, triad_rate, label='Triad Rate')

        ax.set_xlabel('Number of Threads')
        ax.set_ylabel('Rate (MB/s)')
        ax.legend()
        fig.tight_layout()

        # Save the plot as an SVG image next to the CSV.
        fig.savefig(path+'/graph_results.svg', format='svg')
        print("plot saved in " + path + "/graph_results.svg")

        # Display the plot
        plt.show()
    finally:
        plt.close(fig)


if __name__ == "__main__":
    # Exactly one argument is required: the directory holding output_data.csv.
    if len(sys.argv) != 2:
        print("Need to provide relative path to directory with csv file")
        sys.exit(1)

    path = sys.argv[1]
    plot(path)
/remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 1 14 | Number of Threads counted = 1 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 227669 microseconds. 18 | (= 227669 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 4177.7 0.383159 0.382985 0.383744 28 | Scale: 4453.8 0.359574 0.359247 0.360168 29 | Add: 4638.5 0.517660 0.517408 0.517927 30 | Triad: 4678.9 0.513146 0.512946 0.513394 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 10 14 | Number of Threads counted = 10 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 118813 microseconds. 18 | (= 118813 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 
23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9743.6 0.165786 0.164210 0.168108 28 | Scale: 9647.6 0.166856 0.165844 0.168576 29 | Add: 11230.2 0.214807 0.213710 0.217668 30 | Triad: 11130.2 0.217182 0.215629 0.217890 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 2 14 | Number of Threads counted = 2 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 160424 microseconds. 18 | (= 160424 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 7367.8 0.217990 0.217162 0.218728 28 | Scale: 7741.0 0.207786 0.206692 0.209875 29 | Add: 8155.2 0.295132 0.294291 0.296964 30 | Triad: 8135.7 0.295597 0.294995 0.296586 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 3 14 | Number of Threads counted = 3 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 131028 microseconds. 
18 | (= 131028 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9006.1 0.179122 0.177658 0.180951 28 | Scale: 9311.2 0.172805 0.171836 0.174209 29 | Add: 10237.7 0.235379 0.234428 0.237341 30 | Triad: 10253.5 0.234449 0.234067 0.234923 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 
12 | ------------------------------------------------------------- 13 | Number of Threads requested = 4 14 | Number of Threads counted = 4 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 119729 microseconds. 18 | (= 119729 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9638.0 0.168352 0.166010 0.170339 28 | Scale: 9771.8 0.164892 0.163737 0.166000 29 | Add: 11000.1 0.219277 0.218179 0.221557 30 | Triad: 11093.6 0.217202 0.216340 0.217835 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 
10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 5 14 | Number of Threads counted = 5 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 121135 microseconds. 18 | (= 121135 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9687.0 0.167617 0.165169 0.171813 28 | Scale: 9851.2 0.163980 0.162417 0.165658 29 | Add: 11304.2 0.213760 0.212310 0.216574 30 | Triad: 11297.5 0.213347 0.212436 0.214055 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 
8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 6 14 | Number of Threads counted = 6 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 118418 microseconds. 18 | (= 118418 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9630.7 0.167710 0.166135 0.169061 28 | Scale: 9676.3 0.166171 0.165353 0.168232 29 | Add: 11261.0 0.214420 0.213125 0.216631 30 | Triad: 11229.0 0.215100 0.213733 0.215776 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 7 14 | Number of Threads counted = 7 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 122730 microseconds. 18 | (= 122730 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9543.9 0.169305 0.167646 0.171809 28 | Scale: 9496.2 0.169338 0.168489 0.170688 29 | Add: 11143.8 0.216745 0.215366 0.219450 30 | Triad: 11069.2 0.218164 0.216817 0.219454 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 8 14 | Number of Threads counted = 8 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 119171 microseconds. 18 | (= 119171 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 
23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9576.4 0.168928 0.167078 0.171079 28 | Scale: 9610.1 0.167281 0.166492 0.168647 29 | Add: 11169.2 0.216210 0.214877 0.218697 30 | Triad: 11054.9 0.218647 0.217098 0.219202 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 9 14 | Number of Threads counted = 9 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 119756 microseconds. 18 | (= 119756 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9764.8 0.166471 0.163854 0.169869 28 | Scale: 9585.1 0.168336 0.166925 0.170970 29 | Add: 11196.5 0.216214 0.214352 0.218981 30 | Triad: 11063.4 0.218335 0.216931 0.219295 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,4177.7,4453.8,4638.5,4678.9 3 | 2,7367.8,7741.0,8155.2,8135.7 4 | 3,9006.1,9311.2,10237.7,10253.5 5 | 4,9638.0,9771.8,11000.1,11093.6 6 | 5,9687.0,9851.2,11304.2,11297.5 7 | 6,9630.7,9676.3,11261.0,11229.0 8 | 7,9543.9,9496.2,11143.8,11069.2 9 | 8,9576.4,9610.1,11169.2,11054.9 10 | 9,9764.8,9585.1,11196.5,11063.4 11 | 10,9743.6,9647.6,11230.2,11130.2 12 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 1 14 | Number of Threads counted = 1 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 142261 microseconds. 18 | (= 142261 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 7649.2 0.209402 0.209173 0.209594 28 | Scale: 7111.8 0.225200 0.224978 0.225500 29 | Add: 7886.9 0.304502 0.304302 0.304624 30 | Triad: 7170.9 0.334785 0.334685 0.335058 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 10 14 | Number of Threads counted = 10 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 147850 microseconds. 18 | (= 147850 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 
23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 8938.4 0.179859 0.179002 0.181012 28 | Scale: 6706.9 0.240479 0.238561 0.242180 29 | Add: 8953.6 0.269101 0.268050 0.270377 30 | Triad: 7165.7 0.336927 0.334930 0.339037 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 2 14 | Number of Threads counted = 2 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 135459 microseconds. 18 | (= 135459 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9145.3 0.175240 0.174954 0.175798 28 | Scale: 7527.0 0.214455 0.212568 0.215430 29 | Add: 9131.0 0.263540 0.262842 0.264041 30 | Triad: 7965.5 0.303535 0.301298 0.307011 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 3 14 | Number of Threads counted = 3 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 133390 microseconds. 
18 | (= 133390 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9456.7 0.169369 0.169193 0.169611 28 | Scale: 7430.5 0.216405 0.215329 0.217398 29 | Add: 9226.2 0.260533 0.260129 0.260752 30 | Triad: 7730.6 0.314205 0.310453 0.317681 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 
12 | ------------------------------------------------------------- 13 | Number of Threads requested = 4 14 | Number of Threads counted = 4 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 135694 microseconds. 18 | (= 135694 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9273.9 0.172736 0.172527 0.172923 28 | Scale: 7443.6 0.217441 0.214949 0.219167 29 | Add: 9200.9 0.261867 0.260844 0.263139 30 | Triad: 7643.4 0.315713 0.313995 0.318028 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 
10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 5 14 | Number of Threads counted = 5 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 138097 microseconds. 18 | (= 138097 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9235.1 0.173603 0.173252 0.174022 28 | Scale: 7339.2 0.218982 0.218007 0.220910 29 | Add: 9121.8 0.263477 0.263107 0.264479 30 | Triad: 7643.1 0.315631 0.314008 0.317526 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 
8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 6 14 | Number of Threads counted = 6 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 138486 microseconds. 18 | (= 138486 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9196.8 0.174216 0.173974 0.174672 28 | Scale: 7246.1 0.223006 0.220809 0.224585 29 | Add: 9149.7 0.262754 0.262305 0.264823 30 | Triad: 7540.7 0.319968 0.318274 0.321880 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 7 14 | Number of Threads counted = 7 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 144020 microseconds. 18 | (= 144020 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9114.2 0.176091 0.175550 0.177001 28 | Scale: 6978.2 0.234265 0.229286 0.237691 29 | Add: 8585.9 0.285256 0.279529 0.289700 30 | Triad: 6907.6 0.355230 0.347442 0.362309 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 8 14 | Number of Threads counted = 8 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 145522 microseconds. 18 | (= 145522 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 
23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9050.8 0.177671 0.176780 0.179603 28 | Scale: 6931.6 0.233437 0.230828 0.236525 29 | Add: 8958.8 0.268808 0.267893 0.270326 30 | Triad: 7357.1 0.326778 0.326217 0.327710 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 9 14 | Number of Threads counted = 9 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 150119 microseconds. 18 | (= 150119 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9035.2 0.177318 0.177085 0.177999 28 | Scale: 6813.4 0.236120 0.234831 0.238476 29 | Add: 8982.7 0.268134 0.267179 0.268993 30 | Triad: 7297.4 0.331060 0.328883 0.332905 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,7649.2,7111.8,7886.9,7170.9 3 | 2,9145.3,7527.0,9131.0,7965.5 4 | 3,9456.7,7430.5,9226.2,7730.6 5 | 4,9273.9,7443.6,9200.9,7643.4 6 | 5,9235.1,7339.2,9121.8,7643.1 7 | 6,9196.8,7246.1,9149.7,7540.7 8 | 7,9114.2,6978.2,8585.9,6907.6 9 | 8,9050.8,6931.6,8958.8,7357.1 10 | 9,9035.2,6813.4,8982.7,7297.4 11 | 10,8938.4,6706.9,8953.6,7165.7 12 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 1 14 | Number of Threads counted = 1 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 147510 microseconds. 18 | (= 147510 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 8122.2 0.197648 0.196991 0.198985 28 | Scale: 8258.0 0.194080 0.193752 0.194539 29 | Add: 9051.4 0.266801 0.265151 0.274405 30 | Triad: 8948.6 0.269025 0.268199 0.270529 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 10 14 | Number of Threads counted = 10 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 84199 microseconds. 18 | (= 84199 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 
23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 14735.2 0.108968 0.108583 0.109307 28 | Scale: 14633.4 0.110203 0.109339 0.111848 29 | Add: 17564.1 0.137254 0.136642 0.138523 30 | Triad: 17594.5 0.136560 0.136406 0.136818 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 2 14 | Number of Threads counted = 2 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 90823 microseconds. 18 | (= 90823 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 12673.4 0.126774 0.126249 0.128944 28 | Scale: 12767.9 0.125896 0.125314 0.127201 29 | Add: 14735.1 0.163475 0.162876 0.164560 30 | Triad: 14830.3 0.162153 0.161831 0.162724 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 3 14 | Number of Threads counted = 3 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 82517 microseconds. 
18 | (= 82517 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 14437.9 0.110902 0.110819 0.110989 28 | Scale: 14506.4 0.110808 0.110296 0.111780 29 | Add: 17170.3 0.140238 0.139776 0.141177 30 | Triad: 17322.0 0.138728 0.138552 0.139019 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 
12 | ------------------------------------------------------------- 13 | Number of Threads requested = 4 14 | Number of Threads counted = 4 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 80292 microseconds. 18 | (= 80292 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 15012.9 0.106614 0.106575 0.106645 28 | Scale: 14980.4 0.107572 0.106806 0.109009 29 | Add: 17824.5 0.135150 0.134646 0.136118 30 | Triad: 17974.8 0.133713 0.133520 0.133924 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 
10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 5 14 | Number of Threads counted = 5 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 84152 microseconds. 18 | (= 84152 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 15036.0 0.106631 0.106411 0.106852 28 | Scale: 14909.5 0.107977 0.107314 0.109335 29 | Add: 17837.0 0.135107 0.134552 0.136241 30 | Triad: 17866.3 0.134406 0.134331 0.134471 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 
8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 6 14 | Number of Threads counted = 6 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 85071 microseconds. 18 | (= 85071 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 14928.4 0.107416 0.107178 0.107723 28 | Scale: 14797.5 0.108833 0.108126 0.110281 29 | Add: 17739.3 0.135854 0.135293 0.137124 30 | Triad: 17783.4 0.135152 0.134957 0.135424 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 7 14 | Number of Threads counted = 7 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 83728 microseconds. 18 | (= 83728 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 14848.8 0.108092 0.107753 0.109187 28 | Scale: 14691.9 0.109851 0.108904 0.113299 29 | Add: 17698.3 0.136251 0.135606 0.137606 30 | Triad: 17723.0 0.135560 0.135417 0.135799 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 8 14 | Number of Threads counted = 8 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 84120 microseconds. 18 | (= 84120 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 
23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 14778.1 0.108549 0.108268 0.108815 28 | Scale: 14688.9 0.109721 0.108926 0.111354 29 | Add: 17645.1 0.136675 0.136015 0.137922 30 | Triad: 17677.3 0.136146 0.135767 0.137189 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 9 14 | Number of Threads counted = 9 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 84061 microseconds. 18 | (= 84061 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 14765.1 0.108689 0.108364 0.109062 28 | Scale: 14669.5 0.109852 0.109070 0.111517 29 | Add: 17589.3 0.137128 0.136447 0.138518 30 | Triad: 17636.8 0.136229 0.136079 0.136503 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket0_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,8122.2,8258.0,9051.4,8948.6 3 | 2,12673.4,12767.9,14735.1,14830.3 4 | 3,14437.9,14506.4,17170.3,17322.0 5 | 4,15012.9,14980.4,17824.5,17974.8 6 | 5,15036.0,14909.5,17837.0,17866.3 7 | 6,14928.4,14797.5,17739.3,17783.4 8 | 7,14848.8,14691.9,17698.3,17723.0 9 | 8,14778.1,14688.9,17645.1,17677.3 10 | 9,14765.1,14669.5,17589.3,17636.8 11 | 10,14735.2,14633.4,17564.1,17594.5 12 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 1 14 | Number of Threads counted = 1 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 434396 microseconds. 18 | (= 434396 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 2450.4 0.654293 0.652952 0.655779 28 | Scale: 2602.0 0.615969 0.614914 0.617592 29 | Add: 2848.5 0.843352 0.842535 0.844672 30 | Triad: 2864.9 0.838663 0.837733 0.839957 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 10 14 | Number of Threads counted = 10 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 136446 microseconds. 18 | (= 136446 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 
23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 8626.4 0.186404 0.185478 0.188040 28 | Scale: 8785.9 0.182599 0.182109 0.183500 29 | Add: 10370.5 0.231867 0.231425 0.232534 30 | Triad: 10348.9 0.232330 0.231909 0.232666 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 2 14 | Number of Threads counted = 2 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 238327 microseconds. 18 | (= 238327 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 4528.1 0.357132 0.353352 0.358273 28 | Scale: 4746.5 0.340201 0.337094 0.341245 29 | Add: 5218.3 0.463050 0.459918 0.464174 30 | Triad: 5222.0 0.461689 0.459593 0.462247 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 3 14 | Number of Threads counted = 3 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 210314 microseconds. 
18 | (= 210314 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 5961.1 0.269586 0.268406 0.271293 28 | Scale: 6167.7 0.260234 0.259418 0.261446 29 | Add: 6966.9 0.345178 0.344487 0.346143 30 | Triad: 6925.5 0.347255 0.346547 0.348450 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 
12 | ------------------------------------------------------------- 13 | Number of Threads requested = 4 14 | Number of Threads counted = 4 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 176455 microseconds. 18 | (= 176455 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 6949.1 0.231458 0.230245 0.232467 28 | Scale: 7138.8 0.224621 0.224126 0.225022 29 | Add: 8244.9 0.291539 0.291089 0.291957 30 | Triad: 8198.2 0.293055 0.292747 0.293226 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 
10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 5 14 | Number of Threads counted = 5 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 160072 microseconds. 18 | (= 160072 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 7646.4 0.210942 0.209248 0.211725 28 | Scale: 7804.5 0.205275 0.205010 0.205561 29 | Add: 9089.8 0.264554 0.264032 0.265015 30 | Triad: 9037.3 0.265860 0.265567 0.266212 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 
8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 6 14 | Number of Threads counted = 6 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 148901 microseconds. 18 | (= 148901 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 8192.3 0.196060 0.195306 0.197153 28 | Scale: 8439.6 0.190439 0.189583 0.191185 29 | Add: 9769.0 0.246163 0.245674 0.246740 30 | Triad: 9730.3 0.247081 0.246652 0.247457 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 7 14 | Number of Threads counted = 7 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 143800 microseconds. 18 | (= 143800 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 8389.2 0.191643 0.190722 0.192348 28 | Scale: 8582.5 0.187288 0.186427 0.188239 29 | Add: 10042.0 0.239348 0.238995 0.239802 30 | Triad: 10030.9 0.239512 0.239260 0.239790 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 8 14 | Number of Threads counted = 8 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 140364 microseconds. 18 | (= 140364 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 
23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 8508.6 0.188757 0.188045 0.189753 28 | Scale: 8704.8 0.184343 0.183806 0.185229 29 | Add: 10262.2 0.234329 0.233869 0.234844 30 | Triad: 10245.3 0.234708 0.234253 0.234987 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 9 14 | Number of Threads counted = 9 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 138659 microseconds. 18 | (= 138659 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 8593.0 0.186830 0.186198 0.188966 28 | Scale: 8747.7 0.183420 0.182906 0.184697 29 | Add: 10309.7 0.233122 0.232790 0.233643 30 | Triad: 10300.9 0.233343 0.232989 0.233806 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA/noHT_Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,2450.4,2602.0,2848.5,2864.9 3 | 2,4528.1,4746.5,5218.3,5222.0 4 | 3,5961.1,6167.7,6966.9,6925.5 5 | 4,6949.1,7138.8,8244.9,8198.2 6 | 5,7646.4,7804.5,9089.8,9037.3 7 | 6,8192.3,8439.6,9769.0,9730.3 8 | 7,8389.2,8582.5,10042.0,10030.9 9 | 8,8508.6,8704.8,10262.2,10245.3 10 | 9,8593.0,8747.7,10309.7,10300.9 11 | 10,8626.4,8785.9,10370.5,10348.9 12 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 1 14 | Number of Threads counted = 1 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 227902 microseconds. 18 | (= 227902 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 4179.1 0.383256 0.382854 0.383994 28 | Scale: 4453.5 0.359587 0.359269 0.360131 29 | Add: 4636.2 0.517811 0.517666 0.517979 30 | Triad: 4678.6 0.513289 0.512969 0.513564 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 10 14 | Number of Threads counted = 10 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 118496 microseconds. 18 | (= 118496 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9695.3 0.166336 0.165028 0.169559 28 | Scale: 9664.7 0.166638 0.165551 0.169249 29 | Add: 11234.8 0.215182 0.213621 0.219362 30 | Triad: 11107.6 0.217278 0.216069 0.217959 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.11.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 11 14 | Number of Threads counted = 11 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 122984 microseconds. 
18 | (= 122984 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9320.9 0.173304 0.171657 0.175774 28 | Scale: 9269.1 0.173222 0.172616 0.175255 29 | Add: 10717.8 0.225097 0.223926 0.227463 30 | Triad: 10655.4 0.225992 0.225238 0.226534 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.12.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 
12 | ------------------------------------------------------------- 13 | Number of Threads requested = 12 14 | Number of Threads counted = 12 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 131266 microseconds. 18 | (= 131266 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9421.1 0.171242 0.169831 0.173345 28 | Scale: 9324.4 0.172572 0.171593 0.173529 29 | Add: 10832.3 0.222469 0.221560 0.223770 30 | Triad: 10770.4 0.223738 0.222833 0.224532 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.13.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 
10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 13 14 | Number of Threads counted = 13 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 131184 microseconds. 18 | (= 131184 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9499.9 0.169635 0.168423 0.170948 28 | Scale: 9405.7 0.171128 0.170109 0.171827 29 | Add: 10906.8 0.220886 0.220047 0.222127 30 | Triad: 10830.5 0.222424 0.221596 0.222960 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.14.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 
8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 14 14 | Number of Threads counted = 14 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 132295 microseconds. 18 | (= 132295 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9531.6 0.168828 0.167863 0.170388 28 | Scale: 9394.1 0.170825 0.170319 0.171582 29 | Add: 10961.4 0.219918 0.218951 0.221417 30 | Triad: 10859.9 0.221567 0.220996 0.221906 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.15.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 15 14 | Number of Threads counted = 15 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 130723 microseconds. 18 | (= 130723 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9558.3 0.169207 0.167394 0.170829 28 | Scale: 9405.4 0.170808 0.170115 0.171596 29 | Add: 10944.5 0.220128 0.219288 0.221848 30 | Triad: 10887.3 0.221051 0.220440 0.221366 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.16.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 16 14 | Number of Threads counted = 16 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 128455 microseconds. 18 | (= 128455 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9542.9 0.168918 0.167664 0.170478 28 | Scale: 9377.1 0.171206 0.170629 0.172111 29 | Add: 10946.4 0.220042 0.219250 0.221368 30 | Triad: 10876.5 0.221576 0.220659 0.221982 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.17.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 17 14 | Number of Threads counted = 17 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 126279 microseconds. 
18 | (= 126279 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9524.3 0.169623 0.167992 0.172267 28 | Scale: 9313.5 0.172264 0.171793 0.173948 29 | Add: 10886.2 0.221329 0.220463 0.222398 30 | Triad: 10796.1 0.223514 0.222303 0.226614 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.18.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 
12 | ------------------------------------------------------------- 13 | Number of Threads requested = 18 14 | Number of Threads counted = 18 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 127269 microseconds. 18 | (= 127269 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9452.3 0.170355 0.169271 0.171998 28 | Scale: 9269.8 0.173278 0.172604 0.174098 29 | Add: 10821.1 0.222471 0.221789 0.223816 30 | Triad: 10733.0 0.224401 0.223609 0.224870 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.19.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 
10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 19 14 | Number of Threads counted = 19 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 127732 microseconds. 18 | (= 127732 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9406.7 0.171181 0.170092 0.173099 28 | Scale: 9234.8 0.174006 0.173257 0.174614 29 | Add: 10785.6 0.223619 0.222518 0.225288 30 | Triad: 10686.8 0.225419 0.224576 0.225824 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 
8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 2 14 | Number of Threads counted = 2 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 135847 microseconds. 18 | (= 135847 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 7337.1 0.219060 0.218070 0.220588 28 | Scale: 7716.4 0.208026 0.207351 0.209754 29 | Add: 8193.7 0.294265 0.292908 0.296555 30 | Triad: 8241.2 0.291917 0.291219 0.293191 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.20.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 20 14 | Number of Threads counted = 20 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 127912 microseconds. 18 | (= 127912 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9351.1 0.171879 0.171103 0.173050 28 | Scale: 9205.1 0.174853 0.173817 0.176800 29 | Add: 10743.7 0.224536 0.223386 0.225779 30 | Triad: 10626.6 0.226989 0.225848 0.232955 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 3 14 | Number of Threads counted = 3 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 127053 microseconds. 18 | (= 127053 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 8998.0 0.178954 0.177817 0.181178 28 | Scale: 9308.1 0.172936 0.171894 0.174343 29 | Add: 10254.5 0.235032 0.234044 0.237108 30 | Triad: 10238.3 0.234687 0.234414 0.235237 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 4 14 | Number of Threads counted = 4 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 123038 microseconds. 
18 | (= 123038 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9718.8 0.167448 0.164630 0.169944 28 | Scale: 9809.6 0.164312 0.163105 0.166159 29 | Add: 11022.9 0.218774 0.217728 0.220746 30 | Triad: 11092.4 0.217360 0.216364 0.217960 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 
12 | ------------------------------------------------------------- 13 | Number of Threads requested = 5 14 | Number of Threads counted = 5 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 119091 microseconds. 18 | (= 119091 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9679.7 0.167780 0.165294 0.170541 28 | Scale: 9803.7 0.164740 0.163203 0.165928 29 | Add: 11268.0 0.214475 0.212993 0.216835 30 | Triad: 11315.9 0.213512 0.212091 0.214203 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 
10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 6 14 | Number of Threads counted = 6 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 119348 microseconds. 18 | (= 119348 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9765.9 0.167010 0.163836 0.170884 28 | Scale: 9746.1 0.165846 0.164169 0.168201 29 | Add: 11322.1 0.213940 0.211974 0.217361 30 | Triad: 11224.1 0.214637 0.213825 0.215391 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 
8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 7 14 | Number of Threads counted = 7 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 120600 microseconds. 18 | (= 120600 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9634.0 0.169093 0.166079 0.171605 28 | Scale: 9531.0 0.169752 0.167873 0.172459 29 | Add: 11203.6 0.216979 0.214217 0.220045 30 | Triad: 11060.4 0.218029 0.216990 0.218821 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 8 14 | Number of Threads counted = 8 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 119760 microseconds. 18 | (= 119760 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9608.7 0.168858 0.166515 0.171386 28 | Scale: 9575.8 0.168100 0.167088 0.169256 29 | Add: 11156.7 0.216447 0.215117 0.218926 30 | Triad: 11037.6 0.218711 0.217439 0.219366 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 9 14 | Number of Threads counted = 9 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 118345 microseconds. 18 | (= 118345 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9747.4 0.166639 0.164147 0.169889 28 | Scale: 9630.1 0.167524 0.166145 0.169930 29 | Add: 11201.7 0.215795 0.214253 0.219663 30 | Triad: 11047.2 0.218182 0.217249 0.218931 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_CXLDDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,4179.1,4453.5,4636.2,4678.6 3 | 2,7337.1,7716.4,8193.7,8241.2 4 | 3,8998.0,9308.1,10254.5,10238.3 5 | 4,9718.8,9809.6,11022.9,11092.4 6 | 5,9679.7,9803.7,11268.0,11315.9 7 | 6,9765.9,9746.1,11322.1,11224.1 8 | 7,9634.0,9531.0,11203.6,11060.4 9 | 8,9608.7,9575.8,11156.7,11037.6 10 | 9,9747.4,9630.1,11201.7,11047.2 11 | 10,9695.3,9664.7,11234.8,11107.6 12 | 11,9320.9,9269.1,10717.8,10655.4 13 | 12,9421.1,9324.4,10832.3,10770.4 14 | 13,9499.9,9405.7,10906.8,10830.5 15 | 14,9531.6,9394.1,10961.4,10859.9 16 | 15,9558.3,9405.4,10944.5,10887.3 17 | 16,9542.9,9377.1,10946.4,10876.5 18 | 17,9524.3,9313.5,10886.2,10796.1 19 | 18,9452.3,9269.8,10821.1,10733.0 20 | 19,9406.7,9234.8,10785.6,10686.8 21 | 20,9351.1,9205.1,10743.7,10626.6 22 | -------------------------------------------------------------------------------- 
/remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 1 14 | Number of Threads counted = 1 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 103253 microseconds. 18 | (= 103253 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 10749.3 0.149110 0.148847 0.149554 28 | Scale: 10998.2 0.145643 0.145479 0.145896 29 | Add: 12198.7 0.197189 0.196743 0.197743 30 | Triad: 12131.6 0.198040 0.197830 0.198258 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 10 14 | Number of Threads counted = 10 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 94912 microseconds. 18 | (= 94912 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 12011.3 0.134133 0.133208 0.135429 28 | Scale: 11928.5 0.135436 0.134132 0.137006 29 | Add: 13701.9 0.176716 0.175158 0.178243 30 | Triad: 13666.3 0.177067 0.175614 0.179717 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.11.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 11 14 | Number of Threads counted = 11 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 98126 microseconds. 
18 | (= 98126 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 11599.8 0.138732 0.137933 0.140258 28 | Scale: 11328.1 0.141536 0.141242 0.142320 29 | Add: 12939.4 0.186310 0.185480 0.187470 30 | Triad: 12679.4 0.189788 0.189284 0.190346 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 
12 | ------------------------------------------------------------- 13 | Number of Threads requested = 2 14 | Number of Threads counted = 2 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 89407 microseconds. 18 | (= 89407 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 12267.9 0.131068 0.130422 0.132556 28 | Scale: 11891.3 0.136629 0.134552 0.138995 29 | Add: 13979.9 0.172135 0.171675 0.174251 30 | Triad: 13872.4 0.173677 0.173005 0.174913 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 
10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 3 14 | Number of Threads counted = 3 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 92433 microseconds. 18 | (= 92433 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 12557.1 0.133354 0.127418 0.144179 28 | Scale: 12090.8 0.133383 0.132332 0.134496 29 | Add: 14114.6 0.171665 0.170037 0.175098 30 | Triad: 14105.0 0.171175 0.170152 0.171659 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 
8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 4 14 | Number of Threads counted = 4 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 93810 microseconds. 18 | (= 93810 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 12478.3 0.134921 0.128223 0.147896 28 | Scale: 12243.4 0.132095 0.130683 0.134728 29 | Add: 14192.3 0.171367 0.169106 0.178828 30 | Triad: 14093.7 0.171056 0.170289 0.174184 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 5 14 | Number of Threads counted = 5 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 94319 microseconds. 18 | (= 94319 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 12063.9 0.141477 0.132627 0.148486 28 | Scale: 12235.0 0.132526 0.130773 0.135351 29 | Add: 13954.7 0.173991 0.171985 0.176238 30 | Triad: 14057.4 0.171101 0.170729 0.171554 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 6 14 | Number of Threads counted = 6 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 95283 microseconds. 18 | (= 95283 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 12179.6 0.134699 0.131367 0.143013 28 | Scale: 12159.1 0.133358 0.131589 0.135637 29 | Add: 14167.5 0.171946 0.169402 0.173677 30 | Triad: 14136.1 0.170780 0.169778 0.171887 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 7 14 | Number of Threads counted = 7 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 95813 microseconds. 
18 | (= 95813 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 11922.5 0.138979 0.134200 0.145591 28 | Scale: 11945.6 0.137373 0.133940 0.140009 29 | Add: 13139.3 0.188596 0.182658 0.196630 30 | Triad: 13505.8 0.181404 0.177702 0.186060 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 
12 | ------------------------------------------------------------- 13 | Number of Threads requested = 8 14 | Number of Threads counted = 8 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 94635 microseconds. 18 | (= 94635 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 12008.3 0.134107 0.133241 0.135240 28 | Scale: 12079.2 0.133946 0.132459 0.135378 29 | Add: 13769.0 0.175073 0.174304 0.175722 30 | Triad: 13815.6 0.175299 0.173717 0.176703 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 
10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 9 14 | Number of Threads counted = 9 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 93999 microseconds. 18 | (= 93999 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 12138.0 0.133188 0.131817 0.136354 28 | Scale: 12113.9 0.132852 0.132080 0.133873 29 | Add: 13834.8 0.174206 0.173476 0.174980 30 | Triad: 13842.5 0.173928 0.173379 0.174637 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,10749.3,10998.2,12198.7,12131.6 3 | 2,12267.9,11891.3,13979.9,13872.4 4 | 3,12557.1,12090.8,14114.6,14105.0 5 | 4,12478.3,12243.4,14192.3,14093.7 6 | 5,12063.9,12235.0,13954.7,14057.4 7 | 6,12179.6,12159.1,14167.5,14136.1 8 | 7,11922.5,11945.6,13139.3,13505.8 9 | 
8,12008.3,12079.2,13769.0,13815.6 10 | 9,12138.0,12113.9,13834.8,13842.5 11 | 10,12011.3,11928.5,13701.9,13666.3 12 | 11,11599.8,11328.1,12939.4,12679.4 13 | 12,11417.1,10814.3,12569.0,12105.1 14 | 13,11176.7,10430.0,12112.5,11548.8 15 | 14,11015.1,10011.5,11763.5,11083.7 16 | 15,10817.9,9736.0,11517.5,10775.8 17 | 16,10652.2,9458.2,11244.8,10438.2 18 | 17,10491.7,9212.1,11053.2,10189.1 19 | 18,10389.9,9063.8,10902.7,9938.3 20 | 19,10166.7,8889.2,10700.2,9687.7 21 | 20,10074.4,8561.7,10556.9,9444.8 22 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 1 14 | Number of Threads counted = 1 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 99098 microseconds. 18 | (= 99098 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 
23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 11169.0 0.143379 0.143254 0.143599 28 | Scale: 11427.9 0.140131 0.140008 0.140532 29 | Add: 12515.9 0.191923 0.191756 0.192138 30 | Triad: 12386.9 0.193884 0.193753 0.194148 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 10 14 | Number of Threads counted = 10 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 51015 microseconds. 18 | (= 51015 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 21824.8 0.073816 0.073311 0.074746 28 | Scale: 21271.2 0.075652 0.075219 0.076559 29 | Add: 24219.7 0.099511 0.099093 0.100451 30 | Triad: 23949.9 0.100611 0.100209 0.100764 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.11.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 11 14 | Number of Threads counted = 11 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 53475 microseconds. 
18 | (= 53475 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 22379.8 0.072495 0.071493 0.074542 28 | Scale: 21944.3 0.073600 0.072912 0.075188 29 | Add: 24884.1 0.097026 0.096447 0.098490 30 | Triad: 24561.5 0.097853 0.097714 0.098232 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.12.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 
12 | ------------------------------------------------------------- 13 | Number of Threads requested = 12 14 | Number of Threads counted = 12 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 53128 microseconds. 18 | (= 53128 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 21753.6 0.073842 0.073551 0.074189 28 | Scale: 20644.9 0.078236 0.077501 0.079786 29 | Add: 23987.3 0.100812 0.100053 0.102518 30 | Triad: 24006.7 0.100223 0.099972 0.100374 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.13.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 
10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 13 14 | Number of Threads counted = 13 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 56228 microseconds. 18 | (= 56228 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 21052.6 0.076175 0.076000 0.076391 28 | Scale: 20288.6 0.079909 0.078862 0.082120 29 | Add: 23470.5 0.103159 0.102256 0.105094 30 | Triad: 23451.5 0.102529 0.102339 0.102742 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.14.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 14 14 | Number of Threads counted = 14 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 58840 microseconds. 18 | (= 58840 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 20363.5 0.078672 0.078572 0.078875 28 | Scale: 19825.5 0.081920 0.080704 0.084184 29 | Add: 22906.0 0.105705 0.104776 0.107502 30 | Triad: 22975.1 0.104746 0.104461 0.105137 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.15.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 15 14 | Number of Threads counted = 15 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 61054 microseconds. 18 | (= 61054 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 19848.4 0.080768 0.080611 0.080894 28 | Scale: 19424.1 0.083487 0.082372 0.085731 29 | Add: 22472.5 0.107705 0.106797 0.109424 30 | Triad: 22595.3 0.106420 0.106217 0.107017 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.16.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 16 14 | Number of Threads counted = 16 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 62754 microseconds. 
18 | (= 62754 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 19342.8 0.082981 0.082718 0.083202 28 | Scale: 19020.7 0.085309 0.084119 0.088074 29 | Add: 22007.0 0.109913 0.109056 0.111735 30 | Triad: 22189.5 0.108260 0.108159 0.108453 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.17.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 
12 | ------------------------------------------------------------- 13 | Number of Threads requested = 17 14 | Number of Threads counted = 17 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 63843 microseconds. 18 | (= 63843 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 18954.9 0.084897 0.084411 0.085132 28 | Scale: 18570.9 0.087470 0.086156 0.090096 29 | Add: 21632.2 0.111818 0.110946 0.113704 30 | Triad: 21798.8 0.110186 0.110098 0.110343 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.18.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 
10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 18 14 | Number of Threads counted = 18 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 64707 microseconds. 18 | (= 64707 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 18559.1 0.086891 0.086211 0.087141 28 | Scale: 18280.1 0.088951 0.087527 0.091739 29 | Add: 21321.0 0.113434 0.112565 0.115459 30 | Triad: 21457.7 0.111982 0.111848 0.112349 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.19.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 19 14 | Number of Threads counted = 19 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 67893 microseconds. 18 | (= 67893 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 17922.9 0.089976 0.089271 0.092796 28 | Scale: 17826.3 0.090803 0.089755 0.092935 29 | Add: 20920.1 0.115709 0.114722 0.118971 30 | Triad: 21061.3 0.114280 0.113953 0.114938 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 2 14 | Number of Threads counted = 2 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 65845 microseconds. 18 | (= 65845 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 16891.9 0.095052 0.094720 0.095707 28 | Scale: 16953.4 0.094644 0.094376 0.095121 29 | Add: 19359.7 0.124284 0.123969 0.124973 30 | Triad: 19376.4 0.123982 0.123862 0.124171 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.20.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 20 14 | Number of Threads counted = 20 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 67439 microseconds. 
18 | (= 67439 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 17896.7 0.089699 0.089402 0.090050 28 | Scale: 17707.7 0.091581 0.090356 0.094052 29 | Add: 20873.9 0.115722 0.114976 0.117304 30 | Triad: 21031.4 0.114377 0.114115 0.114735 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 
12 | ------------------------------------------------------------- 13 | Number of Threads requested = 3 14 | Number of Threads counted = 3 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 58531 microseconds. 18 | (= 58531 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 19575.4 0.082119 0.081735 0.082829 28 | Scale: 19450.3 0.082538 0.082261 0.083065 29 | Add: 22747.1 0.105959 0.105508 0.106768 30 | Triad: 22657.1 0.106208 0.105927 0.106470 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 
10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 4 14 | Number of Threads counted = 4 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 54024 microseconds. 18 | (= 54024 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 21233.6 0.075713 0.075352 0.076412 28 | Scale: 21018.1 0.076475 0.076125 0.077094 29 | Add: 24262.1 0.099321 0.098920 0.100081 30 | Triad: 24100.8 0.099769 0.099582 0.099914 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 
8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 5 14 | Number of Threads counted = 5 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 52400 microseconds. 18 | (= 52400 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 21982.6 0.073319 0.072785 0.074226 28 | Scale: 21471.9 0.074806 0.074516 0.075342 29 | Add: 24503.3 0.098370 0.097946 0.099153 30 | Triad: 24266.7 0.099380 0.098901 0.099614 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 6 14 | Number of Threads counted = 6 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 51758 microseconds. 18 | (= 51758 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 21980.8 0.073274 0.072791 0.074173 28 | Scale: 21403.5 0.075070 0.074754 0.075791 29 | Add: 24401.2 0.098825 0.098356 0.099682 30 | Triad: 24088.2 0.099961 0.099634 0.100286 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 7 14 | Number of Threads counted = 7 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 51126 microseconds. 18 | (= 51126 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 21943.0 0.073408 0.072916 0.074311 28 | Scale: 21351.2 0.075296 0.074937 0.076113 29 | Add: 24367.2 0.098915 0.098493 0.099845 30 | Triad: 24097.3 0.100070 0.099596 0.100271 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 8 14 | Number of Threads counted = 8 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 51128 microseconds. 
18 | (= 51128 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 21883.4 0.073652 0.073115 0.074568 28 | Scale: 21306.6 0.075492 0.075094 0.076318 29 | Add: 24289.3 0.099231 0.098809 0.100140 30 | Triad: 24031.2 0.100315 0.099870 0.100499 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 
12 | ------------------------------------------------------------- 13 | Number of Threads requested = 9 14 | Number of Threads counted = 9 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 51070 microseconds. 18 | (= 51070 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 21839.7 0.073804 0.073261 0.074766 28 | Scale: 21287.6 0.075542 0.075161 0.076381 29 | Add: 24247.6 0.099440 0.098979 0.100322 30 | Triad: 23999.8 0.100416 0.100001 0.100628 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket0DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,11169.0,11427.9,12515.9,12386.9 3 | 2,16891.9,16953.4,19359.7,19376.4 4 | 3,19575.4,19450.3,22747.1,22657.1 5 | 4,21233.6,21018.1,24262.1,24100.8 6 | 5,21982.6,21471.9,24503.3,24266.7 7 | 6,21980.8,21403.5,24401.2,24088.2 8 | 7,21943.0,21351.2,24367.2,24097.3 9 | 8,21883.4,21306.6,24289.3,24031.2 10 | 9,21839.7,21287.6,24247.6,23999.8 11 | 10,21824.8,21271.2,24219.7,23949.9 12 | 
11,22379.8,21944.3,24884.1,24561.5 13 | 12,21753.6,20644.9,23987.3,24006.7 14 | 13,21052.6,20288.6,23470.5,23451.5 15 | 14,20363.5,19825.5,22906.0,22975.1 16 | 15,19848.4,19424.1,22472.5,22595.3 17 | 16,19342.8,19020.7,22007.0,22189.5 18 | 17,18954.9,18570.9,21632.2,21798.8 19 | 18,18559.1,18280.1,21321.0,21457.7 20 | 19,17922.9,17826.3,20920.1,21061.3 21 | 20,17896.7,17707.7,20873.9,21031.4 22 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 1 14 | Number of Threads counted = 1 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 142498 microseconds. 18 | (= 142498 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 7650.3 0.209291 0.209142 0.209377 28 | Scale: 7114.0 0.225146 0.224908 0.225412 29 | Add: 7885.2 0.304556 0.304367 0.304696 30 | Triad: 7174.2 0.334648 0.334533 0.334807 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.2.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 2 14 | Number of Threads counted = 2 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 133343 microseconds. 18 | (= 133343 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9165.3 0.175186 0.174571 0.175849 28 | Scale: 7481.1 0.214358 0.213872 0.215060 29 | Add: 9108.9 0.263958 0.263480 0.264399 30 | Triad: 7945.9 0.305224 0.302041 0.307478 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.3.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 3 14 | Number of Threads counted = 3 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 132772 microseconds. 
18 | (= 132772 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9456.8 0.169338 0.169191 0.169775 28 | Scale: 7429.1 0.216375 0.215370 0.217333 29 | Add: 9230.5 0.260784 0.260008 0.261224 30 | Triad: 7657.9 0.314127 0.313404 0.315308 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.4.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 
12 | ------------------------------------------------------------- 13 | Number of Threads requested = 4 14 | Number of Threads counted = 4 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 136607 microseconds. 18 | (= 136607 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9292.2 0.173044 0.172188 0.173473 28 | Scale: 7423.6 0.217133 0.215530 0.218579 29 | Add: 9213.9 0.261356 0.260477 0.262803 30 | Triad: 7693.4 0.314494 0.311955 0.318190 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.5.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 
10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 5 14 | Number of Threads counted = 5 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 137934 microseconds. 18 | (= 137934 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9238.6 0.173540 0.173186 0.173787 28 | Scale: 7345.0 0.219550 0.217836 0.222778 29 | Add: 9132.4 0.262974 0.262800 0.263200 30 | Triad: 7638.0 0.316099 0.314217 0.318165 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.6.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 
8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 6 14 | Number of Threads counted = 6 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 139503 microseconds. 18 | (= 139503 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9196.7 0.174072 0.173975 0.174149 28 | Scale: 7236.5 0.222046 0.221100 0.223674 29 | Add: 9142.6 0.263525 0.262507 0.264566 30 | Triad: 7524.5 0.322067 0.318958 0.325934 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.7.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 7 14 | Number of Threads counted = 7 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 144078 microseconds. 18 | (= 144078 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9110.8 0.176944 0.175615 0.178452 28 | Scale: 6968.3 0.233828 0.229611 0.236156 29 | Add: 8608.4 0.284117 0.278797 0.293368 30 | Triad: 6842.4 0.360628 0.350755 0.368618 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.8.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 8 14 | Number of Threads counted = 8 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 146898 microseconds. 18 | (= 146898 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9071.3 0.176979 0.176380 0.177868 28 | Scale: 6952.8 0.231661 0.230122 0.233326 29 | Add: 8957.1 0.268989 0.267943 0.270572 30 | Triad: 7327.4 0.330048 0.327537 0.336869 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output.9.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 9 14 | Number of Threads counted = 9 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 148554 microseconds. 
18 | (= 148554 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 9011.8 0.178404 0.177544 0.179654 28 | Scale: 6835.7 0.235976 0.234066 0.238939 29 | Add: 8954.4 0.269527 0.268024 0.270976 30 | Triad: 7274.3 0.331591 0.329930 0.334091 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR4_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,7650.3,7114.0,7885.2,7174.2 3 | 2,9165.3,7481.1,9108.9,7945.9 4 | 3,9456.8,7429.1,9230.5,7657.9 5 | 4,9292.2,7423.6,9213.9,7693.4 6 | 5,9238.6,7345.0,9132.4,7638.0 7 | 6,9196.7,7236.5,9142.6,7524.5 8 | 7,9110.8,6968.3,8608.4,6842.4 9 | 8,9071.3,6952.8,8957.1,7327.4 10 | 9,9011.8,6835.7,8954.4,7274.3 11 | 10,8922.9,6711.6,8970.8,7155.6 12 | 11,9236.1,6966.2,9227.6,7609.6 13 | 12,9467.1,7269.0,9553.9,7960.3 14 | 13,9584.4,7505.9,9682.8,8239.9 15 | 14,9702.0,7557.0,9890.8,8375.4 16 | 15,9755.0,7916.4,10070.1,8652.9 17 | 16,9953.2,8089.6,10279.3,8878.9 18 | 17,10021.0,8268.8,10385.6,9064.2 19 | 18,10073.3,8370.9,10503.7,9224.0 20 | 19,10035.9,8494.6,10497.7,9335.8 21 | 20,9999.4,8548.2,10594.2,9463.4 22 | 
-------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.1.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 1 14 | Number of Threads counted = 1 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 147413 microseconds. 18 | (= 147413 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 8121.6 0.197196 0.197005 0.197409 28 | Scale: 8256.7 0.193908 0.193782 0.194059 29 | Add: 9056.7 0.265123 0.264997 0.265290 30 | Triad: 8951.4 0.268308 0.268113 0.268659 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.10.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 10 14 | Number of Threads counted = 10 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 84671 microseconds. 18 | (= 84671 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 14703.4 0.109125 0.108818 0.109462 28 | Scale: 14629.5 0.110176 0.109368 0.111873 29 | Add: 17544.8 0.137415 0.136793 0.138812 30 | Triad: 17579.9 0.136705 0.136519 0.136982 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output.11.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 100000000 (elements), Offset = 0 (elements) 7 | Memory per array = 762.9 MiB (= 0.7 GiB). 8 | Total memory required = 2288.8 MiB (= 2.2 GiB). 9 | Each kernel will be executed 10 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 11 14 | Number of Threads counted = 11 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 85628 microseconds. 
18 | (= 85628 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 15726.4 0.102540 0.101740 0.104150 28 | Scale: 15441.2 0.104464 0.103619 0.106550 29 | Add: 18612.0 0.129761 0.128949 0.131864 30 | Triad: 18493.8 0.130050 0.129773 0.130910 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /remote_NUMA_allcores/noHT_Socket0Socket1_Socket1DDR5_Close_noFT_NOPATH_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,8121.6,8256.7,9056.7,8951.4 3 | 2,12699.6,12775.8,14846.7,14763.8 4 | 3,14504.3,14494.4,17258.4,17266.3 5 | 4,15047.4,14908.5,17845.5,17862.3 6 | 5,15049.6,14888.3,17836.7,17846.1 7 | 6,14926.3,14780.9,17727.5,17764.4 8 | 7,14831.1,14718.2,17684.9,17720.4 9 | 8,14777.1,14687.6,17628.9,17660.2 10 | 9,14768.0,14666.9,17593.5,17632.0 11 | 10,14703.4,14629.5,17544.8,17579.9 12 | 11,15726.4,15441.2,18612.0,18493.8 13 | 12,15835.9,15453.0,18748.7,18971.3 14 | 13,16166.5,15805.3,19052.6,19136.6 15 | 14,16450.6,16086.7,19352.8,19401.8 16 | 15,16519.4,16270.6,19455.6,19568.0 17 | 16,16800.9,16654.8,19831.4,20036.1 18 | 17,17032.1,16997.6,20110.8,20366.9 19 | 18,17292.0,17295.8,20386.1,20643.2 20 | 19,17561.4,17589.2,20615.3,20900.6 21 | 
20,17870.3,17844.4,20895.0,21125.1 22 | -------------------------------------------------------------------------------- /remote_PMDK/noHT_Socket0_CXLDAX_Close_noFT_@mnt@pmem2_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,3929.9,3787.4,3863.1,3881.9 3 | 2,6767.7,6691.4,6840.1,6858.8 4 | 3,8102.7,7986.7,8420.1,8326.7 5 | 4,8545.3,8371.0,9151.6,9019.2 6 | 5,8130.1,7471.5,8852.3,8723.4 7 | 6,8305.4,7667.6,9060.9,8837.9 8 | 7,8557.8,8051.5,9187.6,9023.0 9 | 8,8770.0,8472.1,9414.1,9287.4 10 | 9,8827.4,8403.6,9529.9,9345.6 11 | 10,8817.3,8453.2,9507.3,9335.8 12 | -------------------------------------------------------------------------------- /remote_PMDK/noHT_Socket0_Socket1DDR5DAX_Close_noFT_@mnt@pmem1_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,7201.7,6109.9,5952.7,6304.4 3 | 2,11963.1,10631.6,11463.0,12043.5 4 | 3,14026.3,12967.4,15141.2,15769.1 5 | 4,14826.5,14297.1,16887.2,17305.6 6 | 5,13933.5,13237.6,16095.4,16129.2 7 | 6,14040.4,13570.4,16396.8,16377.8 8 | 7,14475.7,14002.9,16855.3,16973.7 9 | 8,14481.9,13997.3,16971.6,17117.9 10 | 9,14468.1,14044.4,16981.6,17110.5 11 | 10,14500.2,14093.8,16981.9,17087.9 12 | -------------------------------------------------------------------------------- /remote_PMDK/noHT_Socket1_CXLDAX_Close_noFT_@mnt@pmem2_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,2449.3,2326.4,2174.6,2260.7 3 | 2,4154.7,3995.4,3700.7,3902.9 4 | 3,5589.5,5360.6,4829.4,5170.1 5 | 4,6514.4,6214.5,5897.5,6224.2 6 | 5,7174.2,6882.2,6811.2,7167.6 7 
| 6,7662.5,7345.1,7453.8,7768.2 8 | 7,7946.9,7723.7,7944.8,8199.6 9 | 8,8098.0,7933.2,8404.8,8625.4 10 | 9,8232.6,7979.2,8627.5,8822.1 11 | 10,8273.8,8148.9,8814.3,8994.2 12 | -------------------------------------------------------------------------------- /remote_PMDK_affinity/noHT_Socket0Socket1_CXLDAX_Close_noFT_@mnt@pmem2_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,3949.1,3812.0,3881.6,3916.9 3 | 2,6793.4,6622.6,6873.9,6817.3 4 | 3,8195.0,8109.0,8442.1,8368.0 5 | 4,8568.4,8348.4,9234.9,9013.4 6 | 5,8051.0,7390.4,8867.4,8753.9 7 | 6,8287.3,7672.1,9037.1,8893.2 8 | 7,8618.3,8045.4,9160.7,9011.4 9 | 8,8744.9,8392.2,9390.7,9298.7 10 | 9,8827.5,8419.8,9504.1,9349.9 11 | 10,8773.6,8430.1,9529.1,9328.8 12 | 11,8870.3,8558.1,8607.5,8844.9 13 | 12,8888.4,8570.4,8885.8,8980.1 14 | 13,8914.8,8627.0,9037.2,9075.3 15 | 14,8894.0,8627.1,9109.9,9093.0 16 | 15,8961.7,8672.6,9285.9,9255.9 17 | 16,8994.3,8700.6,9400.4,9325.8 18 | 17,8958.2,8697.3,9389.1,9352.4 19 | 18,8994.5,8729.2,9553.8,9458.8 20 | 19,8996.5,8751.4,9535.3,9463.8 21 | 20,9024.1,8737.2,9596.8,9513.8 22 | -------------------------------------------------------------------------------- /remote_PMDK_affinity/noHT_Socket0Socket1_CXLDAX_Spread_noFT_@mnt@pmem2_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,3983.3,3812.2,3897.5,3931.7 3 | 2,4512.6,4314.8,3695.1,4040.5 4 | 3,5946.5,5700.5,4616.5,5285.8 5 | 4,6982.0,6725.3,6000.1,6411.6 6 | 5,7757.5,7577.7,7016.4,7390.5 7 | 6,8221.7,7949.7,7708.1,8027.0 8 | 7,8413.1,8161.6,8097.1,8335.5 9 | 8,8557.5,8312.3,8472.5,8678.0 10 | 9,8681.2,8441.4,8760.0,8913.0 11 | 10,8817.1,8582.4,8932.7,9035.7 12 | 11,8875.4,8585.2,9137.0,9171.2 13 | 
12,8935.9,8619.6,9239.4,9292.6 14 | 13,8941.1,8614.7,9282.3,9342.0 15 | 14,8788.8,8668.0,9325.2,9334.4 16 | 15,8840.6,8692.5,9440.2,9475.0 17 | 16,8975.4,8685.9,9492.5,9437.1 18 | 17,8985.6,8689.9,9495.4,9443.8 19 | 18,9010.4,8672.3,9558.7,9467.0 20 | 19,9005.5,8687.5,9588.5,9511.7 21 | 20,9046.5,8705.3,9619.0,9496.7 22 | -------------------------------------------------------------------------------- /remote_PMDK_affinity/noHT_Socket0Socket1_Socket0DDR5DAX_Close_noFT_@mnt@pmem0_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,8181.5,6714.7,6517.6,6876.5 3 | 2,14955.0,12813.4,12899.0,13511.0 4 | 3,18813.1,16876.4,18345.7,19094.7 5 | 4,20870.0,19344.2,22251.9,22646.0 6 | 5,19639.9,18214.1,21896.2,21598.3 7 | 6,20079.1,18945.6,22303.6,21892.8 8 | 7,20867.6,19733.1,22795.1,22583.4 9 | 8,20940.2,19863.9,22863.0,22645.8 10 | 9,20983.1,20095.5,22989.2,22720.2 11 | 10,21141.1,20142.7,23095.3,22888.5 12 | 11,21777.3,20798.7,23856.9,23675.4 13 | 12,21118.0,20085.4,23262.6,23080.7 14 | 13,20523.8,19681.2,22812.8,22649.2 15 | 14,19967.8,19294.3,22381.6,22294.3 16 | 15,19446.3,18904.3,21940.3,21968.1 17 | 16,18962.6,18570.5,21549.2,21648.9 18 | 17,18500.7,18209.1,21149.0,21371.5 19 | 18,18115.7,17873.3,20854.3,21133.8 20 | 19,17836.4,17627.5,20590.6,20920.9 21 | 20,17630.6,17376.8,20396.3,20710.4 22 | -------------------------------------------------------------------------------- /remote_PMDK_affinity/noHT_Socket0Socket1_Socket0DDR5DAX_Spread_noFT_@mnt@pmem0_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,8207.1,6718.1,6526.3,6876.9 3 | 2,12790.8,11254.0,11726.3,12348.9 4 | 3,16493.1,15207.8,17011.6,17323.6 5 | 
4,17405.9,16263.5,18894.2,19453.0 6 | 5,18152.7,16616.6,20438.0,20608.1 7 | 6,17408.1,16344.9,20129.7,20296.0 8 | 7,18124.7,17116.2,20715.9,20951.5 9 | 8,17298.8,16788.4,20065.2,20330.7 10 | 9,16892.6,16285.3,19795.3,19946.6 11 | 10,17508.1,17027.4,20359.5,20591.0 12 | 11,17069.4,16621.3,19930.9,20121.9 13 | 12,16967.7,16383.7,19681.8,19780.8 14 | 13,16699.5,16166.0,19435.6,19544.0 15 | 14,16625.6,16069.1,19259.3,19343.0 16 | 15,16518.2,15968.7,19137.5,19224.8 17 | 16,16693.1,16212.1,19373.1,19515.8 18 | 17,16792.6,16412.9,19508.5,19754.9 19 | 18,17138.5,16800.9,19870.5,20139.1 20 | 19,17393.6,17119.7,20140.5,20434.7 21 | 20,17660.0,17386.0,20434.1,20697.9 22 | -------------------------------------------------------------------------------- /remote_PMDK_affinity/noHT_Socket0Socket1_Socket1DDR5DAX_Close_noFT_@mnt@pmem1_Arrays100000000_Cores10/output_data.csv: -------------------------------------------------------------------------------- 1 | Number of Threads,Copy Rate (MB/s),Scale Rate (MB/s),Add Rate (MB/s),Triad Rate (MB/s) 2 | 1,7238.1,6113.1,5958.8,6219.7 3 | 2,11978.1,10644.7,11509.1,12046.7 4 | 3,14044.8,12981.4,15139.5,15782.6 5 | 4,14839.6,14301.3,16896.4,17310.3 6 | 5,14997.1,14722.0,17343.2,17737.5 7 | 6,14940.0,14795.8,17513.4,17735.1 8 | 7,14139.3,13780.6,16594.4,16579.6 9 | 8,14181.0,13783.1,16812.4,16797.0 10 | 9,14229.2,13822.5,16768.6,16802.8 11 | 10,14295.0,13806.7,16721.0,16808.7 12 | 11,15245.8,14545.9,17698.1,17680.2 13 | 12,15475.0,14822.8,17973.5,18062.5 14 | 13,15717.9,15088.8,18226.9,18393.9 15 | 14,15960.4,15356.9,18482.8,18681.4 16 | 15,16221.1,15631.7,18797.3,19034.5 17 | 16,16524.1,15946.6,19088.4,19347.5 18 | 17,16685.8,16281.7,19310.3,19661.2 19 | 18,16816.3,16459.4,19435.2,19928.9 20 | 19,17164.3,16916.0,19811.4,20264.8 21 | 20,17396.8,17102.0,19949.1,20324.4 22 | --------------------------------------------------------------------------------