├── .gitmodules ├── LICENSE ├── README.md ├── basic_cuda ├── Makefile ├── README.md ├── cudamacro.h ├── main.cu ├── plot_ising.py └── sample_plot.png ├── basic_python ├── README.md ├── ising_basic.py ├── plot_ising_multi.py └── sample_plot.png ├── optimized ├── cuBlumeCapel │ ├── LICENSE │ ├── Makefile │ ├── README.md │ ├── cudamacro.h │ ├── main.cu │ ├── utils.c │ ├── utils.h │ ├── vmm_alloc.cu │ └── vmm_alloc.h ├── cuIsingModel │ ├── LICENSE │ ├── Makefile │ ├── README.md │ ├── cudamacro.h │ ├── main.cu │ ├── utils.c │ ├── utils.h │ ├── vmm_alloc.cu │ └── vmm_alloc.h └── old │ ├── Makefile │ ├── README.md │ ├── cudamacro.h │ ├── images │ └── lattice_8192x8192_T_1.500000_IT_00001024_0.txt.png │ ├── main.cu │ ├── plotLattice.py │ ├── utils.c │ └── utils.h └── tensorcore ├── Makefile ├── README.md ├── cudamacro.h ├── main.cu ├── plot_ising.py └── sample_plot.png /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/cub"] 2 | path = external/cub 3 | url = https://github.com/NVlabs/cub.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 NVIDIA Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
# Build rules for the basic CUDA implementation (produces ./ising_basic).
# Adjust CUDA_HOME and the -arch value in CUDACFLAGS to match your system.
CUDA_HOME=/usr/local/cuda
CUDACC=$(CUDA_HOME)/bin/nvcc
CC=gcc
LD=$(CUDACC)
LDFLAGS=-lcurand
CFLAGS=-c -O3 -g -I$(CUDA_HOME)/include
CUDACFLAGS= -std=c++11 -c -O3 -lineinfo -arch=sm_70 -Xptxas=-v -I../external/cub

# Neither target names a file, so never let a stray file called "all" or
# "clean" shadow them.
.PHONY: all clean

all: ising_basic

ising_basic: main.o
	$(LD) -o ising_basic main.o $(LDFLAGS)

# main.cu includes cudamacro.h; rebuild when the header changes.
main.o: cudamacro.h

# Compile with the nvcc selected through CUDA_HOME rather than whichever nvcc
# happens to be on PATH. CUDACFLAGS already carries -c.
%.o: %.cu
	$(CUDACC) $(CUDACFLAGS) -o $@ $<

# -f keeps `make clean` from failing when there is nothing to remove.
clean:
	rm -f *.o ising_basic
 5 | 6 | Example run command: 7 | 8 | `./ising_basic -x <rows> -y <columns> -n <number of iterations>` 9 | 10 | Run `./ising_basic --help` for more options. 11 | 12 | ### Visualizing Results 13 | The `-o` flag enables output of the final lattice configuration to the text file `final.txt`. Use the provided `plot_ising.py` to visualize the output. 14 | 15 | For example: 16 | ``` 17 | $ ./ising_basic -x 2048 -y 2048 -n 100 -a 0.5 -o 18 | ... 19 | Writing lattice to final.txt... 20 | 21 | $ python plot_ising.py 22 | ``` 23 | 24 | This will produce the following output: 25 | 26 | ![sample_plot.png](sample_plot.png) 27 | -------------------------------------------------------------------------------- /basic_cuda/cudamacro.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
#ifndef __CUDA_MACRO_H__
#define __CUDA_MACRO_H__

/* fprintf() and exit() are used by every macro below. */
#include <stdio.h>
#include <stdlib.h>

/*
 * Error-checking helpers. Each macro evaluates its argument exactly once,
 * prints a diagnostic to stderr and terminates the process with EXIT_FAILURE
 * when the call did not succeed.
 *
 * The bodies are wrapped in do { ... } while (0) so that an invocation followed
 * by a semicolon is a single statement and can be used safely inside an
 * unbraced if/else. Local names carry a trailing underscore to avoid shadowing
 * identifiers that appear in the checked expression.
 */

/* Check the cudaError_t returned by a CUDA runtime call. */
#define CHECK_CUDA(call) do {                                                 \
    cudaError_t err_ = (call);                                                \
    if (cudaSuccess != err_) {                                                \
        fprintf(stderr, "CUDA error in file '%s' in line %i : %s.\n",         \
                __FILE__, __LINE__, cudaGetErrorString(err_));                \
        exit(EXIT_FAILURE);                                                   \
    }                                                                         \
} while (0)

/* Check the cublasStatus_t returned by a cuBLAS call. */
#define CHECK_CUBLAS(call) do {                                               \
    cublasStatus_t status_ = (call);                                          \
    if (CUBLAS_STATUS_SUCCESS != status_) {                                   \
        fprintf(stderr, "CUBLAS error: %s = %d at (%s:%d)\n", #call,          \
                status_, __FILE__, __LINE__);                                 \
        exit(EXIT_FAILURE);                                                   \
    }                                                                         \
} while (0)

/* Check the curandStatus_t returned by a cuRAND call. */
#define CHECK_CURAND(call) do {                                               \
    curandStatus_t status_ = (call);                                          \
    if (CURAND_STATUS_SUCCESS != status_) {                                   \
        fprintf(stderr, "CURAND error: %s = %d at (%s:%d)\n", #call,          \
                status_, __FILE__, __LINE__);                                 \
        exit(EXIT_FAILURE);                                                   \
    }                                                                         \
} while (0)

/*
 * Check (and clear) the last error recorded by the CUDA runtime, e.g. right
 * after a kernel launch. errorMessage is a C string printed with the report.
 */
#define CHECK_ERROR(errorMessage) do {                                        \
    cudaError_t err_ = cudaGetLastError();                                    \
    if (cudaSuccess != err_) {                                                \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",     \
                (errorMessage), __FILE__, __LINE__,                           \
                cudaGetErrorString(err_));                                    \
        exit(EXIT_FAILURE);                                                   \
    }                                                                         \
} while (0)

#endif
// Initialize lattice spins
//
// Every thread writes one entry of `lattice`: -1 when the matching uniform
// random value is below 0.5, +1 otherwise.
//
// Launch layout: 1D grid of 1D blocks providing at least nx * ny threads;
// surplus threads return immediately. `ny` is the row length of the array
// being filled (callers in this file pass half of the full lattice width,
// because each color is stored in its own nx x ny/2 array).
__global__ void init_spins(signed char* lattice,
                           const float* __restrict__ randvals,
                           const long long nx,
                           const long long ny) {
  const long long tid = static_cast<long long>(blockDim.x) * blockIdx.x + threadIdx.x;
  if (tid >= nx * ny) return;

  lattice[tid] = (randvals[tid] < 0.5f) ? -1 : 1;
}

// Metropolis update of one checkerboard color.
//
//   lattice    spins of the color being updated (read and written)
//   op_lattice spins of the opposite color (read only)
//   randvals   one uniform random number per site of `lattice`
//   inv_temp   inverse temperature
//   nx, ny     rows and row length of the *single-color* arrays
//
// The template flag selects which color `lattice` holds; together with the row
// parity it decides whether the fourth neighbor sits in column j-1 or j+1 of
// the opposite-color array. Boundaries are periodic in both directions.
//
// Launch layout: 1D grid of 1D blocks providing at least nx * ny threads.
template<bool is_black>
__global__ void update_lattice(signed char* lattice,
                               const signed char* __restrict__ op_lattice,
                               const float* __restrict__ randvals,
                               const float inv_temp,
                               const long long nx,
                               const long long ny) {
  const long long tid = static_cast<long long>(blockDim.x) * blockIdx.x + threadIdx.x;
  const int i = tid / ny;
  const int j = tid % ny;

  if (i >= nx || j >= ny) return;

  // Set stencil indices with periodicity
  int ipp = (i + 1 < nx) ? i + 1 : 0;
  int inn = (i - 1 >= 0) ? i - 1: nx - 1;
  int jpp = (j + 1 < ny) ? j + 1 : 0;
  int jnn = (j - 1 >= 0) ? j - 1: ny - 1;

  // Select off-column index based on color and row index parity
  int joff;
  if (is_black) {
    joff = (i % 2) ? jpp : jnn;
  } else {
    joff = (i % 2) ? jnn : jpp;
  }

  // Compute sum of nearest neighbor spins
  signed char nn_sum = op_lattice[inn * ny + j] + op_lattice[i * ny + j] + op_lattice[ipp * ny + j] + op_lattice[i * ny + joff];

  // Determine whether to flip spin
  signed char lij = lattice[i * ny + j];
  float acceptance_ratio = expf(-2.0f * inv_temp * nn_sum * lij);
  if (randvals[i*ny + j] < acceptance_ratio) {
    lattice[i * ny + j] = -lij;
  }
}

// Write lattice configuration to file
//
// Copies both single-color device arrays (each nx x ny/2) to the host,
// interleaves them into the full nx x ny lattice and writes it as text, one
// row per line with space-separated values. Exits on allocation or CUDA
// failure; a file that cannot be opened is reported on stderr and skipped.
void write_lattice(signed char *lattice_b, signed char *lattice_w, std::string filename, long long nx, long long ny) {
  printf("Writing lattice to %s...\n", filename.c_str());

  const long long half_sites = nx * ny / 2;
  signed char *lattice_h   = (signed char*) malloc(nx * ny * sizeof(*lattice_h));
  signed char *lattice_b_h = (signed char*) malloc(half_sites * sizeof(*lattice_b_h));
  signed char *lattice_w_h = (signed char*) malloc(half_sites * sizeof(*lattice_w_h));
  if (!lattice_h || !lattice_b_h || !lattice_w_h) {
    fprintf(stderr, "ERROR: Could not allocate host memory for lattice output.\n");
    free(lattice_h);
    free(lattice_b_h);
    free(lattice_w_h);
    exit(EXIT_FAILURE);
  }

  CHECK_CUDA(cudaMemcpy(lattice_b_h, lattice_b, half_sites * sizeof(*lattice_b), cudaMemcpyDeviceToHost));
  // The white buffer must be filled from the white device array; reading the
  // black array here would write every black spin twice.
  CHECK_CUDA(cudaMemcpy(lattice_w_h, lattice_w, half_sites * sizeof(*lattice_w), cudaMemcpyDeviceToHost));

  // Interleave the colors: on even rows black occupies the even columns, on
  // odd rows it occupies the odd columns.
  for (long long i = 0; i < nx; i++) {
    for (long long j = 0; j < ny/2; j++) {
      if (i % 2) {
        lattice_h[i*ny + 2*j+1] = lattice_b_h[i*ny/2 + j];
        lattice_h[i*ny + 2*j] = lattice_w_h[i*ny/2 + j];
      } else {
        lattice_h[i*ny + 2*j] = lattice_b_h[i*ny/2 + j];
        lattice_h[i*ny + 2*j+1] = lattice_w_h[i*ny/2 + j];
      }
    }
  }

  std::ofstream f;
  f.open(filename);
  if (f.is_open()) {
    for (long long i = 0; i < nx; i++) {
      for (long long j = 0; j < ny; j++) {
        f << (int)lattice_h[i * ny + j] << " ";
      }
      f << '\n';
    }
    f.close();
  } else {
    fprintf(stderr, "WARNING: Could not open %s for writing.\n", filename.c_str());
  }

  free(lattice_h);
  free(lattice_b_h);
  free(lattice_w_h);
}

// Perform one full lattice sweep: update all black sites, then all white
// sites, drawing a fresh set of random numbers for each color.
// `nx`, `ny` are the dimensions of the full lattice; the kernels receive the
// single-color row length ny/2. Work is issued asynchronously; callers
// synchronize when they need the result.
void update(signed char *lattice_b, signed char *lattice_w, float* randvals, curandGenerator_t rng, float inv_temp, long long nx, long long ny) {

  // Setup CUDA launch configuration
  int blocks = (nx * ny/2 + THREADS - 1) / THREADS;

  // Update black
  CHECK_CURAND(curandGenerateUniform(rng, randvals, nx*ny/2));
  update_lattice<true><<<blocks, THREADS>>>(lattice_b, lattice_w, randvals, inv_temp, nx, ny/2);
  CHECK_CUDA(cudaGetLastError());  // surfaces invalid launch configurations

  // Update white
  CHECK_CURAND(curandGenerateUniform(rng, randvals, nx*ny/2));
  update_lattice<false><<<blocks, THREADS>>>(lattice_w, lattice_b, randvals, inv_temp, nx, ny/2);
  CHECK_CUDA(cudaGetLastError());
}

// Print the command-line help to stdout and exit successfully.
// `pname` is the program path (argv[0]); only its basename is shown.
static void usage(const char *pname) {

  const char *bname = rindex(pname, '/');
  if (!bname) {bname = pname;}
  else        {bname++;}

  fprintf(stdout,
          "Usage: %s [options]\n"
          "options:\n"
          "\t-x|--lattice-n <LATTICE_N>\n"
          "\t\tnumber of lattice rows\n"
          "\n"
          "\t-y|--lattice-m <LATTICE_M>\n"
          "\t\tnumber of lattice columns\n"
          "\n"
          "\t-w|--nwarmup <NWARMUP>\n"
          "\t\tnumber of warmup iterations\n"
          "\n"
          "\t-n|--niters <NITERS>\n"
          "\t\tnumber of trial iterations\n"
          "\n"
          "\t-a|--alpha <ALPHA>\n"
          "\t\tcoefficient of critical temperature\n"
          "\n"
          "\t-s|--seed <SEED>\n"
          "\t\tseed for random number generation\n"
          "\n"
          "\t-o|--write-lattice\n"
          "\t\twrite final lattice configuration to file\n\n",
          bname);
  exit(EXIT_SUCCESS);
}

// Program entry point: parse options, initialize a random lattice, run the
// warmup and timed sweeps, report performance and the absolute average
// magnetization, and optionally write the final lattice to final.txt.
int main(int argc, char **argv) {

  // Defaults
  long long nx = 5120;
  long long ny = 5120;
  float alpha = 0.1f;
  int nwarmup = 100;
  int niters = 1000;
  bool write = false;
  unsigned long long seed = 1234ULL;

  while (1) {
    static struct option long_options[] = {
        {     "lattice-n", required_argument, 0, 'x'},
        {     "lattice-m", required_argument, 0, 'y'},
        {         "alpha", required_argument, 0, 'a'},  // must map to 'a', not 'y'
        {          "seed", required_argument, 0, 's'},
        {       "nwarmup", required_argument, 0, 'w'},
        {        "niters", required_argument, 0, 'n'},
        { "write-lattice",       no_argument, 0, 'o'},
        {          "help",       no_argument, 0, 'h'},
        {               0,                 0, 0,   0}
    };

    int option_index = 0;
    int ch = getopt_long(argc, argv, "x:y:a:s:w:n:oh", long_options, &option_index);
    if (ch == -1) break;

    switch(ch) {
      case 0:
        break;
      case 'x':
        nx = atoll(optarg); break;
      case 'y':
        ny = atoll(optarg); break;
      case 'a':
        alpha = atof(optarg); break;
      case 's':
        seed = atoll(optarg); break;
      case 'w':
        nwarmup = atoi(optarg); break;
      case 'n':
        niters = atoi(optarg); break;
      case 'o':
        write = true; break;
      case 'h':
        usage(argv[0]); break;
      case '?':
        exit(EXIT_FAILURE);
      default:
        fprintf(stderr, "unknown option: %c\n", ch);
        exit(EXIT_FAILURE);
    }
  }

  // Check arguments
  if (nx <= 0 || ny <= 0) {
    // atoll() yields 0 for non-numeric input; a zero-sized lattice would lead
    // to empty kernel launches and a 0/0 magnetization.
    fprintf(stderr, "ERROR: Lattice dimensions must be positive values.\n");
    exit(EXIT_FAILURE);
  }
  if (nx % 2 != 0 || ny % 2 != 0) {
    fprintf(stderr, "ERROR: Lattice dimensions must be even values.\n");
    exit(EXIT_FAILURE);
  }

  float inv_temp = 1.0f / (alpha*TCRIT);

  // Setup cuRAND generator
  curandGenerator_t rng;
  CHECK_CURAND(curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_PHILOX4_32_10));
  CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(rng, seed));
  float *randvals;
  CHECK_CUDA(cudaMalloc(&randvals, nx * ny/2 * sizeof(*randvals)));

  // Setup black and white lattice arrays on device
  signed char *lattice_b, *lattice_w;
  CHECK_CUDA(cudaMalloc(&lattice_b, nx * ny/2 * sizeof(*lattice_b)));
  CHECK_CUDA(cudaMalloc(&lattice_w, nx * ny/2 * sizeof(*lattice_w)));

  int blocks = (nx * ny/2 + THREADS - 1) / THREADS;
  CHECK_CURAND(curandGenerateUniform(rng, randvals, nx*ny/2));
  init_spins<<<blocks, THREADS>>>(lattice_b, randvals, nx, ny/2);
  CHECK_CUDA(cudaGetLastError());
  CHECK_CURAND(curandGenerateUniform(rng, randvals, nx*ny/2));
  init_spins<<<blocks, THREADS>>>(lattice_w, randvals, nx, ny/2);
  CHECK_CUDA(cudaGetLastError());

  // Warmup iterations
  printf("Starting warmup...\n");
  for (int i = 0; i < nwarmup; i++) {
    update(lattice_b, lattice_w, randvals, rng, inv_temp, nx, ny);
  }

  CHECK_CUDA(cudaDeviceSynchronize());

  printf("Starting trial iterations...\n");
  auto t0 = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < niters; i++) {
    update(lattice_b, lattice_w, randvals, rng, inv_temp, nx, ny);
    if (i % 1000 == 0) printf("Completed %d/%d iterations...\n", i+1, niters);
  }

  CHECK_CUDA(cudaDeviceSynchronize());
  auto t1 = std::chrono::high_resolution_clock::now();

  // Elapsed time in microseconds (the report below scales it to seconds and
  // to updates per nanosecond).
  double duration = (double) std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
  printf("REPORT:\n");
  printf("\tnGPUs: %d\n", 1);
  printf("\ttemperature: %f * %f\n", alpha, TCRIT);
  printf("\tseed: %llu\n", seed);
  printf("\twarmup iterations: %d\n", nwarmup);
  printf("\ttrial iterations: %d\n", niters);
  printf("\tlattice dimensions: %lld x %lld\n", nx, ny);
  printf("\telapsed time: %f sec\n", duration * 1e-6);
  printf("\tupdates per ns: %f\n", (double) (nx * ny) * niters / duration * 1e-3);

  // Reduce
  // The spins of each color are summed in chunks of at most CUB_CHUNK_SIZE
  // elements; every chunk yields one partial sum per color.
  double* devsum;
  int nchunks = (nx * ny/2 + CUB_CHUNK_SIZE - 1)/ CUB_CHUNK_SIZE;
  CHECK_CUDA(cudaMalloc(&devsum, 2 * nchunks * sizeof(*devsum)));
  size_t cub_workspace_bytes = 0;
  void* workspace = NULL;
  // First call with a NULL workspace only queries the required scratch size.
  CHECK_CUDA(cub::DeviceReduce::Sum(workspace, cub_workspace_bytes, lattice_b, devsum, CUB_CHUNK_SIZE));
  CHECK_CUDA(cudaMalloc(&workspace, cub_workspace_bytes));
  for (int i = 0; i < nchunks; i++) {
    CHECK_CUDA(cub::DeviceReduce::Sum(workspace, cub_workspace_bytes, &lattice_b[i*CUB_CHUNK_SIZE], devsum + 2*i,
                           std::min((long long) CUB_CHUNK_SIZE, nx * ny/2 - i * CUB_CHUNK_SIZE)));
    CHECK_CUDA(cub::DeviceReduce::Sum(workspace, cub_workspace_bytes, &lattice_w[i*CUB_CHUNK_SIZE], devsum + 2*i + 1,
                           std::min((long long) CUB_CHUNK_SIZE, nx * ny/2 - i * CUB_CHUNK_SIZE)));
  }

  double* hostsum;
  hostsum = (double*)malloc(2 * nchunks * sizeof(*hostsum));
  if (!hostsum) {
    fprintf(stderr, "ERROR: Could not allocate host memory for reduction results.\n");
    exit(EXIT_FAILURE);
  }
  CHECK_CUDA(cudaMemcpy(hostsum, devsum, 2 * nchunks * sizeof(*devsum), cudaMemcpyDeviceToHost));
  double fullsum = 0.0;
  for (int i = 0; i < 2 * nchunks; i++) {
    fullsum += hostsum[i];
  }
  std::cout << "\taverage magnetism (absolute): " << abs(fullsum / (nx * ny)) << std::endl;

  if (write) write_lattice(lattice_b, lattice_w, "final.txt", nx, ny);

  // Release host and device resources
  free(hostsum);
  CHECK_CUDA(cudaFree(workspace));
  CHECK_CUDA(cudaFree(devsum));
  CHECK_CUDA(cudaFree(lattice_w));
  CHECK_CUDA(cudaFree(lattice_b));
  CHECK_CUDA(cudaFree(randvals));
  CHECK_CURAND(curandDestroyGenerator(rng));

  return 0;
}
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ising-gpu/935796d7a26016670363af7a0dced8a9ebcd4714/basic_cuda/sample_plot.png -------------------------------------------------------------------------------- /basic_python/README.md: -------------------------------------------------------------------------------- 1 | ### Basic Implementation using Python 2 | ### Required packages: 3 | - numpy 4 | - numba 5 | - cupy 6 | - matplotlib (optional, for plotting only) 7 | 8 | ### Basic Usage 9 | Single GPU: 10 | 11 | `python ising_basic.py -x <rows> -y <columns> -n <number of iterations>` 12 | 13 | Multi GPU using MPI: 14 | 15 | `mpirun -np <# of GPUS> python ising_basic.py -x <rows> -y <columns> -n <number of iterations>` 16 | 17 | Run `python ising_basic.py --help` for more options. 18 | 19 | ### Visualizing Results 20 | The `-o` flag enables output of the final lattice configuration to the text files `final_rank*.txt`. Use the provided `plot_ising_multi.py` to visualize the output. 21 | 22 | For example: 23 | ``` 24 | $ mpirun -np 2 python ising_basic.py -x 2048 -y 2048 -n 100 -a 0.5 -o 25 | ... 26 | Writing lattice to final_rank0.txt... 27 | Writing lattice to final_rank1.txt... 28 | 29 | $ python plot_ising_multi.py 30 | ``` 31 | 32 | This will produce the following output: 33 | 34 | ![sample_plot.png](sample_plot.png) 35 | -------------------------------------------------------------------------------- /basic_python/ising_basic.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a 4 | # copy of this software and associated documentation files (the "Software"), 5 | # to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | # and/or sell copies of the Software, and to permit persons to whom the 8 | # Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 
import argparse
import math
import sys
import time

import cupy.cuda.curand as curand
from mpi4py import MPI
from numba import cuda
from numba import vectorize
import numpy as np

# Critical temperature; the simulated temperature is alpha * TCRIT
TCRIT = 2.26918531421

# MPI communicator and the ranks holding the neighboring slabs (periodic)
comm = MPI.COMM_WORLD
rank = comm.rank
rank_up = comm.size - 1 if comm.rank == 0 else comm.rank - 1
rank_down = 0 if comm.rank == comm.size - 1 else comm.rank + 1

# Command line interface
parser = argparse.ArgumentParser()
parser.add_argument("--lattice-n", '-x', type=int, default=40*128, help="number of lattice rows")
parser.add_argument("--lattice-m", '-y', type=int, default=40*128, help="number of lattice columns")
parser.add_argument("--nwarmup", '-w', type=int, default=100, help="number of warmup iterations")
parser.add_argument("--niters", '-n', type=int, default=1000, help="number of trial iterations")
parser.add_argument("--alpha", '-a', type=float, default=0.1, help="coefficient of critical temperature")
parser.add_argument("--seed", '-s', type=int, default=1234, help="seed for random number generation")
parser.add_argument("--write-lattice", '-o', action='store_true', help="write final lattice configuration to file/s")
parser.add_argument("--use-common-seed", '-c', action='store_true', help="Use common seed for all ranks + updating offset. " +
                                                                          "Yields consistent results independent of number " +
                                                                          "of GPUs but is slower.")
args = parser.parse_args()

# Validate the requested decomposition
if args.lattice_m % 2 != 0:
    raise Exception("lattice_m must be an even value. Aborting.")
if args.lattice_n % comm.size != 0:
    raise Exception("lattice_n must be evenly divisible by number of GPUs. Aborting.")
if (args.lattice_n / comm.size) % 2 != 0:
    raise Exception("Slab width (lattice_n / nGPUs) must be an even value. Aborting.")

# Number of lattice rows owned by each rank
lattice_slab_n = args.lattice_n // comm.size

inv_temp = (1.0) / (args.alpha * TCRIT)


@vectorize(['int8(float32)'], target='cuda')
def generate_lattice(randval):
    """Map a uniform random value to a spin: +1 above 0.5, otherwise -1."""
    if randval > 0.5:
        return 1
    return -1


@cuda.jit
def update_lattice_multi(lattice, op_lattice, op_lattice_up, op_lattice_down, randvals, is_black):
    """Metropolis update of one color of this rank's slab (multi-GPU case).

    `lattice` holds the color being updated, `op_lattice` the opposite color of
    the same slab, and `op_lattice_up` / `op_lattice_down` the opposite color of
    the neighboring slabs, which supply the vertical neighbors of the first and
    last rows. One thread handles one site.
    """
    n, m = lattice.shape
    gid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    j = gid % m
    i = gid // m

    if (i >= n or j >= m):
        return

    # Periodic column neighbors
    if (j + 1) < m:
        jpp = j + 1
    else:
        jpp = 0
    if (j - 1) >= 0:
        jnn = j - 1
    else:
        jnn = m - 1

    # Off-column index depends on the color and on the row parity
    if (is_black):
        if (i % 2):
            joff = jpp
        else:
            joff = jnn
    else:
        if (i % 2):
            joff = jnn
        else:
            joff = jpp

    # Sum of nearest-neighbor spins; rows beyond the slab come from the
    # neighboring slabs
    nn_sum = op_lattice[i, j] + op_lattice[i, joff]
    if (i - 1) >= 0:
        nn_sum += op_lattice[i - 1, j]
    else:
        nn_sum += op_lattice_up[n - 1, j]
    if (i + 1) < n:
        nn_sum += op_lattice[i + 1, j]
    else:
        nn_sum += op_lattice_down[0, j]

    # Flip the spin when the random draw falls below the acceptance ratio
    lij = lattice[i, j]
    acceptance_ratio = math.exp(-2.0 * inv_temp * nn_sum * lij)
    if (randvals[i, j] < acceptance_ratio):
        lattice[i, j] = -lij


# Single-GPU variant: fewer arguments, which is a bit faster due to launch
# overhead introduced by numba
@cuda.jit
def update_lattice(lattice, op_lattice, randvals, is_black):
    """Metropolis update of one color with periodic boundaries in both directions.

    `lattice` holds the color being updated and `op_lattice` the opposite
    color. One thread handles one site.
    """
    n, m = lattice.shape
    gid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    i = gid // m
    j = gid % m

    if (i >= n or j >= m):
        return

    # Periodic row and column neighbors
    if (i + 1) < n:
        ipp = i + 1
    else:
        ipp = 0
    if (j + 1) < m:
        jpp = j + 1
    else:
        jpp = 0
    if (i - 1) >= 0:
        inn = i - 1
    else:
        inn = n - 1
    if (j - 1) >= 0:
        jnn = j - 1
    else:
        jnn = m - 1

    # Off-column index depends on the color and on the row parity
    if (is_black):
        if (i % 2):
            joff = jpp
        else:
            joff = jnn
    else:
        if (i % 2):
            joff = jnn
        else:
            joff = jpp

    # Sum of nearest-neighbor spins
    nn_sum = op_lattice[inn, j] + op_lattice[i, j] + op_lattice[ipp, j] + op_lattice[i, joff]

    # Flip the spin when the random draw falls below the acceptance ratio
    lij = lattice[i, j]
    acceptance_ratio = math.exp(-2.0 * inv_temp * nn_sum * lij)
    if (randvals[i, j] < acceptance_ratio):
        lattice[i, j] = -lij


def write_lattice(prefix, lattice_b, lattice_w):
    """Interleave the two color arrays of this rank and save them as text.

    The output file is named `<prefix>_rank<rank>.txt`.
    """
    black = lattice_b.copy_to_host()
    white = lattice_w.copy_to_host()
    lattice = np.zeros((lattice_slab_n, args.lattice_m), dtype=np.int8)

    # Even rows: black on even columns, white on odd columns
    lattice[0::2, 0::2] = black[0::2, :]
    lattice[0::2, 1::2] = white[0::2, :]
    # Odd rows: the colors swap places
    lattice[1::2, 1::2] = black[1::2, :]
    lattice[1::2, 0::2] = white[1::2, :]

    print("Writing lattice to {}_rank{}.txt...".format(prefix, rank))
    np.savetxt("{}_rank{}.txt".format(prefix, rank), lattice, fmt='%d')


class curandUniformRNG:
    """Thin wrapper around a cuRAND Philox generator producing uniform floats.

    With --use-common-seed the generator offset is managed explicitly: it starts
    at this rank's share of a lattice-sized block and is advanced by a full
    lattice-sized block after every fill.
    """

    def __init__(self, seed=0):
        gen = curand.createGenerator(curand.CURAND_RNG_PSEUDO_PHILOX4_32_10)
        curand.setPseudoRandomGeneratorSeed(gen, seed)
        if (args.use_common_seed):
            self.offset = rank * lattice_slab_n * args.lattice_m // 2
            curand.setGeneratorOffset(gen, self.offset)
        self._rng = gen

    def fill_random(self, arr):
        """Fill the device array `arr` with uniform random numbers."""
        dev_ptr = arr.__cuda_array_interface__['data'][0]
        curand.generateUniform(self._rng, dev_ptr, arr.size)
        if (args.use_common_seed):
            self.offset += args.lattice_n * args.lattice_m // 2
            curand.setGeneratorOffset(self._rng, self.offset)


def sync():
    """Device synchronization followed by an MPI barrier."""
    cuda.synchronize()
    comm.barrier()


def update(lattices_b, lattices_w, randvals, rng):
    """Run one sweep: update the black sites, then the white sites.

    `lattices_b` / `lattices_w` are per-rank lists of color arrays; this rank
    updates its own entry and, with several ranks, reads the neighbors' entries.
    """
    # CUDA launch configuration
    threads = 128
    blocks = (args.lattice_m // 2 * lattice_slab_n + threads - 1) // threads

    # (color being updated, opposite color, is_black) in update order
    passes = ((lattices_b, lattices_w, True), (lattices_w, lattices_b, False))

    if (comm.size > 1):
        for own, other, is_black in passes:
            rng.fill_random(randvals)
            update_lattice_multi[blocks, threads](own[rank], other[rank], other[rank_up], other[rank_down], randvals, is_black)
            sync()
    else:
        for own, other, is_black in passes:
            rng.fill_random(randvals)
            update_lattice[blocks, threads](own[rank], other[rank], randvals, is_black)


# Set device
cuda.select_device(rank)

# Setup cuRAND generator
rng = curandUniformRNG(seed=args.seed if args.use_common_seed else args.seed + 42 * rank)
randvals = cuda.device_array((lattice_slab_n, args.lattice_m // 2), dtype=np.float32)

# Setup black and white lattice arrays on device
rng.fill_random(randvals)
lattice_b = generate_lattice(randvals)
rng.fill_random(randvals)
lattice_w = generate_lattice(randvals)

# Exchange CUDA IPC handles; the local arrays are used directly for this rank
ipch_b = comm.allgather(lattice_b.get_ipc_handle())
ipch_w = comm.allgather(lattice_w.get_ipc_handle())
lattices_b = [x.open() if i != rank else lattice_b for i,x in enumerate(ipch_b)]
lattices_w = [x.open() if i != rank else lattice_w for i,x in enumerate(ipch_w)]

# Warmup iterations
if rank == 0:
    print("Starting warmup...")
    sys.stdout.flush()
sync()
for i in range(args.nwarmup):
    update(lattices_b, lattices_w, randvals, rng)
sync()

# Trial iterations
if rank == 0:
    print("Starting trial iterations...")
    sys.stdout.flush()
t0 = time.time()
for i in range(args.niters):
    update(lattices_b, lattices_w, randvals, rng)
    if (rank == 0 and i % 1000 == 0):
        print("Completed {}/{} iterations...".format(i+1, args.niters))
        sys.stdout.flush()
sync()

t1 = time.time()
t = t1 - t0

# Compute average magnetism
m = (np.sum(lattices_b[rank], dtype=np.int64) + np.sum(lattices_w[rank], dtype=np.int64)) / float(args.lattice_n * args.lattice_m)
m_global = comm.allreduce(m, MPI.SUM)

if (rank == 0):
    print("REPORT:")
    print("\tnGPUs: {}".format(comm.size))
    print("\ttemperature: {} * {}".format(args.alpha, TCRIT))
    print("\tseed: {}".format(args.seed))
    print("\twarmup iterations: {}".format(args.nwarmup))
    print("\ttrial iterations: {}".format(args.niters))
    print("\tlattice dimensions: {} x {}".format(args.lattice_n, args.lattice_m))
    print("\telapsed time: {} sec".format(t))
    print("\tupdates per ns: {}".format((args.lattice_n * args.lattice_m * args.niters) / t * 1e-9))
    print("\taverage magnetism (absolute): {}".format(np.abs(m_global)))
    sys.stdout.flush()

sync()

if (args.write_lattice):
    write_lattice("final", lattices_b[rank], lattices_w[rank])
import glob
import re

import numpy as np


def rank_key(path):
    """Sort key that orders lattice files by their numeric rank.

    Plain string sorting places 'final_rank10.txt' before 'final_rank2.txt',
    which would stack the slabs in the wrong order once more than ten ranks
    are used. Names without a parsable rank sort after all ranked files, in
    alphabetical order.
    """
    match = re.search(r"final_rank(\d+)\.txt$", path)
    if match is None:
        return (1, 0, path)
    return (0, int(match.group(1)), path)


def load_lattice(files):
    """Load every slab file and stack the slabs along the row axis, in order."""
    return np.concatenate([np.loadtxt(f, dtype=np.int32) for f in files])


def main():
    """Find the per-rank lattice files, assemble the full lattice and plot it."""
    # Imported here so the helpers above remain usable without matplotlib.
    import matplotlib.pyplot as plt

    files = sorted(glob.glob("final_rank*.txt"), key=rank_key)

    if (len(files) == 0):
        raise Exception("Could not find any lattice files. Expecting files named 'final_rank*.txt' for processing")

    lattice = load_lattice(files)

    plt.imshow(lattice)
    plt.title('Final Lattice Configuration')
    plt.colorbar()
    plt.show()


if __name__ == "__main__":
    main()
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /optimized/cuBlumeCapel/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_HOME = /opt/cuda-12.8.1 2 | CUDACC = $(CUDA_HOME)/bin/nvcc 3 | CC = gcc 4 | LD = $(CUDACC) 5 | 6 | CFLAGS = -c -O3 -g -I$(CUDA_HOME)/include 7 | 8 | SMS ?= 89 9 | CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM)) 10 | 11 | CUDACFLAGS = -c -O3 -lineinfo $(CUDA_ARCH) -Xptxas=-v 12 | LDFLAGS = -Xcompiler=-fopenmp -lcurand 13 | 14 | C_SRCS = utils.c 15 | CU_SRCS = main.cu 16 | 17 | ifdef USE_MNNVL 18 | $(info Compiling with MNNVL support...) 
19 | MPI_HOME = /project/coreai_hpc_hpc/hpc_sdk/Linux_aarch64/dev/comm_libs/12.8/hpcx/latest/ompi 20 | 21 | CUDACFLAGS += -I$(MPI_HOME)/include -DUSE_MNNVL 22 | LDFLAGS += -L$(MPI_HOME)/lib -lcuda -lmpi 23 | CU_SRCS += vmm_alloc.cu 24 | endif 25 | 26 | C_OBJS = $(patsubst %.c, %.o, $(C_SRCS)) 27 | CU_OBJS = $(patsubst %.cu, %.o, $(CU_SRCS)) 28 | 29 | all: cuBlume 30 | 31 | cuBlume: $(CU_OBJS) $(C_OBJS) 32 | $(LD) -o cuBlume $(CU_OBJS) $(C_OBJS) $(LDFLAGS) 33 | 34 | %.o: %.cu 35 | $(CUDACC) $(CUDACFLAGS) $< 36 | 37 | %.o: %.c 38 | $(CC) $(CFLAGS) $< -o $@ 39 | 40 | clean: 41 | -@rm -f *.o cuBlume *.sass &> /dev/null || true 42 | -------------------------------------------------------------------------------- /optimized/cuBlumeCapel/README.md: -------------------------------------------------------------------------------- 1 | # A CUDA implementation for the Blume-Capel model supporting Multi-Node NVLink 2 | 3 | A high performance Blume Capel model implementation for GPU. The code can run on 4 | multiple GPUs connected to the same node or on multiple nodes connected via 5 | NVLink (MNNVL). 6 | 7 | To compile the code to run on single node, adjust the Makefile to point to your CUDA 8 | installation, specify the CUDA architecture you want to compile for and then 9 | run `make`. That should be enough to produce the ``cuBlume`` binary. 10 | 11 | For multi-node, in addition to the Makefile adjustment above, also modify it to 12 | point to your MPI installation and then compile it with `make USE_MNNVL=1`. 13 | 14 | When running on a single node, the code uses managed memory. On multiple nodes 15 | with MNNVL, it uses [fabric memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#fabric-memory). 16 | 17 | When more than one GPU is used, the spin system is partitioned vertically. 18 | 19 | ## Usage 20 | 21 |
 22 | Usage: cuBlume [options]
 23 | options:
 24 |         -x|--x <HORIZ_DIM>
 25 |                 Specifies the horizontal dimension of the entire  lattice  (black+white  spins).
 26 |                 This dimension must be a multiple of 2048.
 27 | 
 28 |         -y|--y <VERT_DIM>
 29 |                 Specifies the vertical dimension of the per-GPU lattice.  This dimension must be
 30 |                 a multiple of 16.
 31 | 
 32 |         -n|--n <NSTEPS>
 33 |                 Specifies the number of iterations to run.
 34 |                 Default: 1
 35 | 
 36 |         -g|--gpus <NUM_DEVICES>
 37 |                 Specifies the number of GPUs to use. Will use devices with ids [0, NUM_DEVICES-1].
 38 |                 Default: 1.
 39 | 
 40 |         -s|--seed <SEED>
 41 |                 Specifies the seed used to generate random numbers.
 42 |                 Default: 463463564571
 43 | 
 44 |         -a|--alpha <ALPHA>
 45 |                 Specifies the temperature in T_CRIT units.  If both this  option  and  '-t'  are
 46 |                 specified then the '-t' option is used.
 47 |                 Default: 0.100000
 48 | 
 49 |         -d|--delta <DELTA>
 50 |                 Specifies the delta parameter for the Blume-Capel model.
 51 |                 Default: 1.000000
 52 | 
 53 |         -t|--temp <TEMP_0>[[,<IT_1>:<TEMP_1>]...]
 54 |                 Specifies the temperature(s), in absolute  units.   It  is  possible  to  use  a
 55 |                 temperature-changing   protocol   by   specifying   a   sequence   of    couples
 56 |                 <IT_i>:<TEMP_i> after the first temperature <TEMP_0>. The value <IT_i> specifies
 57 |                 the time step at which the temperature  changes  from  <TEMP_i-1>  to  <TEMP_i>.
 58 |                 Temperature <TEMP_0> is the starting temperature and thus  does  not  require  a
 59 |                 time step specification.
 60 |                 Default: 0.226919
 61 | 
 62 |         -p|--print <STAT_FREQ>
 63 |                 Specifies the frequency, in number of iterations, with which the magnetization
 64 |                 statistics are printed.  If this option is used together with the '-e' option,
 65 |                 this option is ignored.
 66 |                 Default: only at the beginning and at end of the simulation
 67 | 
 68 |         --pexp
 69 |                 Prints statistics every power-of-2 time steps.  This  option  overrides  the  -p
 70 |                 option.
 71 |                 Default: disabled
 72 | 
 73 |         -c|--corr
 74 |                 Dumps  to  a  file  named  corr_{TYPE}_{X}x{Y}_T_{TEMP} the correlation of  each
 75 |                 point with the vertical and horizontal neighbors at distance r <= 256.   Beyond
 76 |                 that, distances are chosen according to an exponential rule, with 32 values per
 77 |                 power of 2.  The  correlation  is  computed  every  time  the  magnetization  is
 78 |                 printed on screen (based  on  either  the  '-p'  or  '-e'  options)  and  it  is
 79 |                 written in the  file one line per measure.
 80 |                 Default: full correlation (see --corrfull option)
 81 | 
 82 |         --corrfull
 83 |                 Compute the correlation for each spin in the system.
 84 | 
 85 |         --corrdiag
 86 |                 Compute the correlation only for diagonal spins.
 87 | 
 88 |         --corrchkb
 89 |                 Computes the correlation for only one spin (the top-left one)  for each block of
 90 |                 16x16 spins (checkerboard pattern).
 91 | 
 92 |         --corrmixd
 93 |                 Computes the correlation using a mix of full and checkerboard modes.   The  full
 94 |                 correlation is used for  all distances  r <= 32. Then,  for each spin in a 16x16
 95 |                 square, it is computed for each r > 32.
 96 | 
 97 |         --writechkp <CHECKPOINT_FILE_PATH>
 98 |                 Enables write of checkpoint file at the end of the simulation.  The file can  be
 99 |                 later used to resume the simulation with the '-r' option.  This option and  '-r'
100 |                 can be used together to break down a  large  run  into  multiple  smaller  runs.
101 |                 When running with multiple processes,  the file name must contain either '%i' or
102 |                 '%d' which will be substituted with the process number.
103 |                 
104 |         --readchkp <CHECKPOINT_FILE_PATH>
105 |                 Enables the restart of a simulation from the state in a checkpoint file.  Please
106 |                 note that in order for that to work, the non-checkpoint  command  lines  options
107 |                 used in the run where the checkpoint file was created must match with those used
108 |                 in the run where the checkpoint file is read.  This option and '--writechkp' can be used
109 |                 together  to  break   down   a   large   run   into   multiple   smaller   runs.
110 |                 When running with multiple processes,  the file name must contain either '%i' or
111 |                 '%d' which will be substituted with the process number.
112 |         -o|--o
113 |                 Enables the file dump of  the lattice  every time  the magnetization is printed.
114 |                 Default: off
115 | 
116 | 117 | For example, to run 1024 steps on a 32768^2 lattice using one GPU, using temperature 1.5 and 118 | printing the statistics every 128 steps: 119 | 120 |
121 | $ ./cuBlume -y 32768 -x 32768 -n 1024 -p 128 -g 1 -t 1.5
122 | 
123 | Using GPUs:
124 |          0 (NVIDIA RTX 6000 Ada Generation, 48 GB, 142 SMs, 1536 th/SM max, CC 8.9, ECC off)
125 | 
126 | Run configuration:
127 |         word size: 16
128 |         bits per spin: 4 (mask: 0xF)
129 |         spins/word: 32
130 |         spins: 1073741824 (~1.07E+09)
131 |         seed: 463463564571
132 |         block size (X, Y): 16, 16
133 |         tile  size (X, Y): 32, 16
134 |         grid size 1D: 32768
135 |         virtual grid size 2D (X, Y): 16, 2048
136 |         spins per tile (X, Y): 1024, 512
137 | 
138 |         iterations:
139 |                 beg: 1
140 |                 end: 1024
141 |                 tot: 1024
142 | 
143 |         print stats every 128 steps
144 |         delta: 1
145 |         temperature: 1.5 (0.661030190265538*T_crit)
146 | 
147 |         no. of  processes: 1
148 |         GPUs  per process: 1
149 |         total no. of GPUs: 1
150 |         GPUs  memory type: managed
151 | 
152 |         per-GPU lattice size:         32768 x    32768 spins
153 |         per-GPU lattice shape: 2 x    32768 x      512 ull2s (    33554432 total)
154 | 
155 |         total lattice size:         32768 x    32768 spins
156 |         total lattice shape: 2 x    32768 x      512 ull2s (    33554432 total)
157 | 
158 |         total memory: 0.50 GB (0.50 GB per GPU)
159 | 
160 | Setting up GPUs:
161 |         GPU  0 done in 0.020104 secs
162 | 
163 | Initializing spin lattice... done in 0.058671 secs
164 | 
165 | [Switching to temperature: 1.5]
166 | 
167 | Running simulation...
168 | 
169 |         Step   MC SW          Magn.          N(-1)           N(0)           N(1)     SD value     flips/ns         GB/s          ERT
170 | 
171 |            0           7.080846E-06      357903413      357927395      357911016     5.716485
172 |          128    *      1.601530E-04      376546141      320821505      376374178     1.000418       511.37       769.55        2.17s
173 |          256    *      5.741259E-04      376809831      320738625      376193368     0.999816       509.93       767.38        2.18s
174 |          384    *      1.082895E-04      376545445      320767209      376429170     0.999965       509.44       766.64        2.18s
175 |          512    *      1.646699E-04      376582881      320752875      376406068     1.000123       507.92       764.36        2.18s
176 |          640    *      1.317356E-04      376417378      320765618      376558828     0.999747       510.41       768.11        2.18s
177 |          768    *      3.697937E-04      376286661      320771439      376683724     1.000044       508.58       765.36        2.18s
178 |          896    *      3.665267E-04      376673596      320788187      376280041     0.999778       509.95       767.42        2.18s
179 |         1024    *      1.519648E-04      376579698      320745599      376416527     1.000010       504.67       759.46        2.18s
180 | 
181 | Done in 2.184835E+03 ms (stats overhead: 1.15%, spins/ns: 503.25, BW: 757.33 GB/s)
182 | 
183 | 184 | To run 128 steps on a 2^20x2^20 lattice using 8 H100 GPUs: 185 | 186 |
187 | $ ./cuBlume -y $((2**20 / 8)) -x $((2**20)) -n 128 -p 32 -t 1.5 -g 8
188 | 
189 | Using GPUs:
190 |          0 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
191 |          1 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
192 |          2 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
193 |          3 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
194 |          4 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
195 |          5 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
196 |          6 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
197 |          7 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
198 | 
199 | Run configuration:
200 |         word size: 16
201 |         bits per spin: 4 (mask: 0xF)
202 |         spins/word: 32
203 |         spins: 1099511627776 (~1.10E+12)
204 |         seed: 463463564571
205 |         block size (X, Y): 16, 16
206 |         tile  size (X, Y): 32, 16
207 |         grid size 1D: 4194304
208 |         virtual grid size 2D (X, Y): 512, 8192
209 |         spins per tile (X, Y): 1024, 512
210 | 
211 |         iterations:
212 |                 beg: 1
213 |                 end: 128
214 |                 tot: 128
215 | 
216 |         print stats every 32 steps
217 |         delta: 1
218 |         temperature: 1.5 (0.661030190265538*T_crit)
219 | 
220 |         no. of  processes: 1
221 |         GPUs  per process: 8
222 |         total no. of GPUs: 8
223 |         GPUs  memory type: managed
224 | 
225 |         per-GPU lattice size:        131072 x  1048576 spins
226 |         per-GPU lattice shape: 2 x   131072 x    16384 ull2s (  4294967296 total)
227 | 
228 |         total lattice size:       1048576 x  1048576 spins
229 |         total lattice shape: 2 x  1048576 x    16384 ull2s ( 34359738368 total)
230 | 
231 |         total memory: 512.00 GB (64.00 GB per GPU)
232 | 
233 | Setting up GPUs:
234 |         GPU  0 done in 1.094278 secs
235 |         GPU  1 done in 1.260294 secs
236 |         GPU  2 done in 1.268412 secs
237 |         GPU  3 done in 1.259265 secs
238 |         GPU  4 done in 1.269356 secs
239 |         GPU  5 done in 1.279294 secs
240 |         GPU  6 done in 1.286008 secs
241 |         GPU  7 done in 1.288558 secs
242 | 
243 | Initializing spin lattice... done in 6.611633 secs
244 | 
245 | [Switching to temperature: 1.5]
246 | 
247 | Running simulation...
248 | 
249 |         Step   MC SW          Magn.          N(-1)           N(0)           N(1)     SD value     flips/ns         GB/s          ERT
250 | 
251 |            0           5.335123E-07   366503778958   366503483257   366504365561     5.717101
252 |           32    *      3.692141E-06   384166621778   331174324668   384170681330     1.001240      6375.69      9567.43       22.45s
253 |           64    *      7.972785E-07   385202476551   329105798057   385203353168     1.000216      6375.91      9567.76       22.45s
254 |           96    *      2.314373E-07   385421280026   328668813256   385421534494     1.000065      6376.48      9568.61       22.45s
255 |          128    *      5.507602E-06   385491919415   328533844618   385485863743     1.000011      6376.44      9568.55       22.45s
256 | 
257 | Done in 2.244686E+04 ms (stats overhead: 1.70%, spins/ns: 6269.81, BW: 9408.54 GB/s)
258 | 
259 | 260 | ## Contacts 261 | 262 | For comments, questions or anything related, write to Mauro Bisson at maurob@nvidia.com. 263 | -------------------------------------------------------------------------------- /optimized/cuBlumeCapel/cudamacro.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | #ifndef __CUDA_MACRO_H__ 23 | #define __CUDA_MACRO_H__ 24 | 25 | #define CHECK_CUDA(call) { \ 26 | cudaError_t err = call; \ 27 | if( cudaSuccess != err) { \ 28 | fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ 29 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 30 | exit(EXIT_FAILURE); \ 31 | }} 32 | 33 | #define CHECK_ERROR(errorMessage) { \ 34 | cudaError_t err = cudaGetLastError(); \ 35 | if( cudaSuccess != err) { \ 36 | fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ 37 | errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ 38 | exit(EXIT_FAILURE); \ 39 | }} 40 | #endif 41 | -------------------------------------------------------------------------------- /optimized/cuBlumeCapel/utils.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Mauro Bisson 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a 7 | * copy of this software and associated documentation files (the "Software"), 8 | * to deal in the Software without restriction, including without limitation 9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | * and/or sell copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | * DEALINGS IN THE SOFTWARE. 23 | */ 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | void *Malloc(size_t sz) { 34 | 35 | void *ptr; 36 | 37 | if (!sz) { 38 | printf("Allocating zero bytes...\n"); 39 | exit(EXIT_FAILURE); 40 | } 41 | ptr = (void *)malloc(sz); 42 | if (!ptr) { 43 | fprintf(stderr, "Cannot allocate %zu bytes...\n", sz); 44 | exit(EXIT_FAILURE); 45 | } 46 | memset(ptr, 0, sz); 47 | return ptr; 48 | } 49 | 50 | void Free(void **ptr) { 51 | 52 | if (*ptr) { 53 | free(*ptr); 54 | *ptr = NULL; 55 | } 56 | return; 57 | } 58 | 59 | void *Realloc(void *ptr, size_t sz) { 60 | 61 | void *lp; 62 | 63 | if (!sz) { 64 | printf("Re-allocating to zero bytes, are you sure you want this?\n"); 65 | } 66 | lp = (void *)realloc(ptr, sz); 67 | if (!lp && sz) { 68 | fprintf(stderr, "Cannot reallocate to %zu bytes...\n", sz); 69 | exit(EXIT_FAILURE); 70 | } 71 | return lp; 72 | } 73 | 74 | FILE *Fopen(const char *path, const char *mode) { 75 | 76 | FILE *fp = NULL; 77 | fp = fopen(path, mode); 78 | if (!fp) { 79 | fprintf(stderr, "Cannot open file %s...\n", path); 80 | exit(EXIT_FAILURE); 81 | } 82 | return fp; 83 | } 84 | 85 | size_t Fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) { 86 | 87 | size_t wmemb=0; 88 | 89 | wmemb = fwrite(ptr, size, nmemb, stream); 90 | if (wmemb < nmemb) { 91 | fprintf(stderr, "Error while writing to file!\n"); 92 | exit(EXIT_FAILURE); 93 | } 94 | return wmemb; 95 | } 96 | 97 | size_t Fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { 98 | 99 | size_t rmemb=0; 100 | 101 | rmemb = fread(ptr, size, nmemb, stream); 102 | if (rmemb < nmemb && ferror(stream)) { 103 | fprintf(stderr, "Error while 
reading from file, could not read more than %zu elements!\n", rmemb); 104 | exit(EXIT_FAILURE); 105 | } 106 | return rmemb; 107 | } 108 | 109 | int Remove(const char *pathname) { 110 | 111 | int rv = remove(pathname); 112 | if (rv && errno != ENOENT) { 113 | fprintf(stderr, "Error removing file %s: %s\n", pathname, strerror(errno)); 114 | exit(EXIT_FAILURE); 115 | } 116 | return rv; 117 | } 118 | 119 | off_t getFsize(const char *fpath) { 120 | 121 | struct stat st; 122 | int rv; 123 | 124 | rv = stat(fpath, &st); 125 | if (rv) { 126 | fprintf(stderr, "Cannot stat file %s...\n", fpath); 127 | exit(EXIT_FAILURE); 128 | } 129 | return st.st_size; 130 | } 131 | 132 | double Wtime(void) { 133 | struct timespec tp; 134 | 135 | int rv = clock_gettime(CLOCK_MONOTONIC, &tp); 136 | if(rv) return 0; 137 | 138 | return tp.tv_nsec/1.0E+9 + (double)tp.tv_sec; 139 | } 140 | -------------------------------------------------------------------------------- /optimized/cuBlumeCapel/utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Mauro Bisson 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a 7 | * copy of this software and associated documentation files (the "Software"), 8 | * to deal in the Software without restriction, including without limitation 9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | * and/or sell copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 
15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | * DEALINGS IN THE SOFTWARE. 23 | */ 24 | #ifndef __UTILS_H__ 25 | #define __UTILS_H__ 26 | 27 | #ifdef __cplusplus 28 | #define UTILS_LINKAGE "C" 29 | #else 30 | #define UTILS_LINKAGE 31 | #endif 32 | 33 | extern UTILS_LINKAGE void *Malloc(size_t sz); 34 | extern UTILS_LINKAGE void Free(void **ptr); 35 | extern UTILS_LINKAGE void *Realloc(void *ptr, size_t sz); 36 | extern UTILS_LINKAGE FILE *Fopen(const char *path, const char *mode); 37 | extern UTILS_LINKAGE size_t Fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); 38 | extern UTILS_LINKAGE size_t Fread(void *ptr, size_t size, size_t nmemb, FILE *stream); 39 | extern UTILS_LINKAGE int Remove(const char *pathname); 40 | extern UTILS_LINKAGE off_t getFsize(const char *fpath); 41 | extern UTILS_LINKAGE double Wtime(void); 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /optimized/cuBlumeCapel/vmm_alloc.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include "vmm_alloc.h" 27 | 28 | #define MIN(x,y) (((x)<(y))?(x):(y)) 29 | #define MAX(x,y) (((x)>(y))?(x):(y)) 30 | 31 | #define DIV_UP(a,b) (((a)+((b)-1))/(b)) 32 | 33 | #define MAX_DEVICE_NAME (256) 34 | 35 | #define CHECK_CUDA(call) { \ 36 | cudaError_t err = call; \ 37 | if( cudaSuccess != err) { \ 38 | fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ 39 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 40 | exit(EXIT_FAILURE); \ 41 | }} 42 | 43 | #define CHECK_CU(call) { \ 44 | CUresult res = call; \ 45 | if(CUDA_SUCCESS != res) { \ 46 | const char *errstr=NULL; \ 47 | cuGetErrorName(res, &errstr); \ 48 | fprintf(stderr, "Cuda driver API error in file '%s' in line %d: %s.\n", \ 49 | __FILE__, __LINE__, errstr); \ 50 | exit(EXIT_FAILURE); \ 51 | }} 52 | 53 | static void *Malloc(size_t sz) { 54 | 55 | void *ptr; 56 | 57 | ptr = (void *)malloc(sz); 58 | if (!ptr) { 59 | fprintf(stderr, "Cannot allocate %zu bytes...\n", sz); 60 | exit(EXIT_FAILURE); 61 | } 62 | return ptr; 63 | } 64 | 65 | size_t vmmFabricGranularity(int device) { 66 | 67 | CUmemAllocationProp prop = {}; 68 | 69 | prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; 70 | prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; 71 | prop.location.id = device; 72 | 73 | // necessary to export the handle for remote memory access via NVLink 74 | prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; 75 | 76 | size_t granularity = 0; 77 | CHECK_CU(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); 78 | 79 | return granularity; 80 | } 81 | 82 | // call to "allocate" physical memory (cuMemCreate() handle) on GPU "device" 83 | // On entry size contains de desired size of the allocation; on exit the actual 84 | // size, which must be a multiple of the granularity 85 | static CUmemGenericAllocationHandle allocatePhysicalMemory(int device, size_t size) { 86 | 87 | CUmemAllocationProp prop = {}; 88 | 89 | 
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; 90 | prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; 91 | prop.location.id = device; 92 | 93 | // necessary to export the handle for remote memory access via NVLink 94 | prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; 95 | 96 | size_t granularity = 0; 97 | CHECK_CU(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); 98 | 99 | if (size % granularity) { 100 | 101 | cudaDeviceProp props; 102 | CHECK_CUDA(cudaGetDeviceProperties(&props, device)); 103 | 104 | int nameLen; 105 | char procName[MPI_MAX_PROCESSOR_NAME]; 106 | MPI_Get_processor_name(procName, &nameLen); 107 | 108 | fprintf(stderr, 109 | "%s:%d: error, requested allocation size (%zu bytes) is " 110 | "not a multiple of minimum supported granularity (%zu bytes) " 111 | "for device %d (%s) on node %s!\n", 112 | __func__, __LINE__, size, granularity, device, props.name, procName); 113 | MPI_Abort(MPI_COMM_WORLD, 0); 114 | } 115 | 116 | // Ensure size matches granularity requirements for the allocation 117 | //size_t padded_size = DIV_UP(size, granularity)*granularity; 118 | #if 0 119 | printf("%s:%d: device %d, padded_size: %zu\n", __func__, __LINE__, device, padded_size); 120 | #endif 121 | // Allocate physical memory 122 | CUmemGenericAllocationHandle allocHandle; 123 | 124 | //printf("device: %d, size: %zu\n", device, size); 125 | CHECK_CU(cuMemCreate(&allocHandle, size, &prop, 0)); 126 | 127 | return allocHandle; 128 | } 129 | 130 | static void setAccessOnDevice(int device, CUdeviceptr ptr, size_t size) { 131 | 132 | CUmemAccessDesc accessDesc = {}; 133 | 134 | accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; 135 | accessDesc.location.id = device; 136 | accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; 137 | 138 | //printf("device: %d\n", device); 139 | 140 | // Make the address accessible 141 | CHECK_CU(cuMemSetAccess(ptr, size, &accessDesc, 1)); 142 | 143 | return; 144 | } 145 | 146 | vmmAllocCtx_t 
*vmmFabricMalloc(void **devPtr, size_t sizePerGpu) { 147 | 148 | int inited = 0; 149 | MPI_Initialized(&inited); 150 | 151 | if (!inited) { 152 | fprintf(stderr, 153 | "%s:%d: error, MPI must be initialized before calling this function!\n", 154 | __func__, __LINE__); 155 | exit(EXIT_FAILURE); 156 | } 157 | 158 | int rank, ntask; 159 | 160 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 161 | MPI_Comm_size(MPI_COMM_WORLD, &ntask); 162 | 163 | char (*procNames)[MPI_MAX_PROCESSOR_NAME] = (char (*)[MPI_MAX_PROCESSOR_NAME])Malloc(sizeof(*procNames)*ntask); 164 | int nameLen; 165 | MPI_Get_processor_name(procNames[rank], &nameLen); 166 | MPI_Gather(procNames[rank], MPI_MAX_PROCESSOR_NAME, MPI_CHAR, procNames, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0, MPI_COMM_WORLD); 167 | 168 | int ndev = 0; 169 | CHECK_CUDA(cudaGetDeviceCount(&ndev)); 170 | 171 | int ndev_or; 172 | int ndev_and; 173 | MPI_Allreduce(&ndev, &ndev_or, 1, MPI_INT, MPI_BOR, MPI_COMM_WORLD); 174 | MPI_Allreduce(&ndev, &ndev_and, 1, MPI_INT, MPI_BAND, MPI_COMM_WORLD); 175 | if (ndev_or != ndev_and) { 176 | if (!rank) { 177 | fprintf(stderr, 178 | "%s:%d: error, not all processes have the same number of GPUs!\n", 179 | __func__, __LINE__); 180 | } 181 | MPI_Abort(MPI_COMM_WORLD, 0); 182 | } 183 | 184 | // local GPUs 185 | cudaDeviceProp *props = (cudaDeviceProp *)Malloc(sizeof(*props)*ndev); 186 | for(int i = 0; i < ndev; i++) { 187 | CHECK_CUDA(cudaGetDeviceProperties(props+i, i)); 188 | } 189 | 190 | // check local GPUs support 191 | for(int i = 0; i < ndev; i++) { 192 | 193 | int deviceSupportsVmm; 194 | CHECK_CU(cuDeviceGetAttribute(&deviceSupportsVmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, i)); 195 | if (!deviceSupportsVmm) { 196 | fprintf(stderr, 197 | "%s:%d: error, device %d (%s) on node %s does NOT support Virtual Memory Management!\n", 198 | __func__, __LINE__, i, props[i].name, procNames[rank]); 199 | MPI_Abort(MPI_COMM_WORLD, 0); 200 | } 201 | 202 | // FOR FABRIC 203 | int 
deviceSupportsFabricMem; 204 | CHECK_CU(cuDeviceGetAttribute(&deviceSupportsFabricMem, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, i)); 205 | if (deviceSupportsFabricMem == 0) { 206 | fprintf(stderr, 207 | "%s:%d: error, device %d (%s) on node %s does NOT support Fabric Handles!\n", 208 | __func__, __LINE__, i, props[i].name, procNames[rank]); 209 | MPI_Abort(MPI_COMM_WORLD, 0); 210 | } 211 | } 212 | 213 | // check that all GPUs are of the same kind (this may be relaxed) 214 | cudaDeviceProp *props_all = NULL; 215 | if (!rank) { 216 | props_all = (cudaDeviceProp *)Malloc(sizeof(*props)*ntask*ndev); 217 | } 218 | 219 | MPI_Datatype MPI_DEV_PROP; 220 | MPI_Type_contiguous(sizeof(cudaDeviceProp), MPI_BYTE, &MPI_DEV_PROP); 221 | MPI_Type_commit(&MPI_DEV_PROP); 222 | 223 | MPI_Gather(props, ndev, MPI_DEV_PROP, props_all, ndev, MPI_DEV_PROP, 0, MPI_COMM_WORLD); 224 | 225 | if (!rank) { 226 | for(int i = 1; i < ntask*ndev; i++) { 227 | if (strncmp(props_all[i-1].name, props_all[i].name, MAX_DEVICE_NAME)) { 228 | fprintf(stderr, 229 | "%s:%d: error, device %d from proc %d (%s) and " 230 | "device %d from proc %d (%s) are different:\n" 231 | "\t%s\n\t%s\n", 232 | __func__, __LINE__, 233 | (i-1)%ndev, (i-1)/ndev, procNames[(i-1)/ndev], 234 | i %ndev, i /ndev, procNames[ i /ndev], 235 | props_all[i-1].name, props_all[i].name); 236 | MPI_Abort(MPI_COMM_WORLD, 0); 237 | } 238 | } 239 | } 240 | free(props); 241 | free(props_all); 242 | 243 | // allocate local handles 244 | CUmemGenericAllocationHandle *handles = (CUmemGenericAllocationHandle *)Malloc(sizeof(*handles)*ntask*ndev); 245 | memset(handles, 0, sizeof(*handles)*ntask*ndev); 246 | 247 | for(int i = 0; i < ndev; i++) { 248 | handles[rank*ndev + i] = allocatePhysicalMemory(i, sizePerGpu); 249 | } 250 | 251 | // export local handles 252 | CUmemFabricHandle *fabricHandles = (CUmemFabricHandle *)Malloc(sizeof(*fabricHandles)*ntask*ndev); 253 | memset(fabricHandles, 0, sizeof(*fabricHandles)*ntask*ndev); 254 | for(int i 
= 0; i < ndev; i++) { 255 | //printf("CU_MEM_HANDLE_TYPE_FABRIC: %d, CU_MEM_HANDLE_TYPE_MAX: %d\n", CU_MEM_HANDLE_TYPE_FABRIC, CU_MEM_HANDLE_TYPE_MAX); 256 | CHECK_CU(cuMemExportToShareableHandle(&fabricHandles[ndev*rank + i], 257 | handles[ndev*rank + i], 258 | CU_MEM_HANDLE_TYPE_FABRIC, 0)); 259 | } 260 | 261 | // distribute local handles 262 | MPI_Datatype MPI_FABRIC_HANDLE; 263 | MPI_Type_contiguous(sizeof(CUmemFabricHandle), MPI_BYTE, &MPI_FABRIC_HANDLE); 264 | MPI_Type_commit(&MPI_FABRIC_HANDLE); 265 | 266 | MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, fabricHandles, ndev, MPI_FABRIC_HANDLE, MPI_COMM_WORLD); 267 | 268 | // import remote handles 269 | for(int i = 0; i < ntask; i++) { 270 | if (i == rank) { 271 | continue; 272 | } 273 | for(int d = 0; d < ndev; d++) { 274 | CHECK_CU(cuMemImportFromShareableHandle(&handles[i*ndev + d], 275 | &fabricHandles[i*ndev + d], 276 | CU_MEM_HANDLE_TYPE_FABRIC)); 277 | } 278 | } 279 | // this can now be removed? 280 | free(fabricHandles); 281 | 282 | // create a (large) Virtual Address range and map local and remote handles 283 | const size_t totalSize = sizePerGpu*size_t(ntask)*size_t(ndev); 284 | 285 | CUdeviceptr cuptr; 286 | CHECK_CU(cuMemAddressReserve(&cuptr, totalSize, 0, 0, 0)); 287 | 288 | for(size_t i = 0; i < ntask; i++) { 289 | for(size_t d = 0; d < ndev; d++) { 290 | CHECK_CU(cuMemMap(cuptr + i*sizePerGpu*ndev + d*sizePerGpu, 291 | sizePerGpu, 0, handles[i*ndev + d], 0)); 292 | } 293 | } 294 | 295 | for(int d = 0; d < ndev; d++) { 296 | setAccessOnDevice(d, cuptr, totalSize); //sizePerGpu*ntask*ndev); 297 | } 298 | 299 | 300 | free(procNames); 301 | 302 | vmmAllocCtx_t *ctx = (vmmAllocCtx_t *)Malloc(sizeof(*ctx)); 303 | 304 | ctx->cuptr = cuptr; 305 | ctx->virtAddrRangeSize = totalSize; 306 | 307 | ctx->handles = handles; 308 | 309 | *devPtr = (void *)cuptr; 310 | 311 | return ctx; 312 | } 313 | 314 | void vmmFabricFree(vmmAllocCtx_t *ctx) { 315 | 316 | int ndev = 0; 317 | 
CHECK_CUDA(cudaGetDeviceCount(&ndev)); 318 | 319 | int rank, ntask; 320 | 321 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 322 | MPI_Comm_size(MPI_COMM_WORLD, &ntask); 323 | 324 | CHECK_CU(cuMemUnmap(ctx->cuptr, ctx->virtAddrRangeSize)); 325 | 326 | for(int i = 0; i < ntask; i++) { 327 | for(int d = 0; d < ndev; d++) { 328 | CHECK_CU(cuMemRelease(ctx->handles[i*ndev + d])); 329 | } 330 | } 331 | CHECK_CU(cuMemAddressFree(ctx->cuptr, ctx->virtAddrRangeSize)); 332 | 333 | free(ctx->handles); 334 | free(ctx); 335 | 336 | return; 337 | } 338 | -------------------------------------------------------------------------------- /optimized/cuBlumeCapel/vmm_alloc.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | #ifndef __VMM_ALLOC_H__ 23 | #define __VMM_ALLOC_H__ 24 | 25 | typedef struct { 26 | 27 | CUdeviceptr cuptr; 28 | size_t virtAddrRangeSize; 29 | 30 | CUmemGenericAllocationHandle *handles; 31 | 32 | } vmmAllocCtx_t; 33 | 34 | #ifdef __cplusplus 35 | extern "C" { 36 | #endif 37 | 38 | // helper to obtain the minimum size for fabric allocations 39 | size_t vmmFabricGranularity(int device); 40 | 41 | // Allocates sizePerGPU bytes on each device 42 | // visible to each MPI rank and return to 43 | // each caller the starting address of a 44 | // Virtual Address range to which all the 45 | // allocations are mapped. Mappings are 46 | // performed in Rank,DeviceId order: 47 | // 48 | // , , 49 | // , , 50 | // ... 51 | // , , 52 | // 53 | // Remote memories are accessed via FABRIC handles. 54 | // 55 | // Requirements: 56 | // * all ranks must have access to the same number of GPUs; 57 | // * all the GPUs must be the same type; 58 | vmmAllocCtx_t *vmmFabricMalloc(void **devPtr, size_t sizePerGpu); 59 | 60 | void vmmFabricFree(vmmAllocCtx_t *ctx); 61 | 62 | #ifdef __cplusplus 63 | } 64 | #endif 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /optimized/cuIsingModel/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 NVIDIA Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /optimized/cuIsingModel/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_HOME = /opt/cuda-12.8.1 2 | CUDACC = $(CUDA_HOME)/bin/nvcc 3 | CC = gcc 4 | LD = $(CUDACC) 5 | 6 | CFLAGS = -c -O3 -g -I$(CUDA_HOME)/include 7 | 8 | SMS ?= 89 9 | CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM)) 10 | 11 | CUDACFLAGS = -c -O3 -lineinfo $(CUDA_ARCH) -Xptxas=-v 12 | LDFLAGS = -Xcompiler=-fopenmp -lquadmath 13 | 14 | C_SRCS = utils.c 15 | CU_SRCS = main.cu 16 | 17 | ifdef USE_MNNVL 18 | $(info Compiling with MNNVL support...) 
19 | MPI_HOME = /cm/shared/apps/openmpi/4.1.5 20 | 21 | CUDACFLAGS += -I$(MPI_HOME)/include -DUSE_MNNVL 22 | LDFLAGS += -L$(MPI_HOME)/lib -lcuda -lmpi 23 | CU_SRCS += vmm_alloc.cu 24 | endif 25 | 26 | C_OBJS = $(patsubst %.c, %.o, $(C_SRCS)) 27 | CU_OBJS = $(patsubst %.cu, %.o, $(CU_SRCS)) 28 | 29 | all: cuIsing 30 | 31 | cuIsing: $(CU_OBJS) $(C_OBJS) 32 | $(LD) -o cuIsing $(CU_OBJS) $(C_OBJS) $(LDFLAGS) 33 | 34 | %.o: %.cu 35 | $(CUDACC) $(CUDACFLAGS) $< 36 | 37 | %.o: %.c 38 | $(CC) $(CFLAGS) $< -o $@ 39 | 40 | clean: 41 | -@rm -f *.o cuIsing *.sass &> /dev/null || true 42 | -------------------------------------------------------------------------------- /optimized/cuIsingModel/README.md: -------------------------------------------------------------------------------- 1 | # A CUDA implementation for the Ising model supporting Multi-Node NVLink 2 | 3 | A high performance Ising model implementation for GPU. The code can run on 4 | multiple GPUs connected to the same node or on multiple nodes connected via 5 | NVLink (MNNVL). 6 | 7 | To compile the code to run on single node, adjust the Makefile to point to your CUDA 8 | installation, specify the CUDA architecture you want to compile for and then 9 | run `make`. That should be enough to produce the ``cuIsing`` binary. 10 | 11 | For multi-node, in addition to the Makefile adjustment above, also modify it to 12 | point to your MPI installation and then compile it with `make USE_MNNVL=1`. 13 | 14 | When running on a single node, the code uses managed memory. On multiple nodes 15 | with MNNVL, it uses [fabric memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#fabric-memory). 16 | 17 | When more than one GPU is used, the spin system is partitioned vertically. 18 | 19 | ## Usage 20 | 21 |
 22 | Usage: cuIsing [options]
 23 | options:
 24 |         -x|--x <HORIZ_DIM>
 25 |                 Specifies the horizontal dimension of the entire  lattice  (black+white  spins).
 26 |                 This dimension must be a multiple of 4096.
 27 | 
 28 |         -y|--y <VERT_DIM>
 29 |                 Specifies the vertical dimension of the per-GPU lattice.  This dimension must be
 30 |                 a multiple of 16.
 31 | 
 32 |         -n|--n <NSTEPS>
 33 |                 Specifies the number of iterations to run.
 34 |                 Default: 1
 35 | 
 36 |         -g|--gpus <NUM_DEVICES>
 37 |                 Specifies the number of GPUs to use. Will use devices with ids [0, NUM_DEVS-1].
 38 |                 Default: 1.
 39 | 
 40 |         -s|--seed <SEED>
 41 |                 Specifies the seed used to generate random numbers.
 42 |                 Default: 463463564571
 43 | 
 44 |         -a|--alpha <ALPHA>
 45 |                 Specifies the temperature in T_CRIT units.  If both this  option  and  '-t'  are
 46 |                 specified then the '-t' option is used.
 47 |                 Default: 0.100000
 48 | 
 49 |         -t|--temp <TEMP_0>[[,<IT_1>:<TEMP_1>]...]
 50 |                 Specifies the temperature(s), in absolute  units.   It  is  possible  to  use  a
 51 |                 temperature-changing   protocol   by   specifying   a   sequence   of    couples
 52 |                 <IT_i>:<TEMP_i> after the first temperature <TEMP_0>. The value <IT_i> specifies
 53 |                 the time step at which the temperature  changes  from  <TEMP_i-1>  to  <TEMP_i>.
 54 |                 Temperature <TEMP_0> is the starting temperature and thus  does  not  require  a
 55 |                 time step specification. 
 56 |                 Default: 0.226919
 57 | 
 58 |         -p|--print <STAT_FREQ>
 59 |                 Specifies the frequency, in no.  of  iterations, with  which  the  magnetization
 60 |                 statistics is printed. If this option is used with --pexp, this option is ignored.
 61 |                 Default: only at the beginning and at end of the simulation
 62 | 
 63 |         --pexp
 64 |                 Prints statistics every power-of-2 time steps.  This  option  overrides  the  -p
 65 |                 option.
 66 |                 Default: disabled
 67 | 
 68 |         -c|--corr <CORR_FILE_PATH>
 69 |                 Enables correlation and writes to file CORR_FILE_PATH  the  correlation of  each
 70 |                 point with the vertical and horizontal  neighbors at distance r <= 256.   Beyond
 71 |                 that, distances are chosen according to an exponential rule, with 32 values  per
 72 |                 power of 2.  The  correlation  is  computed  every  time  the  magnetization  is
 73 |                 printed on screen (based  on  either  the  '-p'  or  '-e'  options)  and  it  is
 74 |                 written in the  file one line per measure.
 75 |                 Default: full correlation (see --corrfull option)
 76 | 
 77 |         --corrfull
 78 |                 Compute the correlation for each spin in the system.
 79 | 
 80 |         --corrdiag
 81 |                 Compute the correlation only for diagonal spins.
 82 | 
 83 |         --corrchkb
 84 |                 Computes the correlation for only one spin (the top-left one)  for each block of
 85 |                 16x16 spins (checkerboard pattern).
 86 | 
 87 |         --corrmixd
 88 |                 Computes the correlation using a mix of full and checkerboard modes.   The  full
 89 |                 correlation is used for  all distances  r <= 32. Then,  for each spin in a 16x16
 90 |                 square, it is computed for each r > 32.
 91 | 
 92 |         --writechkp <CHECKPOINT_FILE_PATH>
 93 |                 Enables write of checkpoint file at the end of the simulation.  The file can  be
 94 |                 later used to resume the simulation with the '-r' option.  This option and  '-r'
 95 |                 can be used together to break down a  large  run  into  multiple  smaller  runs.
 96 | 
 97 |         --readchkp <CHECKPOINT_FILE_PATH>
 98 |                 Enables the restart of a simulation from the state in a checkpoint file.  Please
 99 |                 note that in order for that to work,  the  non-checkpoint  command-line  options
100 |                 used in the run where the checkpoint file was created must match with those used
101 |                 in the run where the checkpoint file is read.  This option and '-w' can be  used
102 |                 together  to  break   down   a   large   run   into   multiple   smaller   runs.
103 | 
104 |         -o|--o
105 |                 Enables the file dump of  the lattice  every time  the magnetization is printed.
106 |                 Default: off
107 | 
108 | 109 | For example, to run 102400 steps on a 16384^2 lattice using one GPU, using temperature 1.5 and 110 | printing the statistics every 10240 steps: 111 | 112 |
113 | $ ./cuIsing -y 16384 -x 16384 -n 102400 -p 10240 -t 1.5
114 | 
115 | Using GPUs:
116 |          0 (NVIDIA RTX 6000 Ada Generation, 48 GB, 142 SMs, 1536 th/SM max, CC 8.9, ECC off)
117 | 
118 | Run configuration:
119 |         word size: 16
120 |         bits per spin: 1 (mask: 0x1)
121 |         spins/word: 128
122 |         spins: 268435456 (~2.68E+08)
123 |         seed: 463463564571
124 |         block size (X, Y): 16, 16
125 |         tile  size (X, Y): 16, 16
126 |         grid  size (X, Y): 4, 1024
127 |         spins per tile (X, Y): 2048, 2048
128 | 
129 |         iterations:
130 |                 beg: 1
131 |                 end: 102400
132 |                 tot: 102400
133 | 
134 |         print stats every 10240 steps
135 |         temp: 1.5 (0.661030190265538*T_crit)
136 | 
137 |         local lattice size:         16384 x    16384 spins
138 |         local lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
139 | 
140 |         total lattice size:         16384 x    16384 spins
141 |         total lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
142 | 
143 |         total memory: 0.03 GB (0.03 GB per GPU)
144 | 
145 |         random-bit table:
146 |                 size of element: 32-bit
147 |                 no. of elements: 16
148 |                 bits per lookup: 4
149 | 
150 | Setting up GPUs:
151 |         GPU  0 done in 0.001597 secs
152 | 
153 | Initializing spin lattice... done in 0.011790 secs
154 | 
155 | Running simulation...
156 | 
157 |         Step          Magn.          N(-1)           N(1)     SD value     flips/ns         GB/s          ERT
158 | 
159 |            0   8.381903E-05      134206478      134228978    16.936372
160 |        10240   5.389421E-02      141451286      126984170     1.000717      1399.28       527.46       19.65s
161 |        20480   6.544993E-02      143002269      125433187     0.999917      1392.13       524.77       19.70s
162 |        30720   7.027917E-02      143650439      124785017     1.000416      1387.92       523.18       19.74s
163 |        40960   7.348213E-02      144080332      124355124     0.998606      1385.64       522.32       19.76s
164 |        51200   7.878675E-02      144792307      123643149     1.000069      1385.46       522.25       19.78s
165 |        61440   8.068839E-02      145047541      123387915     0.997942      1384.75       521.99       19.79s
166 |        71680   7.845285E-02      144747491      123687965     1.000395      1383.86       521.65       19.80s
167 |        81920   7.937136E-02      144870771      123564685     1.000686      1378.90       519.78       19.82s
168 |        92160   7.773913E-02      144651698      123783758     0.998647      1375.31       518.43       19.84s
169 |       102400   8.023911E-02      144987239      123448217     1.000491      1371.54       517.00       19.86s
170 | 
171 | Final energy: -1.949967
172 | 
173 | Done in 1.986138E+04 ms (stats overhead: 0.05%, spins/ns: 1383.98, BW: 521.70 GB/s)
174 | 
175 | 176 | Run 307200 steps on a 16384^2 lattice using one GPU, in three distinct runs 177 | each of 102400 steps using checkpointing: 178 | 179 |
180 | $ ./cuIsing -y 16384 -x 16384 -n 102400 -p 10240 -t 1.5 -w chkpfile
181 | 
182 | Using GPUs:
183 |          0 (NVIDIA RTX 6000 Ada Generation, 48 GB, 142 SMs, 1536 th/SM max, CC 8.9, ECC off)
184 | 
185 | Run configuration:
186 |         word size: 16
187 |         bits per spin: 1 (mask: 0x1)
188 |         spins/word: 128
189 |         spins: 268435456 (~2.68E+08)
190 |         seed: 463463564571
191 |         block size (X, Y): 16, 16
192 |         tile  size (X, Y): 16, 16
193 |         grid  size (X, Y): 4, 1024
194 |         spins per tile (X, Y): 2048, 2048
195 | 
196 |         iterations:
197 |                 beg: 1
198 |                 end: 102400
199 |                 tot: 102400
200 | 
201 |         print stats every 10240 steps
202 |         temp: 1.5 (0.661030190265538*T_crit)
203 | 
204 |         local lattice size:         16384 x    16384 spins
205 |         local lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
206 | 
207 |         total lattice size:         16384 x    16384 spins
208 |         total lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
209 | 
210 |         total memory: 0.03 GB (0.03 GB per GPU)
211 | 
212 |         random-bit table:
213 |                 size of element: 32-bit
214 |                 no. of elements: 16
215 |                 bits per lookup: 4
216 | 
217 | Setting up GPUs:
218 |         GPU  0 done in 0.001700 secs
219 | 
220 | Initializing spin lattice... done in 0.012194 secs
221 | 
222 | Running simulation...
223 | 
224 |         Step          Magn.          N(-1)           N(1)     SD value     flips/ns         GB/s          ERT
225 | 
226 |            0   8.381903E-05      134206478      134228978    16.936372
227 |        10240   5.389421E-02      141451286      126984170     1.000717      1351.59       509.49       20.34s
228 |        20480   6.544993E-02      143002269      125433187     0.999917      1352.59       509.86       20.34s
229 |        30720   7.027917E-02      143650439      124785017     1.000416      1347.67       508.01       20.36s
230 |        40960   7.348213E-02      144080332      124355124     0.998606      1349.08       508.54       20.36s
231 |        51200   7.878675E-02      144792307      123643149     1.000069      1351.91       509.61       20.36s
232 |        61440   8.068839E-02      145047541      123387915     0.997942      1355.41       510.93       20.35s
233 |        71680   7.845285E-02      144747491      123687965     1.000395      1353.09       510.05       20.34s
234 |        81920   7.937136E-02      144870771      123564685     1.000686      1352.15       509.70       20.34s
235 |        92160   7.773913E-02      144651698      123783758     0.998647      1347.46       507.93       20.35s
236 |       102400   8.023911E-02      144987239      123448217     1.000491      1345.72       507.27       20.36s
237 | 
238 | Final energy: -1.949967
239 | 
240 | Done in 2.035810E+04 ms (stats overhead: 0.05%, spins/ns: 1350.21, BW: 508.97 GB/s)
241 | 
242 | Writing checkpoint to file chkpfile... done in 0.083085 secs
243 | 
244 |
245 | $ ./cuIsing -y 16384 -x 16384 -n 102400 -p 10240 -t 1.5 -w chkpfile -r chkpfile
246 | 
247 | Using GPUs:
248 |          0 (NVIDIA RTX 6000 Ada Generation, 48 GB, 142 SMs, 1536 th/SM max, CC 8.9, ECC off)
249 | 
250 | Reading checkpoint from file chkpfile... done in 0.010425 secs
251 | 
252 | Run configuration:
253 |         word size: 16
254 |         bits per spin: 1 (mask: 0x1)
255 |         spins/word: 128
256 |         spins: 268435456 (~2.68E+08)
257 |         seed: 463463564571
258 |         block size (X, Y): 16, 16
259 |         tile  size (X, Y): 16, 16
260 |         grid  size (X, Y): 4, 1024
261 |         spins per tile (X, Y): 2048, 2048
262 | 
263 |         iterations:
264 |                 beg: 102401
265 |                 end: 204800
266 |                 tot: 102400
267 | 
268 |         print stats every 10240 steps
269 |         temp: 1.5 (0.661030190265538*T_crit)
270 | 
271 |         local lattice size:         16384 x    16384 spins
272 |         local lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
273 | 
274 |         total lattice size:         16384 x    16384 spins
275 |         total lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
276 | 
277 |         total memory: 0.03 GB (0.03 GB per GPU)
278 | 
279 |         random-bit table:
280 |                 size of element: 32-bit
281 |                 no. of elements: 16
282 |                 bits per lookup: 4
283 | 
284 | Setting up GPUs:
285 |         GPU  0 done in 0.003768 secs
286 | 
287 | Running simulation...
288 | 
289 |         Step          Magn.          N(-1)           N(1)     SD value     flips/ns         GB/s          ERT
290 | 
291 |       102400   8.023911E-02      144987239      123448217     1.000491
292 |       112640   8.427709E-02      145529207      122906249     0.999487      1369.15       516.10       20.08s
293 |       122880   8.961894E-02      146246178      122189278     1.001249      1362.94       513.76       20.13s
294 |       133120   8.933730E-02      146208378      122227078     0.999772      1356.70       511.41       20.17s
295 |       143360   8.894347E-02      146155518      122279938     1.000053      1356.84       511.46       20.20s
296 |       153600   8.961185E-02      146245227      122190229     1.000030      1352.37       509.78       20.22s
297 |       163840   8.997627E-02      146294138      122141318     0.999970      1352.44       509.81       20.24s
298 |       174080   8.834548E-02      146075257      122360199     1.000698      1352.11       509.68       20.26s
299 |       184320   8.784929E-02      146008660      122426796     1.000313      1349.95       508.87       20.27s
300 |       194560   9.042334E-02      146354143      122081313     1.000820      1348.24       508.22       20.28s
301 |       204800   9.108921E-02      146443515      121991941     1.000014      1346.63       507.62       20.30s
302 | 
303 | Final energy: -1.950272
304 | 
305 | Done in 2.029726E+04 ms (stats overhead: 0.05%, spins/ns: 1354.26, BW: 510.49 GB/s)
306 | 
307 | Writing checkpoint to file chkpfile... done in 0.082859 secs
308 | 
309 |
310 | $ ./cuIsing -y 16384 -x 16384 -n 102400 -p 10240 -t 1.5 -r chkpfile
311 | 
312 | Using GPUs:
313 |          0 (NVIDIA RTX 6000 Ada Generation, 48 GB, 142 SMs, 1536 th/SM max, CC 8.9, ECC off)
314 | 
315 | Reading checkpoint from file chkpfile... done in 0.010423 secs
316 | 
317 | Run configuration:
318 |         word size: 16
319 |         bits per spin: 1 (mask: 0x1)
320 |         spins/word: 128
321 |         spins: 268435456 (~2.68E+08)
322 |         seed: 463463564571
323 |         block size (X, Y): 16, 16
324 |         tile  size (X, Y): 16, 16
325 |         grid  size (X, Y): 4, 1024
326 |         spins per tile (X, Y): 2048, 2048
327 | 
328 |         iterations:
329 |                 beg: 204801
330 |                 end: 307200
331 |                 tot: 102400
332 | 
333 |         print stats every 10240 steps
334 |         temp: 1.5 (0.661030190265538*T_crit)
335 | 
336 |         local lattice size:         16384 x    16384 spins
337 |         local lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
338 | 
339 |         total lattice size:         16384 x    16384 spins
340 |         total lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
341 | 
342 |         total memory: 0.03 GB (0.03 GB per GPU)
343 | 
344 |         random-bit table:
345 |                 size of element: 32-bit
346 |                 no. of elements: 16
347 |                 bits per lookup: 4
348 | 
349 | Setting up GPUs:
350 |         GPU  0 done in 0.003810 secs
351 | 
352 | Running simulation...
353 | 
354 |         Step          Magn.          N(-1)           N(1)     SD value     flips/ns         GB/s          ERT
355 | 
356 |       204800   9.108921E-02      146443515      121991941     1.000014
357 |       215040   8.998523E-02      146295341      122140115     0.999673      1354.50       510.58       20.30s
358 |       225280   8.892218E-02      146152661      122282795     0.999000      1344.27       506.73       20.38s
359 |       235520   9.020317E-02      146324593      122110863     1.000224      1343.46       506.42       20.41s
360 |       245760   9.139725E-02      146484859      121950597     0.999815      1342.54       506.07       20.43s
361 |       256000   9.055272E-02      146371509      122063947     0.999528      1341.84       505.81       20.44s
362 |       266240   8.986650E-02      146279405      122156051     1.000316      1339.66       504.99       20.45s
363 |       276480   9.154957E-02      146505303      121930153     1.001214      1335.71       503.50       20.47s
364 |       286720   9.230582E-02      146606805      121828651     0.999690      1336.05       503.63       20.49s
365 |       296960   9.236395E-02      146614608      121820848     0.998615      1333.45       502.65       20.50s
366 |       307200   9.218215E-02      146590207      121845249     1.000438      1332.56       502.31       20.51s
367 | 
368 | Final energy: -1.950339
369 | 
370 | Done in 2.051432E+04 ms (stats overhead: 0.05%, spins/ns: 1339.93, BW: 505.09 GB/s)
371 | 
372 | 373 | 374 | To run 128 steps on a 2^20x2^20 lattice using 8 H100 GPUs: 375 | 376 |
377 | $ ./cuIsing -y $((2**20 / 8)) -x $((2**20)) -n 128 -p 128 -t 1.5 -g 8
378 | 
379 | Using GPUs:
380 | 	 0 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
381 | 	 1 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
382 | 	 2 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
383 | 	 3 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
384 | 	 4 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
385 | 	 5 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
386 | 	 6 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
387 | 	 7 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
388 | 
389 | Run configuration:
390 | 	word size: 16
391 | 	bits per spin: 1 (mask: 0x1)
392 | 	spins/word: 128
393 | 	spins: 1099511627776 (~1.10E+12)
394 | 	seed: 463463564571
395 | 	block size (X, Y): 16, 16
396 | 	tile  size (X, Y): 16, 16
397 | 	grid  size (X, Y): 256, 8192
398 | 	spins per tile (X, Y): 2048, 2048
399 | 
400 | 	iterations:
401 | 		beg: 1
402 | 		end: 128
403 | 		tot: 128
404 | 
405 | 	print stats every 128 steps
406 | 	temp: 1.5 (0.661030190265538*T_crit)
407 | 
408 | 	local lattice size:        131072 x  1048576 spins
409 | 	local lattice shape: 2 x   131072 x     4096 ull2s (  1073741824 total)
410 | 
411 | 	total lattice size:       1048576 x  1048576 spins
412 | 	total lattice shape: 2 x  1048576 x     4096 ull2s (  8589934592 total)
413 | 
414 | 	total memory: 128.00 GB (16.00 GB per GPU)
415 | 
416 | 	random-bit table:
417 | 		size of element: 32-bit
418 | 		no. of elements: 16
419 | 		bits per lookup: 4
420 | 
421 | Setting up GPUs:
422 | 	GPU  0 done in 0.001748 secs
423 | 	GPU  1 done in 0.166805 secs
424 | 	GPU  2 done in 0.166164 secs
425 | 	GPU  3 done in 0.166996 secs
426 | 	GPU  4 done in 0.186960 secs
427 | 	GPU  5 done in 0.187743 secs
428 | 	GPU  6 done in 0.182130 secs
429 | 	GPU  7 done in 0.192766 secs
430 | 
431 | Initializing spin lattice... done in 3.404245 secs
432 | 
433 | Running simulation...
434 | 
435 |         Step          Magn.        N(-1)         N(1)     SD value     flips/ns         GB/s          ERT
436 | 
437 |            0   7.547405E-07 549755398965 549756228811    16.936123
438 |          128   3.269196E-05 549737841294 549773786482     1.000580     10306.30      3867.38       13.78s
439 | 
440 | Final energy: -1.908699
441 | 
442 | Done in 1.377803E+04 ms (stats overhead: 0.90%, spins/ns: 10214.63, BW: 3832.98 GB/s)
443 | 
444 | 445 | ## Contacts 446 | 447 | For comments, questions or anything related, write to Mauro Bisson at maurob@nvidia.com. 448 | -------------------------------------------------------------------------------- /optimized/cuIsingModel/cudamacro.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | #ifndef __CUDA_MACRO_H__ 23 | #define __CUDA_MACRO_H__ 24 | 25 | #define CHECK_CUDA(call) { \ 26 | cudaError_t err = call; \ 27 | if( cudaSuccess != err) { \ 28 | fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ 29 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 30 | exit(EXIT_FAILURE); \ 31 | }} 32 | 33 | #define CHECK_ERROR(errorMessage) { \ 34 | cudaError_t err = cudaGetLastError(); \ 35 | if( cudaSuccess != err) { \ 36 | fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ 37 | errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ 38 | exit(EXIT_FAILURE); \ 39 | }} 40 | #endif 41 | -------------------------------------------------------------------------------- /optimized/cuIsingModel/utils.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Mauro Bisson 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a 7 | * copy of this software and associated documentation files (the "Software"), 8 | * to deal in the Software without restriction, including without limitation 9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | * and/or sell copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | * DEALINGS IN THE SOFTWARE. 23 | */ 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | void *Malloc(size_t sz) { 34 | 35 | void *ptr; 36 | 37 | if (!sz) { 38 | printf("Allocating zero bytes...\n"); 39 | exit(EXIT_FAILURE); 40 | } 41 | ptr = (void *)malloc(sz); 42 | if (!ptr) { 43 | fprintf(stderr, "Cannot allocate %zu bytes...\n", sz); 44 | exit(EXIT_FAILURE); 45 | } 46 | memset(ptr, 0, sz); 47 | return ptr; 48 | } 49 | 50 | void Free(void **ptr) { 51 | 52 | if (*ptr) { 53 | free(*ptr); 54 | *ptr = NULL; 55 | } 56 | return; 57 | } 58 | 59 | void *Realloc(void *ptr, size_t sz) { 60 | 61 | void *lp; 62 | 63 | if (!sz) { 64 | printf("Re-allocating to zero bytes, are you sure you want this?\n"); 65 | } 66 | lp = (void *)realloc(ptr, sz); 67 | if (!lp && sz) { 68 | fprintf(stderr, "Cannot reallocate to %zu bytes...\n", sz); 69 | exit(EXIT_FAILURE); 70 | } 71 | return lp; 72 | } 73 | 74 | FILE *Fopen(const char *path, const char *mode) { 75 | 76 | FILE *fp = NULL; 77 | fp = fopen(path, mode); 78 | if (!fp) { 79 | fprintf(stderr, "Cannot open file %s...\n", path); 80 | exit(EXIT_FAILURE); 81 | } 82 | return fp; 83 | } 84 | 85 | size_t Fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) { 86 | 87 | size_t wmemb=0; 88 | 89 | wmemb = fwrite(ptr, size, nmemb, stream); 90 | if (wmemb < nmemb) { 91 | fprintf(stderr, "Error while writing to file!\n"); 92 | exit(EXIT_FAILURE); 93 | } 94 | return wmemb; 95 | } 96 | 97 | size_t Fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { 98 | 99 | size_t rmemb=0; 100 | 101 | rmemb = fread(ptr, size, nmemb, stream); 102 | if (rmemb < nmemb && ferror(stream)) { 103 | fprintf(stderr, "Error while 
reading from file, could not read more than %zu elements!\n", rmemb); 104 | exit(EXIT_FAILURE); 105 | } 106 | return rmemb; 107 | } 108 | 109 | int Remove(const char *pathname) { 110 | 111 | int rv = remove(pathname); 112 | if (rv && errno != ENOENT) { 113 | fprintf(stderr, "Error removing file %s: %s\n", pathname, strerror(errno)); 114 | exit(EXIT_FAILURE); 115 | } 116 | return rv; 117 | } 118 | 119 | off_t getFsize(const char *fpath) { 120 | 121 | struct stat st; 122 | int rv; 123 | 124 | rv = stat(fpath, &st); 125 | if (rv) { 126 | fprintf(stderr, "Cannot stat file %s...\n", fpath); 127 | exit(EXIT_FAILURE); 128 | } 129 | return st.st_size; 130 | } 131 | 132 | double Wtime(void) { 133 | struct timespec tp; 134 | 135 | int rv = clock_gettime(CLOCK_MONOTONIC, &tp); 136 | if(rv) return 0; 137 | 138 | return tp.tv_nsec/1.0E+9 + (double)tp.tv_sec; 139 | } 140 | -------------------------------------------------------------------------------- /optimized/cuIsingModel/utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Mauro Bisson 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a 7 | * copy of this software and associated documentation files (the "Software"), 8 | * to deal in the Software without restriction, including without limitation 9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | * and/or sell copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 
15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | * DEALINGS IN THE SOFTWARE. 23 | */ 24 | #ifndef __UTILS_H__ 25 | #define __UTILS_H__ 26 | 27 | #ifdef __cplusplus 28 | #define UTILS_LINKAGE "C" 29 | #else 30 | #define UTILS_LINKAGE 31 | #endif 32 | 33 | extern UTILS_LINKAGE void *Malloc(size_t sz); 34 | extern UTILS_LINKAGE void Free(void **ptr); 35 | extern UTILS_LINKAGE void *Realloc(void *ptr, size_t sz); 36 | extern UTILS_LINKAGE FILE *Fopen(const char *path, const char *mode); 37 | extern UTILS_LINKAGE size_t Fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); 38 | extern UTILS_LINKAGE size_t Fread(void *ptr, size_t size, size_t nmemb, FILE *stream); 39 | extern UTILS_LINKAGE int Remove(const char *pathname); 40 | extern UTILS_LINKAGE off_t getFsize(const char *fpath); 41 | extern UTILS_LINKAGE double Wtime(void); 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /optimized/cuIsingModel/vmm_alloc.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include "vmm_alloc.h" 27 | 28 | #define MIN(x,y) (((x)<(y))?(x):(y)) 29 | #define MAX(x,y) (((x)>(y))?(x):(y)) 30 | 31 | #define DIV_UP(a,b) (((a)+((b)-1))/(b)) 32 | 33 | #define MAX_DEVICE_NAME (256) 34 | 35 | #define CHECK_CUDA(call) { \ 36 | cudaError_t err = call; \ 37 | if( cudaSuccess != err) { \ 38 | fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ 39 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 40 | exit(EXIT_FAILURE); \ 41 | }} 42 | 43 | #define CHECK_CU(call) { \ 44 | CUresult res = call; \ 45 | if(CUDA_SUCCESS != res) { \ 46 | const char *errstr=NULL; \ 47 | cuGetErrorName(res, &errstr); \ 48 | fprintf(stderr, "Cuda driver API error in file '%s' in line %d: %s.\n", \ 49 | __FILE__, __LINE__, errstr); \ 50 | exit(EXIT_FAILURE); \ 51 | }} 52 | 53 | static void *Malloc(size_t sz) { 54 | 55 | void *ptr; 56 | 57 | ptr = (void *)malloc(sz); 58 | if (!ptr) { 59 | fprintf(stderr, "Cannot allocate %zu bytes...\n", sz); 60 | exit(EXIT_FAILURE); 61 | } 62 | return ptr; 63 | } 64 | 65 | size_t vmmFabricGranularity(int device) { 66 | 67 | CUmemAllocationProp prop = {}; 68 | 69 | prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; 70 | prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; 71 | prop.location.id = device; 72 | 73 | // necessary to export the handle for remote memory access via NVLink 74 | prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; 75 | 76 | size_t granularity = 0; 77 | CHECK_CU(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); 78 | 79 | return granularity; 80 | } 81 | 82 | // call to "allocate" physical memory (cuMemCreate() handle) on GPU "device" 83 | // On entry size contains de desired size of the allocation; on exit the actual 84 | // size, which must be a multiple of the granularity 85 | static CUmemGenericAllocationHandle allocatePhysicalMemory(int device, size_t size) { 86 | 87 | CUmemAllocationProp prop = {}; 88 | 89 | 
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; 90 | prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; 91 | prop.location.id = device; 92 | 93 | // necessary to export the handle for remote memory access via NVLink 94 | prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; 95 | 96 | size_t granularity = 0; 97 | CHECK_CU(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); 98 | 99 | if (size % granularity) { 100 | 101 | cudaDeviceProp props; 102 | CHECK_CUDA(cudaGetDeviceProperties(&props, device)); 103 | 104 | int nameLen; 105 | char procName[MPI_MAX_PROCESSOR_NAME]; 106 | MPI_Get_processor_name(procName, &nameLen); 107 | 108 | fprintf(stderr, 109 | "%s:%d: error, requested allocation size (%zu bytes) is " 110 | "not a multiple of minimum supported granularity (%zu bytes) " 111 | "for device %d (%s) on node %s!\n", 112 | __func__, __LINE__, size, granularity, device, props.name, procName); 113 | MPI_Abort(MPI_COMM_WORLD, 0); 114 | } 115 | 116 | // Ensure size matches granularity requirements for the allocation 117 | //size_t padded_size = DIV_UP(size, granularity)*granularity; 118 | #if 0 119 | printf("%s:%d: device %d, padded_size: %zu\n", __func__, __LINE__, device, padded_size); 120 | #endif 121 | // Allocate physical memory 122 | CUmemGenericAllocationHandle allocHandle; 123 | 124 | //printf("device: %d, size: %zu\n", device, size); 125 | CHECK_CU(cuMemCreate(&allocHandle, size, &prop, 0)); 126 | 127 | return allocHandle; 128 | } 129 | 130 | static void setAccessOnDevice(int device, CUdeviceptr ptr, size_t size) { 131 | 132 | CUmemAccessDesc accessDesc = {}; 133 | 134 | accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; 135 | accessDesc.location.id = device; 136 | accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; 137 | 138 | //printf("device: %d\n", device); 139 | 140 | // Make the address accessible 141 | CHECK_CU(cuMemSetAccess(ptr, size, &accessDesc, 1)); 142 | 143 | return; 144 | } 145 | 146 | vmmAllocCtx_t 
*vmmFabricMalloc(void **devPtr, size_t sizePerGpu) { 147 | 148 | int inited = 0; 149 | MPI_Initialized(&inited); 150 | 151 | if (!inited) { 152 | fprintf(stderr, 153 | "%s:%d: error, MPI must be initialized before calling this function!\n", 154 | __func__, __LINE__); 155 | exit(EXIT_FAILURE); 156 | } 157 | 158 | int rank, ntask; 159 | 160 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 161 | MPI_Comm_size(MPI_COMM_WORLD, &ntask); 162 | 163 | char (*procNames)[MPI_MAX_PROCESSOR_NAME] = (char (*)[MPI_MAX_PROCESSOR_NAME])Malloc(sizeof(*procNames)*ntask); 164 | int nameLen; 165 | MPI_Get_processor_name(procNames[rank], &nameLen); 166 | MPI_Gather(procNames[rank], MPI_MAX_PROCESSOR_NAME, MPI_CHAR, procNames, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0, MPI_COMM_WORLD); 167 | 168 | int ndev = 0; 169 | CHECK_CUDA(cudaGetDeviceCount(&ndev)); 170 | 171 | int ndev_or; 172 | int ndev_and; 173 | MPI_Allreduce(&ndev, &ndev_or, 1, MPI_INT, MPI_BOR, MPI_COMM_WORLD); 174 | MPI_Allreduce(&ndev, &ndev_and, 1, MPI_INT, MPI_BAND, MPI_COMM_WORLD); 175 | if (ndev_or != ndev_and) { 176 | if (!rank) { 177 | fprintf(stderr, 178 | "%s:%d: error, not all processes have the same number of GPUs!\n", 179 | __func__, __LINE__); 180 | } 181 | MPI_Abort(MPI_COMM_WORLD, 0); 182 | } 183 | 184 | // local GPUs 185 | cudaDeviceProp *props = (cudaDeviceProp *)Malloc(sizeof(*props)*ndev); 186 | for(int i = 0; i < ndev; i++) { 187 | CHECK_CUDA(cudaGetDeviceProperties(props+i, i)); 188 | } 189 | 190 | // check local GPUs support 191 | for(int i = 0; i < ndev; i++) { 192 | 193 | int deviceSupportsVmm; 194 | CHECK_CU(cuDeviceGetAttribute(&deviceSupportsVmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, i)); 195 | if (!deviceSupportsVmm) { 196 | fprintf(stderr, 197 | "%s:%d: error, device %d (%s) on node %s does NOT support Virtual Memory Management!\n", 198 | __func__, __LINE__, i, props[i].name, procNames[rank]); 199 | MPI_Abort(MPI_COMM_WORLD, 0); 200 | } 201 | 202 | // FOR FABRIC 203 | int 
deviceSupportsFabricMem; 204 | CHECK_CU(cuDeviceGetAttribute(&deviceSupportsFabricMem, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, i)); 205 | if (deviceSupportsFabricMem == 0) { 206 | fprintf(stderr, 207 | "%s:%d: error, device %d (%s) on node %s does NOT support Fabric Handles!\n", 208 | __func__, __LINE__, i, props[i].name, procNames[rank]); 209 | MPI_Abort(MPI_COMM_WORLD, 0); 210 | } 211 | } 212 | 213 | // check that all GPUs are of the same kind (this may be relaxed) 214 | cudaDeviceProp *props_all = NULL; 215 | if (!rank) { 216 | props_all = (cudaDeviceProp *)Malloc(sizeof(*props)*ntask*ndev); 217 | } 218 | 219 | MPI_Datatype MPI_DEV_PROP; 220 | MPI_Type_contiguous(sizeof(cudaDeviceProp), MPI_BYTE, &MPI_DEV_PROP); 221 | MPI_Type_commit(&MPI_DEV_PROP); 222 | 223 | MPI_Gather(props, ndev, MPI_DEV_PROP, props_all, ndev, MPI_DEV_PROP, 0, MPI_COMM_WORLD); 224 | 225 | if (!rank) { 226 | for(int i = 1; i < ntask*ndev; i++) { 227 | if (strncmp(props_all[i-1].name, props_all[i].name, MAX_DEVICE_NAME)) { 228 | fprintf(stderr, 229 | "%s:%d: error, device %d from proc %d (%s) and " 230 | "device %d from proc %d (%s) are different:\n" 231 | "\t%s\n\t%s\n", 232 | __func__, __LINE__, 233 | (i-1)%ndev, (i-1)/ndev, procNames[(i-1)/ndev], 234 | i %ndev, i /ndev, procNames[ i /ndev], 235 | props_all[i-1].name, props_all[i].name); 236 | MPI_Abort(MPI_COMM_WORLD, 0); 237 | } 238 | } 239 | } 240 | free(props); 241 | free(props_all); 242 | 243 | // allocate local handles 244 | CUmemGenericAllocationHandle *handles = (CUmemGenericAllocationHandle *)Malloc(sizeof(*handles)*ntask*ndev); 245 | memset(handles, 0, sizeof(*handles)*ntask*ndev); 246 | 247 | for(int i = 0; i < ndev; i++) { 248 | handles[rank*ndev + i] = allocatePhysicalMemory(i, sizePerGpu); 249 | } 250 | 251 | // export local handles 252 | CUmemFabricHandle *fabricHandles = (CUmemFabricHandle *)Malloc(sizeof(*fabricHandles)*ntask*ndev); 253 | memset(fabricHandles, 0, sizeof(*fabricHandles)*ntask*ndev); 254 | for(int i 
= 0; i < ndev; i++) { 255 | //printf("CU_MEM_HANDLE_TYPE_FABRIC: %d, CU_MEM_HANDLE_TYPE_MAX: %d\n", CU_MEM_HANDLE_TYPE_FABRIC, CU_MEM_HANDLE_TYPE_MAX); 256 | CHECK_CU(cuMemExportToShareableHandle(&fabricHandles[ndev*rank + i], 257 | handles[ndev*rank + i], 258 | CU_MEM_HANDLE_TYPE_FABRIC, 0)); 259 | } 260 | 261 | // distribute local handles 262 | MPI_Datatype MPI_FABRIC_HANDLE; 263 | MPI_Type_contiguous(sizeof(CUmemFabricHandle), MPI_BYTE, &MPI_FABRIC_HANDLE); 264 | MPI_Type_commit(&MPI_FABRIC_HANDLE); 265 | 266 | MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, fabricHandles, ndev, MPI_FABRIC_HANDLE, MPI_COMM_WORLD); 267 | 268 | // import remote handles 269 | for(int i = 0; i < ntask; i++) { 270 | if (i == rank) { 271 | continue; 272 | } 273 | for(int d = 0; d < ndev; d++) { 274 | CHECK_CU(cuMemImportFromShareableHandle(&handles[i*ndev + d], 275 | &fabricHandles[i*ndev + d], 276 | CU_MEM_HANDLE_TYPE_FABRIC)); 277 | } 278 | } 279 | // this can now be removed? 280 | free(fabricHandles); 281 | 282 | // create a (large) Virtual Address range and map local and remote handles 283 | const size_t totalSize = sizePerGpu*size_t(ntask)*size_t(ndev); 284 | 285 | CUdeviceptr cuptr; 286 | CHECK_CU(cuMemAddressReserve(&cuptr, totalSize, 0, 0, 0)); 287 | 288 | for(size_t i = 0; i < ntask; i++) { 289 | for(size_t d = 0; d < ndev; d++) { 290 | CHECK_CU(cuMemMap(cuptr + i*sizePerGpu*ndev + d*sizePerGpu, 291 | sizePerGpu, 0, handles[i*ndev + d], 0)); 292 | } 293 | } 294 | 295 | for(int d = 0; d < ndev; d++) { 296 | setAccessOnDevice(d, cuptr, totalSize); //sizePerGpu*ntask*ndev); 297 | } 298 | 299 | 300 | free(procNames); 301 | 302 | vmmAllocCtx_t *ctx = (vmmAllocCtx_t *)Malloc(sizeof(*ctx)); 303 | 304 | ctx->cuptr = cuptr; 305 | ctx->virtAddrRangeSize = totalSize; 306 | 307 | ctx->handles = handles; 308 | 309 | *devPtr = (void *)cuptr; 310 | 311 | return ctx; 312 | } 313 | 314 | void vmmFabricFree(vmmAllocCtx_t *ctx) { 315 | 316 | int ndev = 0; 317 | 
CHECK_CUDA(cudaGetDeviceCount(&ndev)); 318 | 319 | int rank, ntask; 320 | 321 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 322 | MPI_Comm_size(MPI_COMM_WORLD, &ntask); 323 | 324 | CHECK_CU(cuMemUnmap(ctx->cuptr, ctx->virtAddrRangeSize)); 325 | 326 | for(int i = 0; i < ntask; i++) { 327 | for(int d = 0; d < ndev; d++) { 328 | CHECK_CU(cuMemRelease(ctx->handles[i*ndev + d])); 329 | } 330 | } 331 | CHECK_CU(cuMemAddressFree(ctx->cuptr, ctx->virtAddrRangeSize)); 332 | 333 | free(ctx->handles); 334 | free(ctx); 335 | 336 | return; 337 | } 338 | -------------------------------------------------------------------------------- /optimized/cuIsingModel/vmm_alloc.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | #ifndef __VMM_ALLOC_H__ 23 | #define __VMM_ALLOC_H__ 24 | 25 | typedef struct { 26 | 27 | CUdeviceptr cuptr; 28 | size_t virtAddrRangeSize; 29 | 30 | CUmemGenericAllocationHandle *handles; 31 | 32 | } vmmAllocCtx_t; 33 | 34 | #ifdef __cplusplus 35 | extern "C" { 36 | #endif 37 | 38 | // helper to obtain the minimum size for fabric allocations 39 | size_t vmmFabricGranularity(int device); 40 | 41 | // Allocates sizePerGPU bytes on each device 42 | // visible to each MPI rank and return to 43 | // each caller the starting address of a 44 | // Virtual Address range to which all the 45 | // allocations are mapped. Mappings are 46 | // performed in Rank,DeviceId order: 47 | // 48 | // , , 49 | // , , 50 | // ... 51 | // , , 52 | // 53 | // Remote memories are accessed via FABRIC handles. 54 | // 55 | // Requirements: 56 | // * all ranks must have access to the same number of GPUs; 57 | // * all the GPUs must be the same type; 58 | vmmAllocCtx_t *vmmFabricMalloc(void **devPtr, size_t sizePerGpu); 59 | 60 | void vmmFabricFree(vmmAllocCtx_t *ctx); 61 | 62 | #ifdef __cplusplus 63 | } 64 | #endif 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /optimized/old/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_HOME=/usr/local/cuda 2 | CUDACC=$(CUDA_HOME)/bin/nvcc 3 | CC=gcc 4 | LD=$(CUDACC) 5 | CFLAGS=-c -O3 -g -I$(CUDA_HOME)/include 6 | CUDACFLAGS=-c -O3 --use_fast_math -lineinfo -arch=sm_70 -Xptxas=-v 7 | LDFLAGS= -Xcompiler=-fopenmp 8 | 9 | all: cuIsing 10 | 11 | cuIsing: main.o utils.o 12 | $(LD) -o cuIsing main.o utils.o $(LDFLAGS) 13 | 14 | %.o: %.cu 15 | $(CUDACC) $(CUDACFLAGS) $< 16 | 17 | %.o: %.c 18 | $(CC) $(CFLAGS) $< -o $@ 19 | 20 | clean: 21 | rm *.o cuIsing 22 | -------------------------------------------------------------------------------- /optimized/old/README.md: -------------------------------------------------------------------------------- 
1 | # Optimized CUDA implementation 2 | 3 | To compile the code simply adjust the Makefile to point to your CUDA 4 | installation and specify the CUDA architecture you want to compile for. A 5 | simple `make` should be enough to produce the ``cuIsing`` binary. 6 | 7 | ## Usage 8 | 9 |
 10 | Usage: cuIsing [options]
 11 | options:
 12 |         -x|--x <HORIZ_DIM>
 13 |                 Specifies the horizontal dimension of the entire  lattice  (black+white  spins),
 14 |                 per GPU. This dimension must be a multiple of 2048.
 15 | 
 16 |         -y|--y <VERT_DIM>
 17 |                 Specifies the vertical dimension of the entire lattice (black+white spins),  per
 18 |                 GPU. This dimension must be a multiple of 16.
 19 | 
 20 |         -n|--n <NSTEPS>
 21 |                 Specifies the number of iterations to run.
 22 |                 Default: 1
 23 | 
 24 |         -d|--devs <NUM_DEVICES>
 25 |                 Specifies the number of GPUs to use. Will use devices with ids [0, NUM_DEVICES-1].
 26 |                 Default: 1.
 27 | 
 28 |         -s|--seed <SEED>
 29 |                 Specifies the seed used to generate random numbers.
 30 |                 Default: 463463564571
 31 | 
 32 |         -a|--alpha <ALPHA>
 33 |                 Specifies the temperature in T_CRIT units.  If both this  option  and  '-t'  are
 34 |                 specified then the '-t' option is used.
 35 |                 Default: 0.100000
 36 | 
 37 |         -t|--temp <TEMP>
 38 |                 Specifies the temperature in absolute units.  If both this option and  '-a'  are
 39 |                 specified then this option is used.
 40 |                 Default: 0.226919
 41 | 
 42 |         -p|--print <STAT_FREQ>
 43 |                 Specifies the frequency, in number of iterations, with which the  magnetization
 44 |                 statistics are printed. If this option is used together with the '-e' option, it
 45 |                 is ignored.
 46 |                 Default: only at the beginning and at end of the simulation
 47 | 
 48 |         -e|--exppr
 49 |                 Prints the magnetization at time steps in the series 0 <= 2^(x/4) < NSTEPS.   If
 50 |                 this option is used  together with  the  '-p'  option,  the  latter  is  ignored.
 51 |                 Default: disabled
 52 | 
 53 |         -c|--corr
 54 |                 Dumps to a  file  named  corr_{X}x{Y}_T_{TEMP}  the  correlation  of each  point
 55 |                 with the  128 points on the right and below.  The correlation is computed  every
 56 |                 time the magnetization is printed on screen (based on either the  '-p'  or  '-e'
 57 |                 option) and it is written in the file one line per measure.
 58 |                 Default: disabled
 59 | 
 60 |         -m|--magn <TGT_MAGN>
 61 |                 Specifies the magnetization value at which the simulation is  interrupted.   The
 62 |                 magnetization of the system is checked against TGT_MAGN every STAT_FREQ, if  the
 63 |                 '-p' option is specified, or according to the exponential  timestep  series,  if
 64 |                 the '-e' option is specified.  If neither '-p' nor '-e' is specified,  then this
 65 |                 option is ignored.
 66 |                 Default: unset
 67 | 
 68 |         -J|--J <PROB>
 69 |                 Specifies the probability [0.0-1.0] that links  connecting  any  two  spins  are
 70 |                 anti-ferromagnetic. 
 71 |                 Default: 0.0
 72 | 
 73 |            --xsl <HORIZ_SUB_DIM>
 74 |                 Specifies the horizontal dimension of each sub-lattice (black+white spins),  per
 75 |                 GPU.  This dimension must be a divisor of the horizontal dimension of the entire
 76 |                 lattice per  GPU  (specified  with  the  '-x'  option) and a multiple of 2048.
 77 |                 Default: sub-lattices are disabled.
 78 | 
 79 |            --ysl <VERT_SUB_DIM>
 80 |                 Specifies the vertical  dimension of each  sub-lattice (black+white spins),  per
 81 |                 GPU.  This dimension must be a divisor of the vertical dimension of  the  entire
 82 |                 lattice per  GPU  (specified  with  the  '-y'  option) and a multiple of 16.
 83 | 
 84 |         -o|--o
 85 |                 Enables the file dump of  the lattice  every time  the magnetization is printed.
 86 |                 Default: off
 87 | 
88 | 89 | For example, to run 128 update steps on a 65536^2 lattice using two V100 GPUs 90 | connected via NVLink and printing the magnetization every 16 steps: 91 | 92 |
 93 | # 2xV100
 94 | $ ./cuIsing -y 32768 -x 65536 -n 128 -p 16 -d 2 -t 1.5  
 95 | 
 96 | Using GPUs:
 97 |          0 (Tesla V100-DGXS-16GB, 80 SMs, 2048 th/SM max, CC 7.0, ECC on)
 98 |          1 (Tesla V100-DGXS-16GB, 80 SMs, 2048 th/SM max, CC 7.0, ECC on)
 99 | 
100 | GPUs direct access matrix:
101 |           0   1
102 | GPU  0:   V   V
103 | GPU  1:   V   V
104 | 
105 | Run configuration:
106 |         spin/word: 16
107 |         spins: 4294967296
108 |         seed: 463463564571
109 |         iterations: 128
110 |         block (X, Y): 16, 16
111 |         tile  (X, Y): 32, 16
112 |         grid  (X, Y): 32, 2048
113 |         print magn. every 16 steps
114 |         temp: 1.500000 (0.661030*T_crit)
115 |         temp update not set
116 |         not using Hamiltonian buffer
117 | 
118 |         local lattice size:         32768 x    65536
119 |         total lattice size:         65536 x    65536
120 |         local lattice shape: 2 x    32768 x     2048 (   134217728 ulls)
121 |         total lattice shape: 2 x    65536 x     2048 (   268435456 ulls)
122 |         memory: 2048.00 MB (1024.00 MB per GPU)
123 | 
124 | Setting up multi-gpu configuration:
125 |         GPU  0 done
126 |         GPU  1 done
127 | 
128 | Initial magnetization:  0.000000, up_s:   2147484090, dw_s:   2147483206
129 |         magnetization:  0.000043, up_s:   2147575418, dw_s:   2147391878 (iter:       16)
130 |         magnetization:  0.000074, up_s:   2147641872, dw_s:   2147325424 (iter:       32)
131 |         magnetization:  0.000057, up_s:   2147605659, dw_s:   2147361637 (iter:       48)
132 |         magnetization:  0.000101, up_s:   2147701147, dw_s:   2147266149 (iter:       64)
133 |         magnetization:  0.000035, up_s:   2147558546, dw_s:   2147408750 (iter:       80)
134 |         magnetization:  0.000006, up_s:   2147471275, dw_s:   2147496021 (iter:       96)
135 |         magnetization:  0.000060, up_s:   2147612509, dw_s:   2147354787 (iter:      112)
136 |         magnetization:  0.000091, up_s:   2147678887, dw_s:   2147288409 (iter:      128)
137 | Final   magnetization:  0.000091, up_s:   2147678887, dw_s:   2147288409 (iter:      128)
138 | 
139 | Kernel execution time for 128 update steps: 7.174555E+02 ms, 766.26 flips/ns (BW: 1150.32 GB/s)
140 | 
141 | 
142 | 143 | Or, to run concurrently 1024 independent sub-lattices of size 2048^2 using two 144 | V100 GPUs connected via NVLink and printing the magnetization every 16 steps: 145 | 146 |
147 | # 2xV100
148 | $ ./cuIsing -y 32768 -x 65536 -n 128 -p 16 -d 2 -t 1.5 --xsl 2048 --ysl 2048
149 | 
150 | Using GPUs:
151 |          0 (Tesla V100-DGXS-16GB, 80 SMs, 2048 th/SM max, CC 7.0, ECC on)
152 |          1 (Tesla V100-DGXS-16GB, 80 SMs, 2048 th/SM max, CC 7.0, ECC on)
153 | 
154 | GPUs direct access matrix:
155 |           0   1
156 | GPU  0:   V   V
157 | GPU  1:   V   V
158 | 
159 | Run configuration:
160 |         spin/word: 16
161 |         spins: 4294967296
162 |         seed: 463463564571
163 |         iterations: 128
164 |         block (X, Y): 16, 16
165 |         tile  (X, Y): 32, 16
166 |         grid  (X, Y): 32, 2048
167 |         print magn. every 16 steps
168 |         temp: 1.500000 (0.661030*T_crit)
169 |         temp update not set
170 |         not using Hamiltonian buffer
171 | 
172 |         using sub-lattices:
173 |                 no. of sub-lattices per GPU:      512
174 |                 no. of sub-lattices (total):     1024
175 |                 sub-lattices size:              2048 x    2048
176 | 
177 |         local lattice size:         32768 x    65536
178 |         total lattice size:         65536 x    65536
179 |         local lattice shape: 2 x    32768 x     2048 (   134217728 ulls)
180 |         total lattice shape: 2 x    65536 x     2048 (   268435456 ulls)
181 |         memory: 2048.00 MB (1024.00 MB per GPU)
182 | 
183 | Setting up multi-gpu configuration:
184 |         GPU  0 done
185 |         GPU  1 done
186 | 
187 | Initial magnetization:  0.000000, up_s:   2147484090, dw_s:   2147483206
188 |         magnetization:  0.000052, up_s:   2147594634, dw_s:   2147372662 (iter:       16)
189 |         magnetization:  0.000069, up_s:   2147631783, dw_s:   2147335513 (iter:       32)
190 |         magnetization:  0.000031, up_s:   2147550893, dw_s:   2147416403 (iter:       48)
191 |         magnetization:  0.000068, up_s:   2147630364, dw_s:   2147336932 (iter:       64)
192 |         magnetization:  0.000008, up_s:   2147500244, dw_s:   2147467052 (iter:       80)
193 |         magnetization:  0.000059, up_s:   2147357073, dw_s:   2147610223 (iter:       96)
194 |         magnetization:  0.000000, up_s:   2147482936, dw_s:   2147484360 (iter:      112)
195 |         magnetization:  0.000010, up_s:   2147461873, dw_s:   2147505423 (iter:      128)
196 | Final   magnetization:  0.000010, up_s:   2147461873, dw_s:   2147505423 (iter:      128)
197 | 
198 | Kernel execution time for 128 update steps: 7.147521E+02 ms, 769.16 flips/ns (BW: 1154.67 GB/s)
199 | 
200 | 201 | To run 128 update steps using 2 and 8 A100 GPUs connected via NVLink, with a 202 | 65536x65536 lattice per GPU (131072x65536 and 524288x65536 in total, respectively), and printing the magnetization every 16 steps: 203 | 204 |
205 | # 2xA100
206 | $ ./cuIsing -y $((32*2048)) -x $((32*2048)) -n 128 -p 16 -d 2 -t 1.5
207 | 
208 | Using GPUs:
209 |          0 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
210 |          1 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
211 | 
212 | GPUs direct access matrix:
213 |           0   1
214 | GPU  0:   V   V
215 | GPU  1:   V   V
216 | 
217 | Run configuration:
218 |         spin/word: 16
219 |         spins: 8589934592
220 |         seed: 463463564571
221 |         iterations: 128
222 |         block (X, Y): 16, 16
223 |         tile  (X, Y): 32, 16
224 |         grid  (X, Y): 32, 4096
225 |         print magn. every 16 steps
226 |         temp: 1.500000 (0.661030*T_crit)
227 |         temp update not set
228 |         not using Hamiltonian buffer
229 | 
230 |         local lattice size:         65536 x    65536
231 |         total lattice size:        131072 x    65536
232 |         local lattice shape: 2 x    65536 x     2048 (   268435456 ulls)
233 |         total lattice shape: 2 x   131072 x     2048 (   536870912 ulls)
234 |         memory: 4096.00 MB (2048.00 MB per GPU)
235 | 
236 | Setting up multi-gpu configuration:
237 |         GPU  0 done
238 |         GPU  1 done
239 | 
240 | Initial magnetization:  0.000005, up_s:   4294989182, dw_s:   4294945410
241 |         magnetization:  0.000082, up_s:   4294617248, dw_s:   4295317344 (iter:       16)
242 |         magnetization:  0.000249, up_s:   4293898346, dw_s:   4296036246 (iter:       32)
243 |         magnetization:  0.000503, up_s:   4292806461, dw_s:   4297128131 (iter:       48)
244 |         magnetization:  0.000725, up_s:   4291852263, dw_s:   4298082329 (iter:       64)
245 |         magnetization:  0.000904, up_s:   4291086016, dw_s:   4298848576 (iter:       80)
246 |         magnetization:  0.001097, up_s:   4290256223, dw_s:   4299678369 (iter:       96)
247 |         magnetization:  0.001245, up_s:   4289621029, dw_s:   4300313563 (iter:      112)
248 |         magnetization:  0.001418, up_s:   4288877118, dw_s:   4301057474 (iter:      128)
249 | Final   magnetization:  0.001418, up_s:   4288877118, dw_s:   4301057474 (iter:      128)
250 | 
251 | Kernel execution time for 128 update steps: 1.055835E+03 ms, 1041.37 flips/ns (BW: 1563.32 GB/s)
252 | 
253 | 
254 | # 8xA100
255 | $ ./cuIsing -y $((32*2048)) -x $((32*2048)) -n 128 -p 16 -d 8 -t 1.5
256 | 
257 | Using GPUs:
258 |          0 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
259 |          1 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
260 |          2 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
261 |          3 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
262 |          4 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
263 |          5 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
264 |          6 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
265 |          7 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
266 | 
267 | GPUs direct access matrix:
268 |           0   1   2   3   4   5   6   7
269 | GPU  0:   V   V   V   V   V   V   V   V
270 | GPU  1:   V   V   V   V   V   V   V   V
271 | GPU  2:   V   V   V   V   V   V   V   V
272 | GPU  3:   V   V   V   V   V   V   V   V
273 | GPU  4:   V   V   V   V   V   V   V   V
274 | GPU  5:   V   V   V   V   V   V   V   V
275 | GPU  6:   V   V   V   V   V   V   V   V
276 | GPU  7:   V   V   V   V   V   V   V   V
277 | 
278 | Run configuration:
279 |         spin/word: 16
280 |         spins: 34359738368
281 |         seed: 463463564571
282 |         iterations: 128
283 |         block (X, Y): 16, 16
284 |         tile  (X, Y): 32, 16
285 |         grid  (X, Y): 32, 4096
286 |         print magn. every 16 steps
287 |         temp: 1.500000 (0.661030*T_crit)
288 |         temp update not set
289 |         not using Hamiltonian buffer
290 | 
291 |         local lattice size:         65536 x    65536
292 |         total lattice size:        524288 x    65536
293 |         local lattice shape: 2 x    65536 x     2048 (   268435456 ulls)
294 |         total lattice shape: 2 x   524288 x     2048 (  2147483648 ulls)
295 |         memory: 16384.00 MB (2048.00 MB per GPU)
296 | 
297 | Setting up multi-gpu configuration:
298 |         GPU  0 done
299 |         GPU  1 done
300 |         GPU  2 done
301 |         GPU  3 done
302 |         GPU  4 done
303 |         GPU  5 done
304 |         GPU  6 done
305 |         GPU  7 done
306 | 
307 | Initial magnetization:  0.000010, up_s:  17179689306, dw_s:  17180049062
308 |         magnetization:  0.000203, up_s:  17176389528, dw_s:  17183348840 (iter:       16)
309 |         magnetization:  0.000402, up_s:  17172963073, dw_s:  17186775295 (iter:       32)
310 |         magnetization:  0.000539, up_s:  17170610910, dw_s:  17189127458 (iter:       48)
311 |         magnetization:  0.000642, up_s:  17168843228, dw_s:  17190895140 (iter:       64)
312 |         magnetization:  0.000749, up_s:  17167009008, dw_s:  17192729360 (iter:       80)
313 |         magnetization:  0.000865, up_s:  17165014291, dw_s:  17194724077 (iter:       96)
314 |         magnetization:  0.000941, up_s:  17163708078, dw_s:  17196030290 (iter:      112)
315 |         magnetization:  0.001023, up_s:  17162287230, dw_s:  17197451138 (iter:      128)
316 | Final   magnetization:  0.001023, up_s:  17162287230, dw_s:  17197451138 (iter:      128)
317 | 
318 | Kernel execution time for 128 update steps: 1.063368E+03 ms, 4135.96 flips/ns (BW: 6205.20 GB/s)
319 | 
320 | 321 | To run 128 update steps on a 65536x65536 lattice per GPU (131072x65536 in total on 2 GPUs, 524288x65536 on 8) using 2 and 8 H100 GPUs 322 | connected via NVLink, printing the magnetization every 16 steps: 323 | 324 |
325 | 
326 | # 2xH100
327 | $ ./cuIsing -y $((32*2048)) -x $((32*2048)) -n 128 -p 16 -d 2 -t 1.5
328 | 
329 | Using GPUs:
330 |          0 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
331 |          1 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
332 | 
333 | GPUs direct access matrix:
334 |           0   1
335 | GPU  0:   V   V
336 | GPU  1:   V   V
337 | 
338 | Run configuration:
339 |         spin/word: 16
340 |         spins: 8589934592
341 |         seed: 463463564571
342 |         iterations: 128
343 |         block (X, Y): 16, 16
344 |         tile  (X, Y): 32, 16
345 |         grid  (X, Y): 32, 4096
346 |         print magn. every 16 steps
347 |         temp: 1.500000 (0.661030*T_crit)
348 |         temp update not set
349 |         not using Hamiltonian buffer
350 | 
351 |         local lattice size:         65536 x    65536
352 |         total lattice size:        131072 x    65536
353 |         local lattice shape: 2 x    65536 x     2048 (   268435456 ulls)
354 |         total lattice shape: 2 x   131072 x     2048 (   536870912 ulls)
355 |         memory: 4096.00 MB (2048.00 MB per GPU)
356 | 
357 | Setting up multi-gpu configuration:
358 |         GPU  0 done
359 |         GPU  1 done
360 | 
361 | Initial magnetization:  0.000005, up_s:   4294989182, dw_s:   4294945410
362 |         magnetization:  0.000082, up_s:   4294617248, dw_s:   4295317344 (iter:       16)
363 |         magnetization:  0.000249, up_s:   4293898346, dw_s:   4296036246 (iter:       32)
364 |         magnetization:  0.000503, up_s:   4292806461, dw_s:   4297128131 (iter:       48)
365 |         magnetization:  0.000725, up_s:   4291852263, dw_s:   4298082329 (iter:       64)
366 |         magnetization:  0.000904, up_s:   4291086016, dw_s:   4298848576 (iter:       80)
367 |         magnetization:  0.001097, up_s:   4290256223, dw_s:   4299678369 (iter:       96)
368 |         magnetization:  0.001245, up_s:   4289621029, dw_s:   4300313563 (iter:      112)
369 |         magnetization:  0.001418, up_s:   4288877118, dw_s:   4301057474 (iter:      128)
370 | Final   magnetization:  0.001418, up_s:   4288877118, dw_s:   4301057474 (iter:      128)
371 | 
372 | Kernel execution time for 128 update steps: 6.105666E+02 ms, 1800.81 flips/ns (BW: 2703.41 GB/s)
373 | 
374 | # 8xH100
375 | $ ./cuIsing -y $((32*2048)) -x $((32*2048)) -n 128 -p 16 -d 8 -t 1.5
376 | 
377 | Using GPUs:
378 |          0 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
379 |          1 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
380 |          2 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
381 |          3 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
382 |          4 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
383 |          5 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
384 |          6 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
385 |          7 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
386 | 
387 | GPUs direct access matrix:
388 |           0   1   2   3   4   5   6   7
389 | GPU  0:   V   V   V   V   V   V   V   V
390 | GPU  1:   V   V   V   V   V   V   V   V
391 | GPU  2:   V   V   V   V   V   V   V   V
392 | GPU  3:   V   V   V   V   V   V   V   V
393 | GPU  4:   V   V   V   V   V   V   V   V
394 | GPU  5:   V   V   V   V   V   V   V   V
395 | GPU  6:   V   V   V   V   V   V   V   V
396 | GPU  7:   V   V   V   V   V   V   V   V
397 | 
398 | Run configuration:
399 |         spin/word: 16
400 |         spins: 34359738368
401 |         seed: 463463564571
402 |         iterations: 128
403 |         block (X, Y): 16, 16
404 |         tile  (X, Y): 32, 16
405 |         grid  (X, Y): 32, 4096
406 |         print magn. every 16 steps
407 |         temp: 1.500000 (0.661030*T_crit)
408 |         temp update not set
409 |         not using Hamiltonian buffer
410 | 
411 |         local lattice size:         65536 x    65536
412 |         total lattice size:        524288 x    65536
413 |         local lattice shape: 2 x    65536 x     2048 (   268435456 ulls)
414 |         total lattice shape: 2 x   524288 x     2048 (  2147483648 ulls)
415 |         memory: 16384.00 MB (2048.00 MB per GPU)
416 | 
417 | Setting up multi-gpu configuration:
418 |         GPU  0 done
419 |         GPU  1 done
420 |         GPU  2 done
421 |         GPU  3 done
422 |         GPU  4 done
423 |         GPU  5 done
424 |         GPU  6 done
425 |         GPU  7 done
426 | 
427 | Initial magnetization:  0.000010, up_s:  17179689306, dw_s:  17180049062
428 |         magnetization:  0.000203, up_s:  17176389528, dw_s:  17183348840 (iter:       16)
429 |         magnetization:  0.000402, up_s:  17172963073, dw_s:  17186775295 (iter:       32)
430 |         magnetization:  0.000539, up_s:  17170610910, dw_s:  17189127458 (iter:       48)
431 |         magnetization:  0.000642, up_s:  17168843228, dw_s:  17190895140 (iter:       64)
432 |         magnetization:  0.000749, up_s:  17167009008, dw_s:  17192729360 (iter:       80)
433 |         magnetization:  0.000865, up_s:  17165014291, dw_s:  17194724077 (iter:       96)
434 |         magnetization:  0.000941, up_s:  17163708078, dw_s:  17196030290 (iter:      112)
435 |         magnetization:  0.001023, up_s:  17162287230, dw_s:  17197451138 (iter:      128)
436 | Final   magnetization:  0.001023, up_s:  17162287230, dw_s:  17197451138 (iter:      128)
437 | 
438 | Kernel execution time for 128 update steps: 6.158027E+02 ms, 7141.97 flips/ns (BW: 10715.14 GB/s)
439 | 
440 | 441 | ## Visualizing results 442 | 443 | Running the code with the '-o' option dumps the lattice to file at every timestep at which the 444 | magnetization is printed on screen (as determined by the '-p' and '-e' options). The file name 445 | has the following format: 446 | 447 |
448 | lattice_<LOCAL_Y>x<LOCAL_X>_T_<TEMP>_IT_<IT_NUMBER>_<GPU_ID>.txt
449 | 
450 | 451 | The included `plotLattice.py` script creates an image from one of those output files. For example, 452 | the following command: 453 | 454 |
455 | $ ./plotLattice.py lattice_8192x8192_T_1.500000_IT_00001024_0.txt
456 | 
457 | 458 | will generate an image file named `lattice_8192x8192_T_1.500000_IT_00001024_0.txt.png` like: 459 | 460 | ![image_1](images/lattice_8192x8192_T_1.500000_IT_00001024_0.txt.png) 461 | 462 | ## Contacts 463 | 464 | For comments, questions or anything related, write to Mauro Bisson at maurob@nvidia.com. 465 | 466 | -------------------------------------------------------------------------------- /optimized/old/cudamacro.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | #ifndef __CUDA_MACRO_H__ 23 | #define __CUDA_MACRO_H__ 24 | 25 | #define CHECK_CUDA(call) { \ 26 | cudaError_t err = call; \ 27 | if( cudaSuccess != err) { \ 28 | fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ 29 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 30 | exit(EXIT_FAILURE); \ 31 | }} 32 | 33 | #define CHECK_ERROR(errorMessage) { \ 34 | cudaError_t err = cudaGetLastError(); \ 35 | if( cudaSuccess != err) { \ 36 | fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ 37 | errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ 38 | exit(EXIT_FAILURE); \ 39 | }} 40 | #endif 41 | -------------------------------------------------------------------------------- /optimized/old/images/lattice_8192x8192_T_1.500000_IT_00001024_0.txt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ising-gpu/935796d7a26016670363af7a0dced8a9ebcd4714/optimized/old/images/lattice_8192x8192_T_1.500000_IT_00001024_0.txt.png -------------------------------------------------------------------------------- /optimized/old/plotLattice.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import numpy as np 5 | from matplotlib import pyplot as plt 6 | 7 | data = [] 8 | f=open(sys.argv[1]) 9 | for l in f: 10 | data.append([int(c) for c in l.strip(" \n\r")]) 11 | 12 | print len(data), 'x', len(data[0]) 13 | 14 | plt.imshow(data, interpolation='nearest') 15 | 16 | outFile = sys.argv[1]+".png" 17 | plt.savefig(outFile) 18 | -------------------------------------------------------------------------------- /optimized/old/utils.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Mauro Bisson 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a 7 | * copy of this software and associated documentation files (the "Software"), 8 | * to deal in the Software without restriction, including without limitation 9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | * and/or sell copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | * DEALINGS IN THE SOFTWARE. 
23 | */ 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | void *Malloc(size_t sz) { 34 | 35 | void *ptr; 36 | 37 | if (!sz) { 38 | printf("Allocating zero bytes...\n"); 39 | exit(EXIT_FAILURE); 40 | } 41 | ptr = (void *)malloc(sz); 42 | if (!ptr) { 43 | fprintf(stderr, "Cannot allocate %zu bytes...\n", sz); 44 | exit(EXIT_FAILURE); 45 | } 46 | memset(ptr, 0, sz); 47 | return ptr; 48 | } 49 | 50 | void Free(void **ptr) { 51 | 52 | if (*ptr) { 53 | free(*ptr); 54 | *ptr = NULL; 55 | } 56 | return; 57 | } 58 | 59 | void *Realloc(void *ptr, size_t sz) { 60 | 61 | void *lp; 62 | 63 | if (!sz) { 64 | printf("Re-allocating to zero bytes, are you sure you want this?\n"); 65 | } 66 | lp = (void *)realloc(ptr, sz); 67 | if (!lp && sz) { 68 | fprintf(stderr, "Cannot reallocate to %zu bytes...\n", sz); 69 | exit(EXIT_FAILURE); 70 | } 71 | return lp; 72 | } 73 | 74 | FILE *Fopen(const char *path, const char *mode) { 75 | 76 | FILE *fp = NULL; 77 | fp = fopen(path, mode); 78 | if (!fp) { 79 | fprintf(stderr, "Cannot open file %s...\n", path); 80 | exit(EXIT_FAILURE); 81 | } 82 | return fp; 83 | } 84 | 85 | size_t Fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) { 86 | 87 | size_t wmemb=0; 88 | 89 | wmemb = fwrite(ptr, size, nmemb, stream); 90 | if (wmemb < nmemb) { 91 | fprintf(stderr, "Error while writing to file!\n"); 92 | exit(EXIT_FAILURE); 93 | } 94 | return wmemb; 95 | } 96 | 97 | size_t Fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { 98 | 99 | size_t rmemb=0; 100 | 101 | rmemb = fread(ptr, size, nmemb, stream); 102 | if (rmemb < nmemb && ferror(stream)) { 103 | fprintf(stderr, "Error while reading from file, could not read more than %zu elements!\n", rmemb); 104 | exit(EXIT_FAILURE); 105 | } 106 | return rmemb; 107 | } 108 | 109 | int Remove(const char *pathname) { 110 | 111 | int rv = remove(pathname); 112 | if (rv && errno != ENOENT) { 113 | fprintf(stderr, "Error 
removing file %s: %s\n", pathname, strerror(errno)); 114 | exit(EXIT_FAILURE); 115 | } 116 | return rv; 117 | } 118 | 119 | off_t getFsize(const char *fpath) { 120 | 121 | struct stat st; 122 | int rv; 123 | 124 | rv = stat(fpath, &st); 125 | if (rv) { 126 | fprintf(stderr, "Cannot stat file %s...\n", fpath); 127 | exit(EXIT_FAILURE); 128 | } 129 | return st.st_size; 130 | } 131 | 132 | double Wtime(void) { 133 | struct timespec tp; 134 | 135 | int rv = clock_gettime(CLOCK_MONOTONIC, &tp); 136 | if(rv) return 0; 137 | 138 | return tp.tv_nsec/1.0E+9 + (double)tp.tv_sec; 139 | } 140 | -------------------------------------------------------------------------------- /optimized/old/utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Mauro Bisson 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a 7 | * copy of this software and associated documentation files (the "Software"), 8 | * to deal in the Software without restriction, including without limitation 9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | * and/or sell copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | * DEALINGS IN THE SOFTWARE. 23 | */ 24 | #ifndef __UTILS_H__ 25 | #define __UTILS_H__ 26 | 27 | #ifdef __cplusplus 28 | #define UTILS_LINKAGE "C" 29 | #else 30 | #define UTILS_LINKAGE 31 | #endif 32 | 33 | extern UTILS_LINKAGE void *Malloc(size_t sz); 34 | extern UTILS_LINKAGE void Free(void **ptr); 35 | extern UTILS_LINKAGE void *Realloc(void *ptr, size_t sz); 36 | extern UTILS_LINKAGE FILE *Fopen(const char *path, const char *mode); 37 | extern UTILS_LINKAGE size_t Fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); 38 | extern UTILS_LINKAGE size_t Fread(void *ptr, size_t size, size_t nmemb, FILE *stream); 39 | extern UTILS_LINKAGE int Remove(const char *pathname); 40 | extern UTILS_LINKAGE off_t getFsize(const char *fpath); 41 | extern UTILS_LINKAGE double Wtime(void); 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /tensorcore/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_HOME=/usr/local/cuda 2 | CUDACC=$(CUDA_HOME)/bin/nvcc 3 | CC=gcc 4 | LD=$(CUDACC) 5 | CFLAGS=-c -O3 -g -I$(CUDA_HOME)/include 6 | CUDACFLAGS= -std=c++11 -c -O3 -lineinfo -arch=sm_70 -Xptxas=-v -I../external/cub 7 | LDFLAGS= -lcurand -lcublas 8 | 9 | all: ising_tensorcore 10 | 11 | ising_tensorcore: main.o 12 | $(LD) -o ising_tensorcore main.o $(LDFLAGS) 13 | 14 | %.o: %.cu 15 | nvcc -c $(CUDACFLAGS) $< 16 | 17 | clean: 18 | rm *.o ising_tensorcore 19 | -------------------------------------------------------------------------------- /tensorcore/README.md: -------------------------------------------------------------------------------- 1 | ### Tensor Core implementation using CUDA C 2 | 3 | ### Basic Usage 4 | Compile 
binary with `make`. 5 | 6 | Example run command: 7 | 8 | `./ising_tensorcore -g <num GPUs> -x <x blocks> -y <y blocks> -n <iterations>` 9 | 10 | Run `./ising_tensorcore --help` for more options. 11 | 12 | ### Visualizing Results 13 | The `-o` flag enables output of the final lattice configuration to the text file `final.txt`. Use the provided `plot_ising.py` to visualize the output. 14 | 15 | For example: 16 | ``` 17 | $ ./ising_tensorcore -g 2 -x 8 -y 8 -n 100 -a 0.5 -o 18 | ... 19 | Writing lattice to final.txt... 20 | 21 | $ python plot_ising.py 22 | ``` 23 | 24 | This will produce the following output: 25 | 26 | ![sample_plot.png](sample_plot.png) 27 | -------------------------------------------------------------------------------- /tensorcore/cudamacro.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 21 | */ 22 | #ifndef __CUDA_MACRO_H__ 23 | #define __CUDA_MACRO_H__ 24 | 25 | #define CHECK_CUDA(call) { \ 26 | cudaError_t err = call; \ 27 | if( cudaSuccess != err) { \ 28 | fprintf(stderr, "CUDA error in file '%s' in line %i : %s.\n", \ 29 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 30 | exit(EXIT_FAILURE); \ 31 | }} 32 | 33 | #define CHECK_CUBLAS(call) { \ 34 | cublasStatus_t status = call; \ 35 | if( CUBLAS_STATUS_SUCCESS != status) { \ 36 | fprintf(stderr, "CUBLAS error: %s = %d at (%s:%d)\n", #call, \ 37 | status, __FILE__, __LINE__); \ 38 | exit(EXIT_FAILURE); \ 39 | }} 40 | 41 | #define CHECK_CURAND(call) { \ 42 | curandStatus_t status = call; \ 43 | if( CURAND_STATUS_SUCCESS != status) { \ 44 | fprintf(stderr, "CURAND error: %s = %d at (%s:%d)\n", #call, \ 45 | status, __FILE__, __LINE__); \ 46 | exit(EXIT_FAILURE); \ 47 | }} 48 | 49 | #define CHECK_ERROR(errorMessage) { \ 50 | cudaError_t err = cudaGetLastError(); \ 51 | if( cudaSuccess != err) { \ 52 | fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ 53 | errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ 54 | exit(EXIT_FAILURE); \ 55 | }} 56 | #endif 57 | -------------------------------------------------------------------------------- /tensorcore/main.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include 30 | #include 31 | #include 32 | 33 | #include 34 | 35 | #include "cudamacro.h" 36 | 37 | #define LATTICE_SUP_N (256) 38 | #define LATTICE_SUB_N (LATTICE_SUP_N / 2) 39 | #define TCRIT 2.26918531421f 40 | #define THREADS (LATTICE_SUB_N) 41 | 42 | #define SUP_OFFSET(i,j,nbx) (((j)*(long long)(nbx) + (i))*LATTICE_SUP_N*LATTICE_SUP_N) 43 | #define SUB_OFFSET(i,j) (((j)*LATTICE_SUP_N + (i)*LATTICE_SUB_N)*LATTICE_SUB_N) 44 | #define SUB_ELEM(i,j) ((j)*LATTICE_SUB_N + (i)) 45 | 46 | #define CUB_CHUNK_SIZE ((1ll<<31) - (1ll<<28)) 47 | 48 | __global__ void set_k(__half* k, __half* kT) { 49 | const int tid = blockDim.x * blockIdx.x + threadIdx.x; 50 | const int i = tid % LATTICE_SUB_N; 51 | const int j = tid / LATTICE_SUB_N; 52 | if (j >= LATTICE_SUB_N) return; 53 | 54 | __half val = __float2half(0.0f); 55 | if (i == j || i + 1 == j) { 56 | val = __float2half(1.0f); 57 | } 58 | 59 | k[j*LATTICE_SUB_N + i] = val; 60 | kT[i*LATTICE_SUB_N + j] = val; 61 | } 62 | 63 | __global__ void init_spins(__half* lattice, 64 | const unsigned long long seed, 65 | const int nbx, 66 | const int nby, 67 | const long long offset) { 68 | const long long tid = static_cast(blockDim.x) * blockIdx.x + threadIdx.x + offset; 69 | const long long nx = nbx * LATTICE_SUP_N; 70 | const long long ny = nby * LATTICE_SUP_N; 71 | if (tid >= nx * ny) return; 72 | 73 | curandStatePhilox4_32_10_t state; 74 | curand_init(seed, tid, 0, &state); 75 | float randval = curand_uniform(&state); 76 | __half val = (randval < 0.5f) ? 
__float2half(-1.0f) : __float2half(1.0f); 77 | 78 | lattice[tid] = val; 79 | } 80 | 81 | template 82 | struct __align__(sizeof(__half)*N) halfn { 83 | __half val[N]; 84 | }; 85 | 86 | #define NLOOPS 2 87 | #define SPINSPERTHREAD 8 88 | template 89 | __global__ void update_spins(__half* lattice, 90 | float inv_temp, 91 | const __half* __restrict__ nn_sums, 92 | const unsigned long long seed, 93 | const unsigned long long iter, 94 | const int nbx, 95 | const int nby, 96 | const long long offset) { 97 | const long long tid = static_cast(blockDim.x) * blockIdx.x + threadIdx.x + offset; 98 | 99 | const int threads_per_subblock = LATTICE_SUB_N * LATTICE_SUB_N / (NLOOPS * SPINSPERTHREAD); 100 | 101 | int bi = tid / threads_per_subblock % (2 * nbx); 102 | int bj = tid / (threads_per_subblock * 2 * nbx); 103 | 104 | // subblock local thread idx 105 | int tl = tid % threads_per_subblock; 106 | 107 | if (bj >= nby) return; 108 | 109 | // Offset threads depending on parity and color 110 | if (is_black) { 111 | if (bi % 2) { 112 | bj = 2*bj + 1; 113 | } else { 114 | bj = 2*bj; 115 | } 116 | } else { 117 | if (bi % 2) { 118 | bj = 2*bj; 119 | } else { 120 | bj = 2*bj + 1; 121 | } 122 | } 123 | 124 | curandStatePhilox4_32_10_t state; 125 | curand_init(seed, tid, iter, &state); 126 | 127 | #pragma unroll 128 | for (int n = 0; n < NLOOPS; n++) { 129 | size_t elem_offset = SUP_OFFSET(bi/2, bj/2, nbx) + SUB_OFFSET(bi%2, bj%2) + (tl + n * threads_per_subblock) * SPINSPERTHREAD; 130 | 131 | halfn lij = *(reinterpret_cast*>(lattice + elem_offset)); 132 | const halfn nn = *(reinterpret_cast*>(nn_sums + elem_offset)); 133 | 134 | #pragma unroll 135 | for (int m = 0; m < SPINSPERTHREAD; m++) { 136 | float randval = curand_uniform(&state); 137 | float accept = exp(-2.0f * inv_temp * __half2float(nn.val[m] * lij.val[m])); 138 | if (randval < accept) { 139 | lij.val[m] = -lij.val[m]; 140 | } 141 | } 142 | 143 | *reinterpret_cast*>(lattice + elem_offset) = lij; 144 | 145 | } 146 | } 147 | 148 
| template 149 | __global__ void add_boundaries(const __half* __restrict__ lattice, 150 | __half* nn_sums, 151 | const int nbx, 152 | const int nby, 153 | const long long offset) { 154 | const long long tid = static_cast(blockDim.x) * blockIdx.x + threadIdx.x + offset; 155 | 156 | // subblock i,j (1 thread block per subblock) 157 | int bi = tid / LATTICE_SUB_N % (2 * nbx); 158 | int bj = tid / (LATTICE_SUB_N * 2 * nbx); 159 | 160 | // subblock local i 161 | int il = tid % LATTICE_SUB_N; 162 | 163 | if (bj >= nby) return; 164 | 165 | // Offset threads depending on parity and color 166 | int jl, jb; 167 | if (is_black) { 168 | if (bi % 2) { 169 | bj = 2*bj + 1; 170 | jl = LATTICE_SUB_N - 1; 171 | jb = 0; 172 | } else { 173 | bj = 2*bj; 174 | jl = 0; 175 | jb = LATTICE_SUB_N - 1; 176 | } 177 | } else { 178 | if (bi % 2) { 179 | bj = 2*bj; 180 | jl = 0; 181 | jb = LATTICE_SUB_N - 1; 182 | } else { 183 | bj = 2*bj + 1; 184 | jl = LATTICE_SUB_N - 1; 185 | jb = 0; 186 | } 187 | } 188 | 189 | int bn = 2*nbx; 190 | int bm = 2*nby; 191 | int bin = (bi - 1 >= 0) ? bi - 1 : bn - 1; 192 | int bip = (bi + 1 < bn) ? bi + 1 : 0; 193 | int bjn = (bj - 1 >= 0) ? bj - 1 : bm - 1; 194 | int bjp = (bj + 1 < bm) ? bj + 1 : 0; 195 | 196 | // Update LR 197 | size_t boundary_offset; 198 | if (jl == 0) { 199 | boundary_offset = SUP_OFFSET(bi/2, bjn/2, nbx) + SUB_OFFSET(bi%2, bjn%2); 200 | } else { 201 | boundary_offset = SUP_OFFSET(bi/2, bjp/2, nbx) + SUB_OFFSET(bi%2, bjp%2); 202 | } 203 | 204 | size_t local_offset = SUP_OFFSET(bi/2, bj/2, nbx) + SUB_OFFSET(bi%2, bj%2); 205 | *(nn_sums + local_offset + SUB_ELEM(il, jl)) += *(lattice + boundary_offset + SUB_ELEM(il, jb)); 206 | 207 | 208 | // Update UD 209 | if (!is_black) { 210 | jl = (jl == 0) ? LATTICE_SUB_N - 1 : 0; 211 | jb = (jb == 0) ? 
LATTICE_SUB_N - 1 : 0;
  }

  if (jl == 0) {
    boundary_offset = SUP_OFFSET(bin/2, bj/2, nbx) + SUB_OFFSET(bin%2, bj%2);
  } else {
    boundary_offset = SUP_OFFSET(bip/2, bj/2, nbx) + SUB_OFFSET(bip%2, bj%2);
  }

  __half bval = *(lattice + boundary_offset + SUB_ELEM(jb, il));

  __syncthreads();

  *(nn_sums + local_offset + SUB_ELEM(jl, il)) += bval;

}

// Block the host until every device in [0, nGPUs) has finished its queued work.
void sync(int nGPUs) {
  // Sync all devices
  for (int dev = 0; dev < nGPUs; dev++) {
    CHECK_CUDA(cudaSetDevice(dev));
    CHECK_CUDA(cudaDeviceSynchronize());
  }
}

// Enqueue one single-color half sweep on every GPU (no host synchronization).
//
// For each device the helper issues, in order:
//   1. C  = A0 x B0   (batched half-precision GEMM, beta = 0)
//   2. C += A1 x B1   (batched half-precision GEMM, beta = 1)
//   3. add_boundaries (adds the contributions read from neighbouring blocks)
//   4. update_spins   (spin update using the accumulated nn_sums)
//
// A0/B0/A1/B1/C are the per-color pointer arrays built in main(); device `dev`
// processes the slice [dev * batchCountPerGPU, (dev + 1) * batchCountPerGPU).
// `step` is 2*iter for the first color and 2*iter + 1 for the second; it is
// scaled by NLOOPS * SPINSPERTHREAD and forwarded to update_spins.
//
// NOTE(review): this text was recovered from an extraction that stripped every
// "<...>" token. The launch configuration <<<blocksPerGPU, THREADS>>> is
// reconstructed from the offset argument (dev * blocksPerGPU * THREADS). The
// <is_black> template argument is an assumption: the original black and white
// launches pass identical arguments to add_boundaries, so the color has to be
// selected at compile time. Confirm against the kernel declarations.
template<bool is_black>
static void update_color(__half **A0, __half **B0, __half **A1, __half **B1, __half **C,
                         __half *lattice, float inv_temp, __half *nn_sums,
                         cublasHandle_t *cublas_handles, int step,
                         int nbx, int nby, unsigned long long seed, int nGPUs) {

  int batchCount = 2 * nbx * nby;
  int batchCountPerGPU = batchCount / nGPUs;

  __half alpha = __float2half(1.0f);
  __half beta0 = __float2half(0.0f);  // first GEMM overwrites C
  __half beta1 = __float2half(1.0f);  // second GEMM accumulates into C

  for (int dev = 0; dev < nGPUs; dev++) {
    const int first = dev * batchCountPerGPU;

    CHECK_CUDA(cudaSetDevice(dev));

    CHECK_CUBLAS(cublasGemmBatchedEx(cublas_handles[dev], CUBLAS_OP_N, CUBLAS_OP_N,
                                     LATTICE_SUB_N, LATTICE_SUB_N, LATTICE_SUB_N,
                                     &alpha, (void**) &A0[first], CUDA_R_16F, LATTICE_SUB_N,
                                     (void**) &B0[first], CUDA_R_16F, LATTICE_SUB_N, &beta0,
                                     (void**) &C[first], CUDA_R_16F, LATTICE_SUB_N, batchCountPerGPU,
                                     CUDA_R_16F, CUBLAS_GEMM_ALGO0_TENSOR_OP));

    CHECK_CUBLAS(cublasGemmBatchedEx(cublas_handles[dev], CUBLAS_OP_N, CUBLAS_OP_N,
                                     LATTICE_SUB_N, LATTICE_SUB_N, LATTICE_SUB_N,
                                     &alpha, (void**) &A1[first], CUDA_R_16F, LATTICE_SUB_N,
                                     (void**) &B1[first], CUDA_R_16F, LATTICE_SUB_N, &beta1,
                                     (void**) &C[first], CUDA_R_16F, LATTICE_SUB_N, batchCountPerGPU,
                                     CUDA_R_16F, CUBLAS_GEMM_ALGO0_TENSOR_OP));

    int blocks = (2 * nbx * nby);
    int blocksPerGPU = blocks / nGPUs;
    add_boundaries<is_black><<<blocksPerGPU, THREADS>>>(lattice, nn_sums, nbx, nby,
                                                        dev * ((long long)blocksPerGPU * THREADS));
    // Kernel launches do not return a status; pick up launch-configuration errors here.
    CHECK_CUDA(cudaGetLastError());

    blocks = (2 * nbx * nby * LATTICE_SUB_N) / (NLOOPS * SPINSPERTHREAD);
    blocksPerGPU = blocks / nGPUs;
    update_spins<is_black><<<blocksPerGPU, THREADS>>>(lattice, inv_temp, nn_sums, seed,
                                                      step * (NLOOPS * SPINSPERTHREAD), nbx, nby,
                                                      dev * ((long long)blocksPerGPU * THREADS));
    CHECK_CUDA(cudaGetLastError());
  }
}

// Perform one full lattice sweep: a black half sweep followed by a white half
// sweep, with a host-side synchronization of all devices after each.
//
//   Ab0..Cb         pointer arrays for the black batched GEMMs (see main())
//   Aw0..Cw         pointer arrays for the white batched GEMMs
//   lattice         spin storage
//   inv_temp        inverse temperature forwarded to update_spins
//   nn_sums         neighbour-sum storage written by the GEMMs / add_boundaries
//   cublas_handles  one cuBLAS handle per device
//   iter            sweep index; the half sweeps use steps 2*iter and 2*iter + 1
//   nbx, nby        lattice size in super blocks
//   seed            RNG seed forwarded to update_spins
//   nGPUs           number of devices the batch is split across
void update(__half **Ab0, __half **Bb0, __half **Ab1, __half **Bb1, __half **Cb,
            __half **Aw0, __half **Bw0, __half **Aw1, __half **Bw1, __half **Cw,
            __half *lattice, float inv_temp, __half *nn_sums, cublasHandle_t *cublas_handles, int iter,
            int nbx, int nby, unsigned long long seed, int nGPUs) {

  // Update black
  update_color<true>(Ab0, Bb0, Ab1, Bb1, Cb, lattice, inv_temp, nn_sums,
                     cublas_handles, 2*iter, nbx, nby, seed, nGPUs);

  // The white pass reads the spins written by the black pass.
  sync(nGPUs);

  // Update white
  update_color<false>(Aw0, Bw0, Aw1, Bw1, Cw, lattice, inv_temp, nn_sums,
                      cublas_handles, 2*iter + 1, nbx, nby, seed, nGPUs);

  sync(nGPUs);
}

// Copy the lattice to the host, undo the super-block / sub-lattice storage
// layout, and write it to `filename` as whitespace-separated text, one lattice
// row per line.
//
// Exits with EXIT_FAILURE if the host staging buffers cannot be allocated;
// reports (but does not abort on) a failure to open the output file.
void write_lattice(__half *lattice, std::string filename, int nbx, int nby, int nGPUs) {
  printf("Writing lattice to %s...\n", filename.c_str());

  // Widen before multiplying so the extents are computed in 64 bits.
  long long nx = (long long) nbx * LATTICE_SUP_N;
  long long ny = (long long) nby * LATTICE_SUP_N;

  __half* lattice_h = (__half*) malloc(nx * ny * sizeof(*lattice_h));
  float* lattice_true_h = (float*) malloc(nx * ny * sizeof(*lattice_true_h));
  if (!lattice_h || !lattice_true_h) {
    fprintf(stderr, "ERROR: Failed to allocate host buffers for lattice output.\n");
    free(lattice_h);
    free(lattice_true_h);
    exit(EXIT_FAILURE);
  }

  // long long (not long): the per-GPU spin count must not be narrowed on
  // platforms where long is 32 bits.
  long long spinsPerGPU = nx * (ny/nGPUs);

  // Copy out full lattice to host
  for (int dev = 0; dev < nGPUs; dev++) {
    CHECK_CUDA(cudaSetDevice(dev));
    CHECK_CUDA(cudaMemcpy(&lattice_h[dev * spinsPerGPU], &lattice[dev * spinsPerGPU],
                          spinsPerGPU * sizeof(*lattice_h), cudaMemcpyDeviceToHost));
  }

  // Interleave the four sub-lattices of every super block back into row-major order
  for (int bj = 0; bj < nby; bj++) {
    for (int bi = 0; bi < nbx; bi++) {
      __half* l00 = lattice_h + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(0, 0);
      __half* l01 = lattice_h + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(0, 1);
      __half* l10 = lattice_h + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(1, 0);
      __half* l11 = lattice_h + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(1, 1);

      long long offset = (bj * LATTICE_SUP_N) * nx + (bi * LATTICE_SUP_N);
      for (int j = 0; j < LATTICE_SUB_N; j++) {
        for (int i = 0; i < LATTICE_SUB_N; i++) {
          lattice_true_h[offset + (2*j) * nx + (2*i)] = __half2float(*(l00 + SUB_ELEM(i, j)));
          lattice_true_h[offset + (2*j + 1) * nx + (2*i + 1)] = __half2float(*(l11 + SUB_ELEM(i, j)));
          lattice_true_h[offset + (2*j) * nx + (2*i + 1)] = __half2float(*(l10 + SUB_ELEM(i, j)));
          lattice_true_h[offset + (2*j + 1) * nx + (2*i)] = __half2float(*(l01 + SUB_ELEM(i, j)));
        }
      }
    }
  }

  // Write file
  std::ofstream f;
  f.open(filename);
  if (f.is_open()) {
    for (long long j = 0; j < ny; j++) {
      for (long long i = 0; i < nx; i++) {
        f << lattice_true_h[j * nx + i] << " ";
      }
      // '\n' rather than std::endl: same bytes, without a flush per row.
      f << '\n';
    }
    f.close();
  } else {
    fprintf(stderr, "ERROR: Could not open %s for writing.\n", filename.c_str());
  }

  free(lattice_h);
  free(lattice_true_h);
}

// Print the command-line help to stdout and exit with EXIT_SUCCESS.
// `pname` is the program path (argv[0]); only its final component is shown.
static void usage(const char *pname) {

  // Strip leading directory components.
  std::string bname(pname);
  const std::string::size_type slash = bname.rfind('/');
  if (slash != std::string::npos) {
    bname = bname.substr(slash + 1);
  }

  // NOTE(review): the <...> argument placeholders were missing from the
  // recovered text (each option line ended in a bare space). They are restored
  // here because every one of these options takes a required argument; confirm
  // the exact wording against the upstream file.
  fprintf(stdout,
          "Usage: %s [options]\n"
          "options:\n"
          "\t-x|--lattice-nbx <LATTICE_NBX>\n"
          "\t\tnumber of blocks along lattice rows (number of rows / 256)\n"
          "\n"
          "\t-y|--lattice-nby <LATTICE_NBY>\n"
          "\t\tnumber of blocks along lattice columns (number of columns / 256)\n"
          "\n"
          "\t-g|--ngpus <NGPUS>\n"
          "\t\tnumber of GPUs to use for simulation\n"
          "\n"
          "\t-w|--nwarmup <NWARMUP>\n"
          "\t\tnumber of warmup iterations\n"
          "\n"
          "\t-n|--niters <NITERS>\n"
          "\t\tnumber of trial iterations\n"
          "\n"
          "\t-a|--alpha <ALPHA>\n"
          "\t\tcoefficient of critical temperature\n"
          "\n"
          "\t-s|--seed <SEED>\n"
          "\t\tseed for random number generation\n"
          "\n"
          "\t-o|--write-lattice\n"
          "\t\twrite final lattice configuration to file\n\n",
          bname.c_str());
  exit(EXIT_SUCCESS);
}

// Entry point: parse options, allocate and initialize the lattice across the
// requested GPUs, run the warmup and timed sweeps, print a performance report
// and the average magnetization, and optionally write the final lattice to
// "final.txt".
int main(int argc, char **argv) {

  // Defaults
  int nbx = 10; // Lattice rows dimension (in number of super blocks)
  int nby = 10; // Lattice columns dimension (in number of super blocks)
  float alpha = 0.1f; // coefficient of critical temperature
  int niter = 1000;
  int nwarmup = 100;
  bool write = false;
  int nGPUs = 1;
  unsigned long long seed = 1234ULL;

  while (1) {
    static struct option long_options[] = {
      { "lattice-nbx", required_argument, 0, 'x'},
      { "lattice-nby", required_argument, 0, 'y'},
      { "ngpus", required_argument, 0, 'g'},
      // --alpha is documented by usage() and handled below via -a.
      { "alpha", required_argument, 0, 'a'},
      { "seed", required_argument, 0, 's'},
      { "nwarmup", required_argument, 0, 'w'},
      // usage() documents --niters; --niter is kept for backward compatibility.
      { "niters", required_argument, 0, 'n'},
      { "niter", required_argument, 0, 'n'},
      { "write-lattice", no_argument, 0, 'o'},
      { "help", no_argument, 0, 'h'},
      { 0, 0, 0, 0}
    };

    int option_index = 0;
    int ch = getopt_long(argc, argv, "x:y:g:a:s:w:n:oh", long_options, &option_index);
    if (ch == -1) break;

    switch(ch) {
      case 0:
        break;
      case 'x':
        nbx = atoi(optarg); break;
      case 'y':
        nby = atoi(optarg); break;
      case 'g':
        nGPUs = atoi(optarg); break;
      case 'a':
        alpha = atof(optarg); break;
      case 's':
        // The seed is unsigned; parse it as such instead of going through atoll.
        seed = strtoull(optarg, NULL, 10); break;
      case 'w':
        nwarmup = atoi(optarg); break;
      case 'n':
        niter = atoi(optarg); break;
      case 'o':
        write = true; break;
      case 'h':
        usage(argv[0]); break;
      case '?':
        exit(EXIT_FAILURE);
      default:
        fprintf(stderr, "unknown option: %c\n", ch);
        exit(EXIT_FAILURE);
    }
  }

  // Reject values that would lead to a division by zero or an empty lattice below.
  if (nbx <= 0 || nby <= 0 || nGPUs <= 0) {
    fprintf(stderr, "ERROR: Lattice block counts and number of GPUs must be positive.\n");
    exit(EXIT_FAILURE);
  }

  if (nby % nGPUs != 0) {
    fprintf(stderr, "ERROR: Number of super blocks in y dimension must be multiple of number of gpus.\n");
    exit(EXIT_FAILURE);
  }

  long long nx = (long long) nbx * LATTICE_SUP_N;
  long long ny = (long long) nby * LATTICE_SUP_N;

  __half* lattice;
  __half* nn_sums;
  __half* k;
  __half* kT;
  CHECK_CUDA(cudaMallocManaged(&lattice, nx * ny * sizeof(*lattice)));
  CHECK_CUDA(cudaMallocManaged(&nn_sums, nx * ny * sizeof(*nn_sums)));
  CHECK_CUDA(cudaMallocManaged(&k, LATTICE_SUB_N * LATTICE_SUB_N * sizeof(*k)));
  CHECK_CUDA(cudaMallocManaged(&kT, LATTICE_SUB_N * LATTICE_SUB_N * sizeof(*kT)));

  // k and kT are used by every device as GEMM operands.
  for (int dev = 0; dev < nGPUs; dev++) {
    CHECK_CUDA(cudaMemAdvise(k, LATTICE_SUB_N * LATTICE_SUB_N * sizeof(*k), cudaMemAdviseSetReadMostly, dev));
    CHECK_CUDA(cudaMemAdvise(k, LATTICE_SUB_N * LATTICE_SUB_N * sizeof(*k), cudaMemAdviseSetAccessedBy, dev));
    CHECK_CUDA(cudaMemAdvise(kT, LATTICE_SUB_N * LATTICE_SUB_N * sizeof(*kT), cudaMemAdviseSetReadMostly, dev));
    CHECK_CUDA(cudaMemAdvise(kT, LATTICE_SUB_N * LATTICE_SUB_N * sizeof(*kT), cudaMemAdviseSetAccessedBy, dev));
  }

  // Each device gets a contiguous slice of the lattice and of nn_sums.
  long long spinsPerGPU = nx * ny / nGPUs;
  for (int dev = 0; dev < nGPUs; dev++) {
    CHECK_CUDA(cudaMemAdvise(&lattice[dev * spinsPerGPU], spinsPerGPU * sizeof(*lattice), cudaMemAdviseSetPreferredLocation, dev));
    CHECK_CUDA(cudaMemAdvise(&nn_sums[dev * spinsPerGPU], spinsPerGPU * sizeof(*nn_sums), cudaMemAdviseSetPreferredLocation, dev));
  }

  cublasHandle_t* cublas_handles;
  cublas_handles = (cublasHandle_t*) malloc(nGPUs * sizeof(cublasHandle_t));
  if (!cublas_handles) {
    fprintf(stderr, "ERROR: Failed to allocate cuBLAS handle array.\n");
    exit(EXIT_FAILURE);
  }
  for (int dev = 0; dev < nGPUs; dev++) {
    CHECK_CUDA(cudaSetDevice(dev));
    CHECK_CUBLAS(cublasCreate(&cublas_handles[dev]));
    CHECK_CUBLAS(cublasSetMathMode(cublas_handles[dev], CUBLAS_TENSOR_OP_MATH));
  }

  // Setup k and k transpose matrices
  // NOTE(review): launch configurations in this function were stripped by the
  // extraction and are reconstructed from the block counts computed next to
  // each launch.
  CHECK_CUDA(cudaSetDevice(0));
  int blocks = (LATTICE_SUB_N * LATTICE_SUB_N + THREADS - 1) / THREADS;
  set_k<<<blocks, THREADS>>>(k, kT);
  CHECK_CUDA(cudaGetLastError());

  // Initialize lattice spins randomly
  blocks = (nx * ny + THREADS - 1) / THREADS;
  for (int dev = 0; dev < nGPUs; dev++) {
    CHECK_CUDA(cudaSetDevice(dev));
    int blocksPerGPU = blocks/nGPUs;
    init_spins<<<blocksPerGPU, THREADS>>>(lattice, seed, nbx, nby, dev * nx * (ny/nGPUs));
    CHECK_CUDA(cudaGetLastError());
  }

  sync(nGPUs);

  // Setup pointers for batched GEMMS
  __half **Ab0, **Bb0;
  __half **Ab1, **Bb1;
  __half **Aw0, **Bw0;
  __half **Aw1, **Bw1;
  __half **Cb, **Cw;

  int batchCount = 2 * (nbx * nby);
  int batchCountPerGPU = batchCount / nGPUs;

  // All ten pointer arrays are allocated, advised and later freed identically.
  __half ***ptr_arrays[] = {&Ab0, &Bb0, &Ab1, &Bb1, &Aw0, &Bw0, &Aw1, &Bw1, &Cb, &Cw};
  const int n_ptr_arrays = (int) (sizeof(ptr_arrays) / sizeof(ptr_arrays[0]));

  for (int a = 0; a < n_ptr_arrays; a++) {
    CHECK_CUDA(cudaMallocManaged(ptr_arrays[a], batchCount * sizeof(__half*)));
  }

  for (int dev = 0; dev < nGPUs; dev++) {
    for (int a = 0; a < n_ptr_arrays; a++) {
      __half **arr = *ptr_arrays[a];
      CHECK_CUDA(cudaMemAdvise(&arr[dev * batchCountPerGPU], batchCountPerGPU * sizeof(__half*),
                               cudaMemAdviseSetPreferredLocation, dev));
    }
  }

  int idx = 0;

  for (int bj = 0; bj < nby; bj++) {
    for (int bi = 0; bi < nbx; bi++) {
      __half* nn_sums00 = nn_sums + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(0, 0);
      __half* nn_sums11 = nn_sums + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(1, 1);
      __half* nn_sums01 = nn_sums + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(0, 1);
      __half* nn_sums10 = nn_sums + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(1, 0);
      __half* lat00 = lattice + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(0, 0);
      __half* lat11 = lattice + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(1, 1);
      __half* lat01 = lattice + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(0, 1);
      __half* lat10 = lattice + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(1, 0);

      // Black:
      //nn_sum(0,0) = lattice(0,1) x K + K^T x lattice(1,0)
      //nn_sum(1,1) = lattice(1,0) x K^T + K x lattice(0,1)
      Ab0[idx  ] = lat01; Bb0[idx  ] = k;
      Ab0[idx+1] = lat10; Bb0[idx+1] = kT;

      Ab1[idx  ] = kT; Bb1[idx  ] = lat10;
      Ab1[idx+1] = k;  Bb1[idx+1] = lat01;

      Cb[idx  ] = nn_sums00;
      Cb[idx+1] = nn_sums11;

      // White:
      //nn_sum(1,0) = lattice(1,1) x K + K x lattice(0,0)
      //nn_sum(0,1) = lattice(0,0) x K^T + K^T x lattice(1,1)
      Aw0[idx  ] = lat00; Bw0[idx  ] = kT;
      Aw0[idx+1] = lat11; Bw0[idx+1] = k;

      Aw1[idx  ] = kT; Bw1[idx  ] = lat11;
      Aw1[idx+1] = k;  Bw1[idx+1] = lat00;

      Cw[idx  ] = nn_sums01;
      Cw[idx+1] = nn_sums10;

      idx += 2;

    }
  }

  sync(nGPUs);

  float inv_temp = 1.0f / (alpha*TCRIT);

  // Warmup
  printf("Starting warmup...\n");
  for (int n = 0; n < nwarmup; n++) {
    update(Ab0, Bb0, Ab1, Bb1, Cb, Aw0, Bw0, Aw1, Bw1, Cw,
           lattice, inv_temp, nn_sums, cublas_handles, n+1, nbx, nby, seed, nGPUs);
  }

  sync(nGPUs);
  printf("Starting trial iterations...\n");
  auto t0 = std::chrono::high_resolution_clock::now();

  for (int n = nwarmup; n < niter + nwarmup; n++) {
    update(Ab0, Bb0, Ab1, Bb1, Cb, Aw0, Bw0, Aw1, Bw1, Cw,
           lattice, inv_temp, nn_sums, cublas_handles, n+1, nbx, nby, seed, nGPUs);
    if ((n - nwarmup) % 1000 == 0) printf("Completed %d/%d iterations...\n", n - nwarmup + 1, niter);
  }

  sync(nGPUs);
  auto t1 = std::chrono::high_resolution_clock::now();

  // Microseconds: the report below scales by 1e-6 to print seconds.
  double duration = (double) std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
  printf("REPORT:\n");
  printf("\tnGPUs: %d\n", nGPUs);
  printf("\ttemperature: %f * %f\n", alpha, TCRIT);
  printf("\tseed: %llu\n", seed);
  printf("\twarmup iterations: %d\n", nwarmup);
  printf("\ttrial iterations: %d\n", niter);
  printf("\tlattice dimensions: %lld x %lld\n", nx, ny);
  printf("\telapsed time: %f sec\n", duration * 1e-6);
  printf("\tupdates per ns: %f\n", (double) (nx * ny) * niter / duration * 1e-3);

  // Compute average magnetism
  // Each device reduces its slice in pieces of at most CUB_CHUNK_SIZE spins;
  // the per-chunk partial sums are added up on the host.
  double* devsums;
  int nchunks = (spinsPerGPU + CUB_CHUNK_SIZE - 1)/ CUB_CHUNK_SIZE;
  CHECK_CUDA(cudaMallocManaged(&devsums, nGPUs * nchunks * sizeof(*devsums)));
  for (int dev = 0 ; dev < nGPUs; dev++) {
    CHECK_CUDA(cudaSetDevice(dev));
    size_t cub_workspace_bytes = 0;
    void* workspace = NULL;

    // First call with a NULL workspace only queries the required workspace size.
    CHECK_CUDA(cub::DeviceReduce::Sum(workspace, cub_workspace_bytes, &lattice[dev * spinsPerGPU], &devsums[dev*nchunks], CUB_CHUNK_SIZE));
    CHECK_CUDA(cudaMalloc(&workspace, cub_workspace_bytes));

    for (int n = 0; n < nchunks; n++) {
      CHECK_CUDA(cub::DeviceReduce::Sum(workspace, cub_workspace_bytes, &lattice[dev * spinsPerGPU + n*CUB_CHUNK_SIZE],
                                        &devsums[dev * nchunks + n], std::min((long long) CUB_CHUNK_SIZE, spinsPerGPU - n * CUB_CHUNK_SIZE)));
    }
    CHECK_CUDA(cudaFree(workspace));
  }

  sync(nGPUs);

  double hostsum = 0;
  for (int n = 0; n < nGPUs * nchunks; n++) {
    hostsum += devsums[n];
  }
  // fabs, not abs: guarantees the floating-point overload is used.
  std::cout << "\taverage magnetism (absolute): " << fabs(hostsum / (nx * ny)) << std::endl;

  CHECK_CUDA(cudaFree(devsums));

  if (write) write_lattice(lattice, "final.txt", nbx, nby, nGPUs);

  // Release cuBLAS handles and managed allocations.
  for (int dev = 0; dev < nGPUs; dev++) {
    CHECK_CUDA(cudaSetDevice(dev));
    CHECK_CUBLAS(cublasDestroy(cublas_handles[dev]));
  }
  free(cublas_handles);

  for (int a = 0; a < n_ptr_arrays; a++) {
    CHECK_CUDA(cudaFree(*ptr_arrays[a]));
  }
  CHECK_CUDA(cudaFree(kT));
  CHECK_CUDA(cudaFree(k));
  CHECK_CUDA(cudaFree(nn_sums));
  CHECK_CUDA(cudaFree(lattice));

  return 0;
}

-------------------------------------------------------------------------------- /tensorcore/plot_ising.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | lattice = np.loadtxt("final.txt", dtype=np.int32) 5 | plt.imshow(lattice) 6 | plt.title('Final Lattice Configuration') 7 | plt.colorbar() 8 | plt.show() 9 | 10 | -------------------------------------------------------------------------------- /tensorcore/sample_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ising-gpu/935796d7a26016670363af7a0dced8a9ebcd4714/tensorcore/sample_plot.png --------------------------------------------------------------------------------