├── .gitmodules
├── LICENSE
├── README.md
├── basic_cuda
    ├── Makefile
    ├── README.md
    ├── cudamacro.h
    ├── main.cu
    ├── plot_ising.py
    └── sample_plot.png
├── basic_python
    ├── README.md
    ├── ising_basic.py
    ├── plot_ising_multi.py
    └── sample_plot.png
├── optimized
    ├── cuBlumeCapel
    │   ├── LICENSE
    │   ├── Makefile
    │   ├── README.md
    │   ├── cudamacro.h
    │   ├── main.cu
    │   ├── utils.c
    │   ├── utils.h
    │   ├── vmm_alloc.cu
    │   └── vmm_alloc.h
    ├── cuIsingModel
    │   ├── LICENSE
    │   ├── Makefile
    │   ├── README.md
    │   ├── cudamacro.h
    │   ├── main.cu
    │   ├── utils.c
    │   ├── utils.h
    │   ├── vmm_alloc.cu
    │   └── vmm_alloc.h
    └── old
    │   ├── Makefile
    │   ├── README.md
    │   ├── cudamacro.h
    │   ├── images
    │       └── lattice_8192x8192_T_1.500000_IT_00001024_0.txt.png
    │   ├── main.cu
    │   ├── plotLattice.py
    │   ├── utils.c
    │   └── utils.h
└── tensorcore
    ├── Makefile
    ├── README.md
    ├── cudamacro.h
    ├── main.cu
    ├── plot_ising.py
    └── sample_plot.png


/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "external/cub"]
2 | 	path = external/cub
3 | 	url = https://github.com/NVlabs/cub.git
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 NVIDIA Corporation
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### GPU-accelerated Monte Carlo simulations of 2D Ising Model
2 | This repository contains several implementations of the checkerboard Metropolis algorithm to simulate the 2D Ising model, which are referred to in a paper in preparation ([link](https://arxiv.org/abs/1906.06297)).
3 | 
4 | ### License
5 | This code is released under an MIT license which can be found in `LICENSE`. 
6 | 


--------------------------------------------------------------------------------
/basic_cuda/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_HOME?=/usr/local/cuda
 2 | CUDACC=$(CUDA_HOME)/bin/nvcc
 3 | CC=gcc
 4 | LD=$(CUDACC)
 5 | LDFLAGS=-lcurand
 6 | CFLAGS=-c -O3 -g -I$(CUDA_HOME)/include
 7 | CUDACFLAGS= -std=c++11 -c -O3 -lineinfo -arch=sm_70 -Xptxas=-v -I../external/cub
 8 | .PHONY: all clean
 9 | all: ising_basic
10 | 
11 | ising_basic: main.o
12 | 	$(LD) -o ising_basic main.o $(LDFLAGS)
13 | # Build objects with the configured $(CUDACC); CUDACFLAGS already carries -c. Rebuild when cudamacro.h changes.
14 | %.o: %.cu cudamacro.h
15 | 	$(CUDACC) $(CUDACFLAGS) $< -o $@
16 | 
17 | clean:
18 | 	rm -f *.o ising_basic
19 | 


--------------------------------------------------------------------------------
/basic_cuda/README.md:
--------------------------------------------------------------------------------
 1 | ### Basic Implementation using CUDA C
 2 | 
 3 | ### Basic Usage
 4 | Compile binary with `make`.
 5 | 
 6 | Example run command:
 7 | 
 8 | `./ising_basic -x <rows> -y <columns> -n <number of iterations> `
 9 | 
10 | Run `./ising_basic --help` for more options.
11 | 
12 | ### Visualizing Results
13 | The `-o` flag writes the final lattice configuration to the text file `final.txt`. Use the provided `plot_ising.py` script to visualize the output.
14 | 
15 | For example:
16 | ```
17 | $ ./ising_basic -x 2048 -y 2048 -n 100 -a 0.5 -o
18 | ...
19 | Writing lattice to final.txt...
20 | 
21 | $ python plot_ising.py
22 | ```
23 | 
24 | This will produce the following output:
25 | 
26 | ![sample_plot.png](sample_plot.png)
27 | 


--------------------------------------------------------------------------------
/basic_cuda/cudamacro.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Permission is hereby granted, free of charge, to any person obtaining a
 5 |  * copy of this software and associated documentation files (the "Software"),
 6 |  * to deal in the Software without restriction, including without limitation
 7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 8 |  * and/or sell copies of the Software, and to permit persons to whom the
 9 |  * Software is furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 |  * DEALINGS IN THE SOFTWARE.
21 |  */
22 | #ifndef __CUDA_MACRO_H__
23 | #define __CUDA_MACRO_H__
24 | 
25 | #define CHECK_CUDA(call) {                                                   \
26 |     cudaError_t err = call;                                                  \
27 |     if( cudaSuccess != err) {                                                \
28 |         fprintf(stderr, "CUDA error in file '%s' in line %i : %s.\n",        \
29 |                 __FILE__, __LINE__, cudaGetErrorString( err) );              \
30 |         exit(EXIT_FAILURE);                                                  \
31 |     }}
32 | 
33 | #define CHECK_CUBLAS(call) {                                                 \
34 |     cublasStatus_t status = call;                                            \
35 |     if( CUBLAS_STATUS_SUCCESS != status) {                                   \
36 |         fprintf(stderr, "CUBLAS error: %s = %d at (%s:%d)\n", #call,         \
37 |                 status, __FILE__, __LINE__);                                 \
38 |         exit(EXIT_FAILURE);                                                  \
39 |     }}
40 | 
41 | #define CHECK_CURAND(call) {                                                 \
42 |     curandStatus_t status = call;                                            \
43 |     if( CURAND_STATUS_SUCCESS != status) {                                   \
44 |         fprintf(stderr, "CURAND error: %s = %d at (%s:%d)\n", #call,         \
45 |                 status, __FILE__, __LINE__);                                 \
46 |         exit(EXIT_FAILURE);                                                  \
47 |     }}
48 | 
49 | #define CHECK_ERROR(errorMessage) {                                          \
50 |     cudaError_t err = cudaGetLastError();                                    \
51 |     if( cudaSuccess != err) {                                                \
52 |         fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
53 |                 errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
54 |         exit(EXIT_FAILURE);                                                  \
55 |     }}
56 | #endif
57 | 


--------------------------------------------------------------------------------
/basic_cuda/main.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * Permission is hereby granted, free of charge, to any person obtaining a
  5 |  * copy of this software and associated documentation files (the "Software"),
  6 |  * to deal in the Software without restriction, including without limitation
  7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8 |  * and/or sell copies of the Software, and to permit persons to whom the
  9 |  * Software is furnished to do so, subject to the following conditions:
 10 |  *
 11 |  * The above copyright notice and this permission notice shall be included in
 12 |  * all copies or substantial portions of the Software.
 13 |  *
 14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 20 |  * DEALINGS IN THE SOFTWARE.
 21 |  */
 22 | 
 23 | #include <chrono>
 24 | #include <fstream>
 25 | #include <getopt.h>
 26 | #include <iostream>
 27 | #include <string>
 28 | 
 29 | #include <cuda_fp16.h>
 30 | #include <curand.h>
 31 | #include <cublas_v2.h>
 32 | 
 33 | #include <cub/cub.cuh>
 34 | #define CUB_CHUNK_SIZE ((1ll<<31) - (1ll<<28))
 35 | 
 36 | #include "cudamacro.h"
 37 | 
 38 | #define TCRIT 2.26918531421f
 39 | #define THREADS  128
 40 | 
 41 | // Give every site of an nx * ny array a random spin: -1 if its uniform sample is below 0.5, +1 otherwise
 42 | __global__ void init_spins(signed char* lattice,
 43 |                            const float* __restrict__ randvals,
 44 |                            const long long nx,
 45 |                            const long long ny) {
 46 |   const long long site = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
 47 |   const long long nsites = nx * ny;
 48 |   if (site < nsites) {
 49 |     // One thread per site; threads whose index is past the last site do nothing
 50 |     lattice[site] = (randvals[site] < 0.5f) ? -1 : 1;
 51 |   }
 52 | }
 53 | 
 54 | template<bool is_black>
 55 | __global__ void update_lattice(signed char* lattice,
 56 |                                const signed char* __restrict__ op_lattice,
 57 |                                const float* __restrict__ randvals,
 58 |                                const float inv_temp,
 59 |                                const long long nx,
 60 |                                const long long ny) {
 61 |   const long long tid = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
 62 |   const int row = tid / ny;
 63 |   const int col = tid % ny;
 64 | 
 65 |   if (row >= nx || col >= ny) return;
 66 | 
 67 |   // Periodic neighbours: stepping off one edge of the grid wraps to the opposite edge
 68 |   const int row_dn = (row + 1 < nx) ? row + 1 : 0;
 69 |   const int row_up = (row > 0) ? row - 1 : nx - 1;
 70 |   const int col_rt = (col + 1 < ny) ? col + 1 : 0;
 71 |   const int col_lf = (col > 0) ? col - 1 : ny - 1;
 72 | 
 73 |   // The off-column neighbour depends on the colour being updated and on row parity:
 74 |   const bool odd_row = (row % 2) != 0;
 75 |   // black takes the +1 column on odd rows and the -1 column on even rows; white is the reverse
 76 |   const int col_off = (is_black == odd_row) ? col_rt : col_lf;
 77 | 
 78 |   // Sum the four nearest-neighbour spins, all read from the opposite-colour array
 79 |   const long long here = row * ny + col;
 80 |   signed char nn_sum = op_lattice[row_up * ny + col] + op_lattice[here]
 81 |                      + op_lattice[row_dn * ny + col] + op_lattice[row * ny + col_off];
 82 | 
 83 |   // Flip the spin when the uniform sample is below exp(-2 * inv_temp * nn_sum * spin)
 84 |   const signed char spin = lattice[here];
 85 |   const float acceptance_ratio = exp(-2.0f * inv_temp * nn_sum * spin);
 86 |   const bool flip = randvals[here] < acceptance_ratio;
 87 |   if (flip) {
 88 |     lattice[here] = -spin;
 89 |   }
 90 | }
 91 | 
 92 | // Copy both colour arrays (nx * ny/2 spins each) from the device, interleave them into the full nx-by-ny lattice and write it as text to `filename`, one row per line
 93 | void write_lattice(signed char *lattice_b, signed char *lattice_w, std::string filename, long long nx, long long ny) {
 94 |   printf("Writing lattice to %s...\n", filename.c_str());
 95 |   signed char *lattice_h, *lattice_b_h, *lattice_w_h;
 96 |   lattice_h = (signed char*) malloc(nx * ny * sizeof(*lattice_h));
 97 |   lattice_b_h = (signed char*) malloc(nx * ny/2 * sizeof(*lattice_b_h));
 98 |   lattice_w_h = (signed char*) malloc(nx * ny/2 * sizeof(*lattice_w_h));
 99 |   if (nx * ny > 0 && (!lattice_h || !lattice_b_h || !lattice_w_h)) { fprintf(stderr, "ERROR: host allocation failed while writing lattice.\n"); exit(EXIT_FAILURE); }
100 |   CHECK_CUDA(cudaMemcpy(lattice_b_h, lattice_b, nx * ny/2 * sizeof(*lattice_b), cudaMemcpyDeviceToHost));
101 |   CHECK_CUDA(cudaMemcpy(lattice_w_h, lattice_w, nx * ny/2 * sizeof(*lattice_w), cudaMemcpyDeviceToHost));  // white spins come from lattice_w
102 |   // Checkerboard layout: black occupies even columns on even rows and odd columns on odd rows
103 |   for (long long i = 0; i < nx; i++) {
104 |     for (long long j = 0; j < ny/2; j++) {
105 |       if (i % 2) {
106 |         lattice_h[i*ny + 2*j+1] = lattice_b_h[i*ny/2 + j];
107 |         lattice_h[i*ny + 2*j] = lattice_w_h[i*ny/2 + j];
108 |       } else {
109 |         lattice_h[i*ny + 2*j] = lattice_b_h[i*ny/2 + j];
110 |         lattice_h[i*ny + 2*j+1] = lattice_w_h[i*ny/2 + j];
111 |       }
112 |     }
113 |   }
114 | 
115 |   std::ofstream f;
116 |   f.open(filename);
117 |   if (f.is_open()) {
118 |     for (long long i = 0; i < nx; i++) {
119 |       for (long long j = 0; j < ny; j++) {
120 |          f << (int)lattice_h[i * ny + j] << " ";
121 |       }
122 |       f << '\n';  // plain newline: no need to flush the stream after every row
123 |     }
124 |   } else { fprintf(stderr, "ERROR: could not open %s for writing.\n", filename.c_str()); }
125 |   f.close();
126 | 
127 |   free(lattice_h);
128 |   free(lattice_b_h);
129 |   free(lattice_w_h);
130 | }
131 | 
132 | void update(signed char *lattice_b, signed char *lattice_w, float* randvals, curandGenerator_t rng, float inv_temp, long long nx, long long ny) {
133 |   // One full sweep of the nx-by-ny lattice: black sites first, then white sites, each with fresh random numbers in randvals
134 |   // Setup CUDA launch configuration: one thread per site of a single colour (nx * ny/2 sites)
135 |   int blocks = (nx * ny/2 + THREADS - 1) / THREADS;
136 |   // Update black
137 |   CHECK_CURAND(curandGenerateUniform(rng, randvals, nx*ny/2));
138 |   update_lattice<true><<<blocks, THREADS>>>(lattice_b, lattice_w, randvals, inv_temp, nx, ny/2);
139 |   CHECK_ERROR("update_lattice<true> launch");
140 |   // Update white
141 |   CHECK_CURAND(curandGenerateUniform(rng, randvals, nx*ny/2));
142 |   update_lattice<false><<<blocks, THREADS>>>(lattice_w, lattice_b, randvals, inv_temp, nx, ny/2);
143 |   CHECK_ERROR("update_lattice<false> launch");
144 | }
145 | 
146 | static void usage(const char *pname) {
147 |   // Print the option summary, labelled with the base name of pname, to stdout and exit successfully
148 |   const char *bname = rindex(pname, '/');
149 |   if (!bname) {bname = pname;}
150 |   else        {bname++;}
151 |   fprintf(stdout,
152 |           "Usage: %s [options]\n"
153 |           "options:\n"
154 |           "\t-x|--lattice-n <LATTICE_N>\n"
155 |           "\t\tnumber of lattice rows\n"
156 |           "\n"
157 |           "\t-y|--lattice-m <LATTICE_M>\n"
158 |           "\t\tnumber of lattice columns\n"
159 |           "\n"
160 |           "\t-w|--nwarmup <NWARMUP>\n"
161 |           "\t\tnumber of warmup iterations\n"
162 |           "\n"
163 |           "\t-n|--niters <NITERS>\n"
164 |           "\t\tnumber of trial iterations\n"
165 |           "\n"
166 |           "\t-a|--alpha <ALPHA>\n"
167 |           "\t\tcoefficient of critical temperature\n"
168 |           "\n"
169 |           "\t-s|--seed <SEED>\n"
170 |           "\t\tseed for random number generation\n"
171 |           "\n"
172 |           "\t-o|--write-lattice\n"
173 |           "\t\twrite final lattice configuration to file\n"
174 |           "\n\t-h|--help\n\t\tprint this usage message and exit\n\n",
175 |           bname);
176 |   exit(EXIT_SUCCESS);
177 | }
178 | 
179 | int main(int argc, char **argv) {
180 |   // Parse options, initialize random black/white lattices, run warmup and timed sweeps, then report timing and magnetization
181 |   // Defaults
182 |   long long nx = 5120;
183 |   long long ny = 5120;
184 |   float alpha = 0.1f;
185 |   int nwarmup = 100;
186 |   int niters = 1000;
187 |   bool write = false;
188 |   unsigned long long seed = 1234ULL;
189 | 
190 |   while (1) {
191 |     static struct option long_options[] = {
192 |         {     "lattice-n", required_argument, 0, 'x'},
193 |         {     "lattice-m", required_argument, 0, 'y'},
194 |         {         "alpha", required_argument, 0, 'a'},
195 |         {          "seed", required_argument, 0, 's'},
196 |         {       "nwarmup", required_argument, 0, 'w'},
197 |         {        "niters", required_argument, 0, 'n'},
198 |         { "write-lattice",       no_argument, 0, 'o'},
199 |         {          "help",       no_argument, 0, 'h'},
200 |         {               0,                 0, 0,   0}
201 |     };
202 | 
203 |     int option_index = 0;
204 |     int ch = getopt_long(argc, argv, "x:y:a:s:w:n:oh", long_options, &option_index);
205 |     if (ch == -1) break;
206 | 
207 |     switch(ch) {
208 |       case 0:
209 |         break;
210 |       case 'x':
211 |         nx = atoll(optarg); break;
212 |       case 'y':
213 |         ny = atoll(optarg); break;
214 |       case 'a':
215 |         alpha = atof(optarg); break;
216 |       case 's':
217 |         seed = atoll(optarg); break;
218 |       case 'w':
219 |         nwarmup = atoi(optarg); break;
220 |       case 'n':
221 |         niters = atoi(optarg); break;
222 |       case 'o':
223 |         write = true; break;
224 |       case 'h':
225 |         usage(argv[0]); break;
226 |       case '?':
227 |         exit(EXIT_FAILURE);
228 |       default:
229 |         fprintf(stderr, "unknown option: %c\n", ch);
230 |         exit(EXIT_FAILURE);
231 |     }
232 |   }
233 | 
234 |   // Check arguments
235 |   if (nx <= 0 || ny <= 0 || nx % 2 != 0 || ny % 2 != 0) {
236 |     fprintf(stderr, "ERROR: Lattice dimensions must be positive, even values.\n");
237 |     exit(EXIT_FAILURE);
238 |   }
239 |   if (!(alpha > 0.0f)) { fprintf(stderr, "ERROR: alpha must be a positive value.\n"); exit(EXIT_FAILURE); }
240 |   float inv_temp = 1.0f / (alpha*TCRIT);
241 | 
242 |   // Setup cuRAND generator
243 |   curandGenerator_t rng;
244 |   CHECK_CURAND(curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_PHILOX4_32_10));
245 |   CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(rng, seed));
246 |   float *randvals;
247 |   CHECK_CUDA(cudaMalloc(&randvals, nx * ny/2 * sizeof(*randvals)));
248 | 
249 |   // Setup black and white lattice arrays on device
250 |   signed char *lattice_b, *lattice_w;
251 |   CHECK_CUDA(cudaMalloc(&lattice_b, nx * ny/2 * sizeof(*lattice_b)));
252 |   CHECK_CUDA(cudaMalloc(&lattice_w, nx * ny/2 * sizeof(*lattice_w)));
253 | 
254 |   int blocks = (nx * ny/2 + THREADS - 1) / THREADS;
255 |   CHECK_CURAND(curandGenerateUniform(rng, randvals, nx*ny/2));
256 |   init_spins<<<blocks, THREADS>>>(lattice_b, randvals, nx, ny/2);
257 |   CHECK_CURAND(curandGenerateUniform(rng, randvals, nx*ny/2));
258 |   init_spins<<<blocks, THREADS>>>(lattice_w, randvals, nx, ny/2);
259 |   CHECK_ERROR("init_spins launch");
260 |   // Warmup iterations
261 |   printf("Starting warmup...\n");
262 |   for (int i = 0; i < nwarmup; i++) {
263 |     update(lattice_b, lattice_w, randvals, rng, inv_temp, nx, ny);
264 |   }
265 | 
266 |   CHECK_CUDA(cudaDeviceSynchronize());
267 | 
268 |   printf("Starting trial iterations...\n");
269 |   auto t0 = std::chrono::high_resolution_clock::now();
270 |   for (int i = 0; i < niters; i++) {
271 |     update(lattice_b, lattice_w, randvals, rng, inv_temp, nx, ny);
272 |     if (i % 1000 == 0) printf("Completed %d/%d iterations...\n", i+1, niters);
273 |   }
274 | 
275 |   CHECK_CUDA(cudaDeviceSynchronize());
276 |   auto t1 = std::chrono::high_resolution_clock::now();
277 | 
278 |   double duration = (double) std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
279 |   printf("REPORT:\n");
280 |   printf("\tnGPUs: %d\n", 1);
281 |   printf("\ttemperature: %f * %f\n", alpha, TCRIT);
282 |   printf("\tseed: %llu\n", seed);
283 |   printf("\twarmup iterations: %d\n", nwarmup);
284 |   printf("\ttrial iterations: %d\n", niters);
285 |   printf("\tlattice dimensions: %lld x %lld\n", nx, ny);
286 |   printf("\telapsed time: %f sec\n", duration * 1e-6);
287 |   printf("\tupdates per ns: %f\n", (double) (nx * ny) * niters / duration * 1e-3);
288 | 
289 |   // Reduce: sum each colour array in chunks of at most CUB_CHUNK_SIZE items, two partial sums per chunk
290 |   double* devsum;
291 |   int nchunks = (nx * ny/2 + CUB_CHUNK_SIZE - 1)/ CUB_CHUNK_SIZE;
292 |   CHECK_CUDA(cudaMalloc(&devsum, 2 * nchunks * sizeof(*devsum)));
293 |   size_t cub_workspace_bytes = 0;
294 |   void* workspace = NULL;
295 |   CHECK_CUDA(cub::DeviceReduce::Sum(workspace, cub_workspace_bytes, lattice_b, devsum, CUB_CHUNK_SIZE));
296 |   CHECK_CUDA(cudaMalloc(&workspace, cub_workspace_bytes));
297 |   for (int i = 0; i < nchunks; i++) {
298 |     CHECK_CUDA(cub::DeviceReduce::Sum(workspace, cub_workspace_bytes, &lattice_b[i*CUB_CHUNK_SIZE], devsum + 2*i,
299 |                            std::min((long long) CUB_CHUNK_SIZE, nx * ny/2 - i * CUB_CHUNK_SIZE)));
300 |     CHECK_CUDA(cub::DeviceReduce::Sum(workspace, cub_workspace_bytes, &lattice_w[i*CUB_CHUNK_SIZE], devsum + 2*i + 1,
301 |                            std::min((long long) CUB_CHUNK_SIZE, nx * ny/2 - i * CUB_CHUNK_SIZE)));
302 |   }
303 |   double* hostsum = (double*)malloc(2 * nchunks * sizeof(*hostsum));
304 |   CHECK_CUDA(cudaMemcpy(hostsum, devsum, 2 * nchunks * sizeof(*devsum), cudaMemcpyDeviceToHost));
305 |   double fullsum = 0.0;
306 |   for (int i = 0; i < 2 * nchunks; i++) fullsum += hostsum[i];
307 |   std::cout << "\taverage magnetism (absolute): " << fabs(fullsum / (nx * ny)) << std::endl;  // fabs: always the floating-point overload
308 | 
309 |   if (write) write_lattice(lattice_b, lattice_w, "final.txt", nx, ny);
310 |   // Release host and device resources
311 |   free(hostsum);
312 |   CHECK_CUDA(cudaFree(devsum)); CHECK_CUDA(cudaFree(workspace));
313 |   CHECK_CUDA(cudaFree(randvals)); CHECK_CUDA(cudaFree(lattice_b)); CHECK_CUDA(cudaFree(lattice_w));
314 |   CHECK_CURAND(curandDestroyGenerator(rng));
315 |   return 0;
316 | }
317 | 


--------------------------------------------------------------------------------
/basic_cuda/plot_ising.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | lattice = np.loadtxt("final.txt", dtype=np.int32)
 5 | plt.imshow(lattice)
 6 | plt.title('Final Lattice Configuration')
 7 | plt.colorbar()
 8 | plt.show()
 9 | 
10 | 


--------------------------------------------------------------------------------
/basic_cuda/sample_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ising-gpu/935796d7a26016670363af7a0dced8a9ebcd4714/basic_cuda/sample_plot.png


--------------------------------------------------------------------------------
/basic_python/README.md:
--------------------------------------------------------------------------------
 1 | ### Basic Implementation using Python
 2 | ### Required packages:
 3 | - numpy
 4 | - numba
 5 | - cupy
 6 | - matplotlib (optional, for plotting only)
 7 | 
 8 | ### Basic Usage
 9 | Single GPU:
10 | 
11 | `python ising_basic.py -x <rows> -y <columns> -n <number of iterations> `
12 | 
13 | Multi GPU using MPI:
14 | 
15 | `mpirun -np <# of GPUS> python ising_basic.py -x <rows> -y <columns> -n <number of iterations>`
16 | 
17 | Run `python ising_basic.py --help` for more options.
18 | 
19 | ### Visualizing Results
20 | The `-o` flag writes the final lattice configuration to the text files `final_rank*.txt`. Use the provided `plot_ising_multi.py` script to visualize the output.
21 | 
22 | For example:
23 | ```
24 | $ mpirun -np 2 python ising_basic.py -x 2048 -y 2048 -n 100 -a 0.5 -o
25 | ...
26 | Writing lattice to final_rank0.txt...
27 | Writing lattice to final_rank1.txt...
28 | 
29 | $ python plot_ising_multi.py
30 | ```
31 | 
32 | This will produce the following output:
33 | 
34 | ![sample_plot.png](sample_plot.png)
35 | 


--------------------------------------------------------------------------------
/basic_python/ising_basic.py:
--------------------------------------------------------------------------------
  1 |  # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  #
  3 |  # Permission is hereby granted, free of charge, to any person obtaining a
  4 |  # copy of this software and associated documentation files (the "Software"),
  5 |  # to deal in the Software without restriction, including without limitation
  6 |  # the rights to use, copy, modify, merge, publish, distribute, sublicense,
  7 |  # and/or sell copies of the Software, and to permit persons to whom the
  8 |  # Software is furnished to do so, subject to the following conditions:
  9 |  #
 10 |  # The above copyright notice and this permission notice shall be included in
 11 |  # all copies or substantial portions of the Software.
 12 |  #
 13 |  # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 14 |  # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 15 |  # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 16 |  # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 17 |  # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 18 |  # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 19 |  # DEALINGS IN THE SOFTWARE.
 20 | 
 21 | import argparse
 22 | import math
 23 | import sys
 24 | import time
 25 | 
 26 | import cupy.cuda.curand as curand
 27 | from mpi4py import MPI
 28 | from numba import cuda
 29 | from numba import vectorize
 30 | import numpy as np
 31 | 
 32 | # Set constants
 33 | TCRIT = 2.26918531421 # critical temperature
 34 | 
 35 | # Setup MPI and get neighbor ranks
 36 | comm = MPI.COMM_WORLD
 37 | rank = comm.rank
 38 | rank_up = comm.rank - 1 if (comm.rank - 1 >= 0) else comm.size - 1
 39 | rank_down = comm.rank + 1 if (comm.rank + 1 < comm.size) else 0
 40 | 
 41 | # Parse command line arguments
 42 | parser = argparse.ArgumentParser()
 43 | parser.add_argument("--lattice-n", '-x', type=int, default=40*128, help="number of lattice rows")
 44 | parser.add_argument("--lattice-m", '-y', type=int, default=40*128, help="number of lattice columns")
 45 | parser.add_argument("--nwarmup", '-w', type=int, default=100, help="number of warmup iterations")
 46 | parser.add_argument("--niters", '-n', type=int, default=1000, help="number of trial iterations")
 47 | parser.add_argument("--alpha", '-a', type=float, default=0.1, help="coefficient of critical temperature")
 48 | parser.add_argument("--seed", '-s', type=int, default=1234, help="seed for random number generation")
 49 | parser.add_argument("--write-lattice", '-o', action='store_true', help="write final lattice configuration to file/s")
 50 | parser.add_argument("--use-common-seed", '-c', action='store_true', help="Use common seed for all ranks + updating offset. " +
 51 |                                                                          "Yields consistent results independent of number " +
 52 |                                                                          "of GPUs but is slower.")
 53 | args = parser.parse_args()
 54 | 
 55 | # Check arguments
 56 | if args.lattice_m % 2 != 0:
 57 |     raise Exception("lattice_m must be an even value. Aborting.")
 58 | if args.lattice_n % comm.size != 0:
 59 |     raise Exception("lattice_n must be evenly divisible by number of GPUs. Aborting.")
 60 | if (args.lattice_n / comm.size) % 2 != 0:
 61 |     raise Exception("Slab width (lattice_n / nGPUs) must be an even value. Aborting.")
 62 | 
 63 | # Compute slab width
 64 | lattice_slab_n = args.lattice_n // comm.size
 65 | 
 66 | inv_temp = (1.0) / (args.alpha * TCRIT)
 67 | 
 68 | # Generate lattice with random spins with shape of randval array
 69 | @vectorize(['int8(float32)'], target='cuda')                             
 70 | def generate_lattice(randval):
 71 |     return 1 if randval > 0.5 else -1 
 72 | 
 73 | @cuda.jit
 74 | def update_lattice_multi(lattice, op_lattice, op_lattice_up, op_lattice_down, randvals, is_black):
 75 |     n,m = lattice.shape
 76 |     tid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
 77 |     j = tid % m
 78 |     i = tid // m
 79 | 
 80 |     if (i >= n or j >= m): return
 81 | 
 82 |     # Set stencil indices with periodicity
 83 |     jpp = (j + 1) if (j + 1) < m else 0
 84 |     jnn = (j - 1) if (j - 1) >= 0 else (m - 1)
 85 | 
 86 |     # Select off-column index based on color and row index parity
 87 |     if (is_black):
 88 |         joff = jpp if (i % 2) else jnn
 89 |     else:
 90 |         joff = jnn if (i % 2) else jpp
 91 | 
 92 |     # Compute sum of nearest neighbor spins (taking values from neighboring
 93 |     # lattice slabs if required)
 94 |     nn_sum = op_lattice[i, j] + op_lattice[i, joff]
 95 |     nn_sum += op_lattice[i - 1, j] if (i - 1) >= 0 else op_lattice_up[n - 1, j]
 96 |     nn_sum += op_lattice[i + 1, j] if (i + 1) < n else op_lattice_down[0, j]
 97 | 
 98 |     # Determine whether to flip spin
 99 |     lij = lattice[i, j]
100 |     acceptance_ratio = math.exp(-2.0 * inv_temp * nn_sum * lij)
101 |     if (randvals[i, j] < acceptance_ratio):
102 |         lattice[i, j] = -lij
103 | 
104 | # Create lattice update kernel (for single GPU case, this version with fewer arguments
105 | # is a bit faster due to launch overhead introduced by numba)
106 | @cuda.jit
107 | def update_lattice(lattice, op_lattice, randvals, is_black):
108 |     n,m = lattice.shape
109 |     tid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
110 |     i = tid // m
111 |     j = tid % m
112 | 
113 |     if (i >= n or j >= m): return
114 | 
115 |     # Set stencil indices with periodicity
116 |     ipp = (i + 1) if (i + 1) < n else 0
117 |     jpp = (j + 1) if (j + 1) < m else 0
118 |     inn = (i - 1) if (i - 1) >= 0 else (n - 1)
119 |     jnn = (j - 1) if (j - 1) >= 0 else (m - 1)
120 | 
121 |     # Select off-column index based on color and row index parity
122 |     if (is_black):
123 |         joff = jpp if (i % 2) else jnn
124 |     else:
125 |         joff = jnn if (i % 2) else jpp
126 | 
127 |     # Compute sum of nearest neighbor spins
128 |     nn_sum = op_lattice[inn, j] + op_lattice[i, j] + op_lattice[ipp, j] + op_lattice[i, joff]
129 | 
130 |     # Determine whether to flip spin
131 |     lij = lattice[i, j]
132 |     acceptance_ratio = math.exp(-2.0 * inv_temp * nn_sum * lij)
133 |     if (randvals[i, j] < acceptance_ratio):
134 |         lattice[i, j] = -lij
135 | 
136 | # Write lattice configuration to file
137 | def write_lattice(prefix, lattice_b, lattice_w):
138 |   lattice_b_h = lattice_b.copy_to_host()
139 |   lattice_w_h = lattice_w.copy_to_host()
140 |   lattice = np.zeros((lattice_slab_n, args.lattice_m), dtype=np.int8)
141 |   for i in range(lattice_slab_n):
142 |       for j in range(args.lattice_m // 2):
143 |           if (i % 2):
144 |               lattice[i, 2*j+1] = lattice_b_h[i, j]
145 |               lattice[i, 2*j] = lattice_w_h[i, j]
146 |           else:
147 |               lattice[i, 2*j] = lattice_b_h[i, j]
148 |               lattice[i, 2*j+1] = lattice_w_h[i, j]
149 | 
150 |   print("Writing lattice to {}_rank{}.txt...".format(prefix, rank))
151 |   np.savetxt("{}_rank{}.txt".format(prefix, rank), lattice, fmt='%d')
152 | 
153 | # Helper class for random number generation
154 | class curandUniformRNG:
155 |     def __init__(self, seed=0):
156 |         rng = curand.createGenerator(curand.CURAND_RNG_PSEUDO_PHILOX4_32_10)
157 |         curand.setPseudoRandomGeneratorSeed(rng, seed)
158 |         if (args.use_common_seed):
159 |             self.offset = rank * lattice_slab_n * args.lattice_m // 2
160 |             curand.setGeneratorOffset(rng, self.offset)
161 |         self._rng = rng
162 | 
163 |     def fill_random(self, arr):
164 |         ptr = arr.__cuda_array_interface__['data'][0]
165 |         curand.generateUniform(self._rng, ptr, arr.size)
166 |         if (args.use_common_seed):
167 |             self.offset += args.lattice_n * args.lattice_m // 2
168 |             curand.setGeneratorOffset(self._rng, self.offset)
169 | 
170 | # Helper function to perform device sync plus MPI barrier (in that order)
171 | def sync():
172 |   cuda.synchronize()  # wait on the CUDA side first
173 |   comm.barrier()  # then wait at a barrier on comm (MPI.COMM_WORLD)
174 | 
175 | def update(lattices_b, lattices_w, randvals, rng):
176 |     # Setup CUDA launch configuration: ceil-div so blocks * threads covers lattice_m // 2 * lattice_slab_n elements
177 |     threads = 128
178 |     blocks = (args.lattice_m // 2 * lattice_slab_n + threads - 1) // threads
179 |     # One sweep = black update then white update; randvals is refilled before each color
180 |     if (comm.size > 1):
181 |         # Update black
182 |         rng.fill_random(randvals)
183 |         update_lattice_multi[blocks, threads](lattices_b[rank], lattices_w[rank], lattices_w[rank_up], lattices_w[rank_down], randvals, True)
184 |         sync()  # all ranks synchronize between colors (the kernel is handed the neighbor ranks' arrays)
185 |         # Update white
186 |         rng.fill_random(randvals)
187 |         update_lattice_multi[blocks, threads](lattices_w[rank], lattices_b[rank], lattices_b[rank_up], lattices_b[rank_down], randvals, False)
188 |         sync()
189 |     else:
190 |         # Update black
191 |         rng.fill_random(randvals)
192 |         update_lattice[blocks, threads](lattices_b[rank], lattices_w[rank], randvals, True)
193 |         # Update white
194 |         rng.fill_random(randvals)
195 |         update_lattice[blocks, threads](lattices_w[rank], lattices_b[rank], randvals, False)
196 | 
197 | 
198 | # Set device: rank r selects GPU index r (NOTE(review): assumes each process sees at least comm.size GPUs - confirm for multi-node launches)
199 | cuda.select_device(rank)
200 | 
201 | # Setup cuRAND generator: with a common seed every rank uses args.seed, otherwise rank r uses args.seed + 42 * r
202 | rng = curandUniformRNG(seed=args.seed if args.use_common_seed else args.seed + 42 * rank)
203 | randvals = cuda.device_array((lattice_slab_n, args.lattice_m // 2), dtype=np.float32)
204 | 
205 | # Setup black and white lattice arrays on device (each generated from a fresh fill of randvals)
206 | rng.fill_random(randvals)
207 | lattice_b = generate_lattice(randvals)
208 | rng.fill_random(randvals)
209 | lattice_w = generate_lattice(randvals)
210 | 
211 | # Setup/open CUDA IPC handles: gather every rank's handles, open the remote ones, keep the local arrays as-is
212 | ipch_b = comm.allgather(lattice_b.get_ipc_handle())
213 | ipch_w = comm.allgather(lattice_w.get_ipc_handle())
214 | lattices_b = [x.open() if i != rank else lattice_b for i,x in enumerate(ipch_b)]
215 | lattices_w = [x.open() if i != rank else lattice_w for i,x in enumerate(ipch_w)]
216 | 
217 | # Warmup iterations (not timed)
218 | if rank == 0:
219 |     print("Starting warmup...")
220 |     sys.stdout.flush()
221 | sync()
222 | for i in range(args.nwarmup):
223 |     update(lattices_b, lattices_w, randvals, rng)
224 | sync()
225 | 
226 | # Trial iterations (timed between t0 and t1; the closing sync() is inside the timed region)
227 | if rank == 0:
228 |     print("Starting trial iterations...")
229 |     sys.stdout.flush()
230 | t0 = time.time()
231 | for i in range(args.niters):
232 |     update(lattices_b, lattices_w, randvals, rng)
233 |     if (rank == 0 and i % 1000 == 0):  # progress line at i = 0, 1000, 2000, ...
234 |         print("Completed {}/{} iterations...".format(i+1, args.niters))
235 |         sys.stdout.flush()
236 | sync()
237 | 
238 | t1 = time.time()
239 | t = t1 - t0
240 | 
241 | # Compute average magnetism: each rank divides its local sum by the FULL lattice size, so the SUM allreduce gives the global mean
242 | m = (np.sum(lattices_b[rank], dtype=np.int64) + np.sum(lattices_w[rank], dtype=np.int64)) / float(args.lattice_n * args.lattice_m)
243 | m_global = comm.allreduce(m, MPI.SUM)
244 | 
245 | if (rank == 0):
246 |   print("REPORT:")
247 |   print("\tnGPUs: {}".format(comm.size))
248 |   print("\ttemperature: {} * {}".format(args.alpha, TCRIT))
249 |   print("\tseed: {}".format(args.seed))
250 |   print("\twarmup iterations: {}".format(args.nwarmup))
251 |   print("\ttrial iterations: {}".format(args.niters))
252 |   print("\tlattice dimensions: {} x {}".format(args.lattice_n, args.lattice_m))
253 |   print("\telapsed time: {} sec".format(t))
254 |   print("\tupdates per ns: {}".format((args.lattice_n * args.lattice_m * args.niters) / t * 1e-9))
255 |   print("\taverage magnetism (absolute): {}".format(np.abs(m_global)))
256 |   sys.stdout.flush()
257 | 
258 | sync()
259 | 
260 | if (args.write_lattice):  # optional dump of the final configuration, one file per rank
261 |     write_lattice("final", lattices_b[rank], lattices_w[rank])
262 | 


--------------------------------------------------------------------------------
/basic_python/plot_ising_multi.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import matplotlib.pyplot as plt
 3 | import numpy as np
 4 | 
 5 | files = sorted(glob.glob("final_rank*.txt"))
 6 | 
 7 | if (len(files) == 0):
 8 |     raise Exception("Could not find any lattice files. Expecting files named 'final_rank*.txt' for processing")
 9 | 
10 | lattice = np.loadtxt(files[0], dtype=np.int32)
11 | for i,f in enumerate(files):
12 |     if i == 0: continue
13 |     lattice = np.concatenate((lattice, np.loadtxt(f, dtype=np.int32)))
14 | 
15 | plt.imshow(lattice)
16 | plt.title('Final Lattice Configuration')
17 | plt.colorbar()
18 | plt.show()
19 | 
20 | 


--------------------------------------------------------------------------------
/basic_python/sample_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ising-gpu/935796d7a26016670363af7a0dced8a9ebcd4714/basic_python/sample_plot.png


--------------------------------------------------------------------------------
/optimized/cuBlumeCapel/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 NVIDIA Corporation
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/optimized/cuBlumeCapel/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_HOME = /opt/cuda-12.8.1
 2 | CUDACC = $(CUDA_HOME)/bin/nvcc
 3 | CC = gcc
 4 | LD = $(CUDACC)
 5 | 
 6 | CFLAGS = -c -O3 -g -I$(CUDA_HOME)/include
 7 | 
 8 | SMS ?= 89
 9 | CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM))
10 | 
11 | CUDACFLAGS = -c -O3 -lineinfo $(CUDA_ARCH) -Xptxas=-v
12 | LDFLAGS    = -Xcompiler=-fopenmp -lcurand
13 | 
14 | C_SRCS  = utils.c
15 | CU_SRCS = main.cu
16 |                                 
17 | ifdef USE_MNNVL
18 | $(info Compiling with MNNVL support...)
19 | MPI_HOME = /project/coreai_hpc_hpc/hpc_sdk/Linux_aarch64/dev/comm_libs/12.8/hpcx/latest/ompi
20 | 
21 | CUDACFLAGS += -I$(MPI_HOME)/include -DUSE_MNNVL
22 | LDFLAGS    += -L$(MPI_HOME)/lib -lcuda -lmpi
23 | CU_SRCS    += vmm_alloc.cu
24 | endif
25 | 
26 | C_OBJS = $(patsubst %.c, %.o, $(C_SRCS))
27 | CU_OBJS = $(patsubst %.cu, %.o, $(CU_SRCS))
28 | .PHONY: all clean
29 | all: cuBlume
30 | 
31 | cuBlume: $(CU_OBJS) $(C_OBJS)
32 | 	$(LD) -o cuBlume $(CU_OBJS) $(C_OBJS) $(LDFLAGS)
33 | # Name the output explicitly, matching the C rule below
34 | %.o: %.cu
35 | 	$(CUDACC) $(CUDACFLAGS) $< -o $@
36 | 
37 | %.o: %.c
38 | 	$(CC) $(CFLAGS) $< -o $@
39 | # Portable redirection: "&>" is a bash extension and backgrounds the command under a POSIX /bin/sh
40 | clean:
41 | 	-@rm -f *.o cuBlume *.sass > /dev/null 2>&1 || true
42 | 


--------------------------------------------------------------------------------
/optimized/cuBlumeCapel/README.md:
--------------------------------------------------------------------------------
  1 | # A CUDA implementation for the Blume-Capel model supporting Multi-Node NVLink
  2 | 
  3 | A high performance Blume Capel model implementation for GPU. The code can run on
  4 | multiple GPUs connected to the same node or on multiple nodes connected via
  5 | NVLink (MNNVL).
  6 | 
  7 | To compile the code to run on single node, adjust the Makefile to point to your CUDA
  8 | installation, specify the CUDA architecture you want to compile for and then
  9 | run `make`. That should be enough to produce the ``cuBlume`` binary.
 10 | 
 11 | For multi-node, in addition to the Makefile adjustment above, also modify it to 
 12 | point to your MPI installation and then compile it with `make USE_MNNVL=1`.
 13 | 
 14 | When running on a single node, the code uses managed memory. On multiple nodes
 15 | with MNNVL, it uses [fabric memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#fabric-memory).
 16 | 
 17 | When more than one GPU is used, the spin system is partitioned vertically.
 18 | 
 19 | ## Usage
 20 | 
 21 | <PRE>
 22 | Usage: cuBlume [options]
 23 | options:
 24 |         -x|--x &lt;HORIZ_DIM&gt;
 25 |                 Specifies the horizontal dimension of the entire  lattice  (black+white  spins).
 26 |                 This dimension must be a multiple of 2048.
 27 | 
 28 |         -y|--y &lt;VERT_DIM&gt;
 29 |                 Specifies the vertical dimension of the per-GPU lattice.  This dimension must be
 30 |                 a multiple of 16.
 31 | 
 32 |         -n|--n &lt;NSTEPS&gt;
 33 |                 Specifies the number of iterations to run.
 34 |                 Default: 1
 35 | 
 36 |         -g|--gpus &lt;NUM_DEVICES&gt;
 37 |                 Specifies the number of GPUs to use. Will use devices with ids [0, NUM_DEVICES-1].
 38 |                 Default: 1.
 39 | 
 40 |         -s|--seed &lt;SEED&gt;
 41 |                 Specifies the seed used to generate random numbers.
 42 |                 Default: 463463564571
 43 | 
 44 |         -a|--alpha &lt;ALPHA&gt;
 45 |                 Specifies the temperature in T_CRIT units.  If both this  option  and  '-t'  are
 46 |                 specified then the '-t' option is used.
 47 |                 Default: 0.100000
 48 | 
 49 |         -d|--delta &lt;DELTA&gt;
 50 |                 Specifies the delta parameter for the Blume-Capel model.
 51 |                 Default: 1.000000
 52 | 
 53 |         -t|--temp &lt;TEMP_0&gt;[[,&lt;IT_1&gt;:&lt;TEMP_1&gt;]...]
 54 |                 Specifies the temperature(s), in absolute  units.   It  is  possible  to  use  a
 55 |                 temperature-changing   protocol   by   specifying   a   sequence   of    couples
 56 |                 &lt;IT_i&gt;:&lt;TEMP_i&gt; after the first temperature &lt;TEMP_0&gt;. The value &lt;IT_i&gt; specifies
 57 |                 the time step at which the temperature  changes  from  &lt;TEMP_i-1&gt;  to  &lt;TEMP_i&gt;.
 58 |                 Temperature &lt;TEMP_0&gt; is the starting temperature and thus  does  not  require  a
 59 |                 time step specification.
 60 |                 Default: 0.226919
 61 | 
 62 |         -p|--print &lt;STAT_FREQ&gt;
 63 |                 Specifies the frequency, in no. of iterations,  with  which  the  magnetization
 64 |                 statistics are printed.  If this option is used together with the '--pexp' option,
 65 |                 this option is ignored.
 66 |                 Default: only at the beginning and at end of the simulation
 67 | 
 68 |         --pexp
 69 |                 Prints statistics every power-of-2 time steps.  This  option  overrides  the  -p
 70 |                 option.
 71 |                 Default: disabled
 72 | 
 73 |         -c|--corr
 74 |                 Dumps  to  a  file  named  corr_{TYPE}_{X}x{Y}_T_{TEMP} the correlation of  each
 75 |                 point with the vertical and horizontal neighbors at distance r &lt;= 256.   Beyond
 76 |                 that, distances are chosen according to an exponential rule, with 32 values per
 77 |                 power of 2.  The  correlation  is  computed  every  time  the  magnetization  is
 78 |                 printed on screen (based on either the '-p' or '--pexp' options)  and  it  is
 79 |                 written to the file, one line per measurement.
 80 |                 Default: full correlation (see --corrfull option)
 81 | 
 82 |         --corrfull
 83 |                 Compute the correlation for each spin in the system.
 84 | 
 85 |         --corrdiag
 86 |                 Compute the correlation only for diagonal spins.
 87 | 
 88 |         --corrchkb
 89 |                 Computes the correlation for only one spin (the top-left one)  for each block of
 90 |                 16x16 spins (checkerboard pattern).
 91 | 
 92 |         --corrmixd
 93 |                 Computes the correlation using a mix of full and checkerboard modes.   The  full
 94 |                 correlation is used for  all distances  r <= 32. Then,  for each spin in a 16x16
 95 |                 square, it is computed for each r > 32.
 96 | 
 97 |         --writechkp &lt;CHECKPOINT_FILE_PATH&gt;
 98 |                 Enables writing a checkpoint file at the end of the simulation.  The file can be
 99 |                 later used to resume the simulation with the '--readchkp' option.  This option and
100 |                 '--readchkp' can be used together to break down a large run into multiple smaller runs.
101 |                 When running with multiple processes,  the file name must contain either '%i' or
102 |                 '%d' which will be substituted with the process number.
103 |                 
104 |         --readchkp &lt;CHECKPOINT_FILE_PATH&gt;
105 |                 Enables the restart of a simulation from the state in a checkpoint file.  Please
106 |                 note that in order for that to work, the non-checkpoint  command  line   options
107 |                 used in the run where the checkpoint file was created must match  those     used
108 |                 in the run where the checkpoint file is read.  This option and '--writechkp' can be used
109 |                 together  to  break   down   a   large   run   into   multiple   smaller   runs.
110 |                 When running with multiple processes,  the file name must contain either '%i' or
111 |                 '%d' which will be substituted with the process number.
112 |         -o|--o
113 |                 Enables the file dump of  the lattice  every time  the magnetization is printed.
114 |                 Default: off
115 | </PRE>
116 | 
117 | For example, to run 1024 steps on a 32768^2 lattice using one GPU, at temperature 1.5 and
118 | printing the statistics every 128 steps:
119 | 
120 | <PRE>
121 | $ ./cuBlume -y 32768 -x 32768 -n 1024 -p 128 -g 1 -t 1.5
122 | 
123 | Using GPUs:
124 |          0 (NVIDIA RTX 6000 Ada Generation, 48 GB, 142 SMs, 1536 th/SM max, CC 8.9, ECC off)
125 | 
126 | Run configuration:
127 |         word size: 16
128 |         bits per spin: 4 (mask: 0xF)
129 |         spins/word: 32
130 |         spins: 1073741824 (~1.07E+09)
131 |         seed: 463463564571
132 |         block size (X, Y): 16, 16
133 |         tile  size (X, Y): 32, 16
134 |         grid size 1D: 32768
135 |         virtual grid size 2D (X, Y): 16, 2048
136 |         spins per tile (X, Y): 1024, 512
137 | 
138 |         iterations:
139 |                 beg: 1
140 |                 end: 1024
141 |                 tot: 1024
142 | 
143 |         print stats every 128 steps
144 |         delta: 1
145 |         temperature: 1.5 (0.661030190265538*T_crit)
146 | 
147 |         no. of  processes: 1
148 |         GPUs  per process: 1
149 |         total no. of GPUs: 1
150 |         GPUs  memory type: managed
151 | 
152 |         per-GPU lattice size:         32768 x    32768 spins
153 |         per-GPU lattice shape: 2 x    32768 x      512 ull2s (    33554432 total)
154 | 
155 |         total lattice size:         32768 x    32768 spins
156 |         total lattice shape: 2 x    32768 x      512 ull2s (    33554432 total)
157 | 
158 |         total memory: 0.50 GB (0.50 GB per GPU)
159 | 
160 | Setting up GPUs:
161 |         GPU  0 done in 0.020104 secs
162 | 
163 | Initializing spin lattice... done in 0.058671 secs
164 | 
165 | [Switching to temperature: 1.5]
166 | 
167 | Running simulation...
168 | 
169 |         Step   MC SW          Magn.          N(-1)           N(0)           N(1)     SD value     flips/ns         GB/s          ERT
170 | 
171 |            0           7.080846E-06      357903413      357927395      357911016     5.716485
172 |          128    *      1.601530E-04      376546141      320821505      376374178     1.000418       511.37       769.55        2.17s
173 |          256    *      5.741259E-04      376809831      320738625      376193368     0.999816       509.93       767.38        2.18s
174 |          384    *      1.082895E-04      376545445      320767209      376429170     0.999965       509.44       766.64        2.18s
175 |          512    *      1.646699E-04      376582881      320752875      376406068     1.000123       507.92       764.36        2.18s
176 |          640    *      1.317356E-04      376417378      320765618      376558828     0.999747       510.41       768.11        2.18s
177 |          768    *      3.697937E-04      376286661      320771439      376683724     1.000044       508.58       765.36        2.18s
178 |          896    *      3.665267E-04      376673596      320788187      376280041     0.999778       509.95       767.42        2.18s
179 |         1024    *      1.519648E-04      376579698      320745599      376416527     1.000010       504.67       759.46        2.18s
180 | 
181 | Done in 2.184835E+03 ms (stats overhead: 1.15%, spins/ns: 503.25, BW: 757.33 GB/s)
182 | </PRE>
183 | 
184 | To run 128 steps on a 2^20x2^20 lattice using 8 H100 GPUs:
185 | 
186 | <PRE>
187 | $ ./cuBlume -y $((2**20 / 8)) -x $((2**20)) -n 128 -p 32 -t 1.5 -g 8
188 | 
189 | Using GPUs:
190 |          0 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
191 |          1 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
192 |          2 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
193 |          3 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
194 |          4 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
195 |          5 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
196 |          6 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
197 |          7 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
198 | 
199 | Run configuration:
200 |         word size: 16
201 |         bits per spin: 4 (mask: 0xF)
202 |         spins/word: 32
203 |         spins: 1099511627776 (~1.10E+12)
204 |         seed: 463463564571
205 |         block size (X, Y): 16, 16
206 |         tile  size (X, Y): 32, 16
207 |         grid size 1D: 4194304
208 |         virtual grid size 2D (X, Y): 512, 8192
209 |         spins per tile (X, Y): 1024, 512
210 | 
211 |         iterations:
212 |                 beg: 1
213 |                 end: 128
214 |                 tot: 128
215 | 
216 |         print stats every 32 steps
217 |         delta: 1
218 |         temperature: 1.5 (0.661030190265538*T_crit)
219 | 
220 |         no. of  processes: 1
221 |         GPUs  per process: 8
222 |         total no. of GPUs: 8
223 |         GPUs  memory type: managed
224 | 
225 |         per-GPU lattice size:        131072 x  1048576 spins
226 |         per-GPU lattice shape: 2 x   131072 x    16384 ull2s (  4294967296 total)
227 | 
228 |         total lattice size:       1048576 x  1048576 spins
229 |         total lattice shape: 2 x  1048576 x    16384 ull2s ( 34359738368 total)
230 | 
231 |         total memory: 512.00 GB (64.00 GB per GPU)
232 | 
233 | Setting up GPUs:
234 |         GPU  0 done in 1.094278 secs
235 |         GPU  1 done in 1.260294 secs
236 |         GPU  2 done in 1.268412 secs
237 |         GPU  3 done in 1.259265 secs
238 |         GPU  4 done in 1.269356 secs
239 |         GPU  5 done in 1.279294 secs
240 |         GPU  6 done in 1.286008 secs
241 |         GPU  7 done in 1.288558 secs
242 | 
243 | Initializing spin lattice... done in 6.611633 secs
244 | 
245 | [Switching to temperature: 1.5]
246 | 
247 | Running simulation...
248 | 
249 |         Step   MC SW          Magn.          N(-1)           N(0)           N(1)     SD value     flips/ns         GB/s          ERT
250 | 
251 |            0           5.335123E-07   366503778958   366503483257   366504365561     5.717101
252 |           32    *      3.692141E-06   384166621778   331174324668   384170681330     1.001240      6375.69      9567.43       22.45s
253 |           64    *      7.972785E-07   385202476551   329105798057   385203353168     1.000216      6375.91      9567.76       22.45s
254 |           96    *      2.314373E-07   385421280026   328668813256   385421534494     1.000065      6376.48      9568.61       22.45s
255 |          128    *      5.507602E-06   385491919415   328533844618   385485863743     1.000011      6376.44      9568.55       22.45s
256 | 
257 | Done in 2.244686E+04 ms (stats overhead: 1.70%, spins/ns: 6269.81, BW: 9408.54 GB/s)
258 | </PRE>
259 | 
260 | ## Contacts
261 | 
262 | For comments, questions or anything related, write to Mauro Bisson at maurob@nvidia.com.
263 | 


--------------------------------------------------------------------------------
/optimized/cuBlumeCapel/cudamacro.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Permission is hereby granted, free of charge, to any person obtaining a
 5 |  * copy of this software and associated documentation files (the "Software"),
 6 |  * to deal in the Software without restriction, including without limitation
 7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 8 |  * and/or sell copies of the Software, and to permit persons to whom the
 9 |  * Software is furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 |  * DEALINGS IN THE SOFTWARE.
21 |  */
22 | #ifndef __CUDA_MACRO_H__
23 | #define __CUDA_MACRO_H__
24 | /* Evaluates a CUDA runtime call; if the result is not cudaSuccess, prints file, line and error string to stderr and exits. Expands to a brace-enclosed block. */
25 | #define CHECK_CUDA(call) {                                                   \
26 |     cudaError_t err = call;                                                  \
27 |     if( cudaSuccess != err) {                                                \
28 |         fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
29 |                 __FILE__, __LINE__, cudaGetErrorString( err) );              \
30 |         exit(EXIT_FAILURE);                                                  \
31 |     }}
32 | /* Queries cudaGetLastError(); on error prints errorMessage (a string, used as a %s argument) plus file, line and error string to stderr and exits. */
33 | #define CHECK_ERROR(errorMessage) {                                          \
34 |     cudaError_t err = cudaGetLastError();                                    \
35 |     if( cudaSuccess != err) {                                                \
36 |         fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
37 |                 errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
38 |         exit(EXIT_FAILURE);                                                  \
39 |     }}
40 | #endif
41 | 


--------------------------------------------------------------------------------
/optimized/cuBlumeCapel/utils.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * Mauro Bisson <maurob@nvidia.com>
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining a
  7 |  * copy of this software and associated documentation files (the "Software"),
  8 |  * to deal in the Software without restriction, including without limitation
  9 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 10 |  * and/or sell copies of the Software, and to permit persons to whom the
 11 |  * Software is furnished to do so, subject to the following conditions:
 12 |  *
 13 |  * The above copyright notice and this permission notice shall be included in
 14 |  * all copies or substantial portions of the Software.
 15 |  *
 16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 19 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 21 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 22 |  * DEALINGS IN THE SOFTWARE.
 23 |  */
 24 | #include <errno.h>
 25 | #include <stdio.h>
 26 | #include <stdlib.h>
 27 | #include <string.h>
 28 | #include <sys/types.h>
 29 | #include <sys/stat.h>
 30 | #include <unistd.h>
 31 | #include <time.h>
 32 | 
 33 | void *Malloc(size_t sz) {
 34 | 	/* Returns sz zero-filled bytes; a zero-size request or an allocation failure is reported on stderr and terminates the program. */
 35 | 	void *ptr;
 36 | 
 37 | 	if (!sz) {
 38 | 		fprintf(stderr, "Allocating zero bytes...\n");
 39 | 		exit(EXIT_FAILURE);
 40 | 	}
 41 | 	ptr = (void *)malloc(sz);
 42 | 	if (!ptr) {
 43 | 		fprintf(stderr, "Cannot allocate %zu bytes...\n", sz);
 44 | 		exit(EXIT_FAILURE);
 45 | 	}
 46 | 	memset(ptr, 0, sz);
 47 | 	return ptr;
 48 | }
 49 | 
 50 | void Free(void **ptr) {
 51 | 	/* Releases *ptr and resets it to NULL; a NULL ptr or a NULL *ptr is a no-op. */
 52 | 	if (ptr && *ptr) {
 53 | 		free(*ptr);
 54 | 		*ptr = NULL;
 55 | 	}
 56 | 	return;
 57 | }
 58 | 
 59 | void *Realloc(void *ptr, size_t sz) {
 60 | 	/* realloc() wrapper: prints a warning on stdout when sz is 0; exits only if realloc returns NULL for a non-zero sz. */
 61 |         void *lp;
 62 | 
 63 | 	if (!sz) {
 64 | 		printf("Re-allocating to zero bytes, are you sure you want this?\n");
 65 | 	}
 66 |         lp = (void *)realloc(ptr, sz);
 67 |         if (!lp && sz) {
 68 |                 fprintf(stderr, "Cannot reallocate to %zu bytes...\n", sz);
 69 |                 exit(EXIT_FAILURE);
 70 |         }
 71 |         return lp;
 72 | }
 73 | 
 74 | FILE *Fopen(const char *path, const char *mode) {
 75 |         /* fopen() wrapper: returns the opened stream, or prints the path to stderr and exits if fopen fails. */
 76 |         FILE *fp = NULL;
 77 |         fp = fopen(path, mode);
 78 |         if (!fp) {
 79 |                 fprintf(stderr, "Cannot open file %s...\n", path);
 80 |                 exit(EXIT_FAILURE);
 81 |         }
 82 |         return fp;
 83 | }
 84 | 
 85 | size_t Fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) {
 86 | 	/* fwrite() wrapper: exits with a message on stderr if fewer than nmemb elements were written; otherwise returns the count. */
 87 | 	size_t wmemb=0;
 88 | 
 89 | 	wmemb = fwrite(ptr, size, nmemb, stream);
 90 | 	if (wmemb < nmemb) {
 91 | 		fprintf(stderr, "Error while writing to file!\n");
 92 | 		exit(EXIT_FAILURE);
 93 | 	}
 94 | 	return wmemb;
 95 | }
 96 | 
 97 | size_t Fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
 98 | 	/* fread() wrapper: exits only when the read is short AND ferror(stream) is set; any other short read is returned to the caller. */
 99 | 	size_t rmemb=0;
100 | 
101 | 	rmemb = fread(ptr, size, nmemb, stream);
102 | 	if (rmemb < nmemb && ferror(stream)) {
103 | 		fprintf(stderr, "Error while reading from file, could not read more than %zu elements!\n", rmemb);
104 | 		exit(EXIT_FAILURE);
105 | 	}
106 | 	return rmemb;
107 | }
108 | 
109 | int Remove(const char *pathname) {
110 | 	/* remove() wrapper: a failure with errno == ENOENT is tolerated and the non-zero result is returned; any other failure exits. */
111 | 	int rv = remove(pathname);
112 | 	if (rv && errno != ENOENT) {
113 | 		fprintf(stderr, "Error removing file %s: %s\n", pathname, strerror(errno));
114 | 		exit(EXIT_FAILURE);
115 | 	}
116 | 	return rv;
117 | }
118 | 
119 | off_t getFsize(const char *fpath) {
120 |         /* Returns st_size as reported by stat(fpath); prints the path to stderr and exits if stat fails. */
121 |         struct stat     st;
122 |         int             rv;
123 | 
124 |         rv = stat(fpath, &st);
125 |         if (rv) {
126 |                 fprintf(stderr, "Cannot stat file %s...\n", fpath);
127 |                 exit(EXIT_FAILURE);
128 |         }
129 |         return st.st_size;
130 | }
131 | 
132 | double Wtime(void) {
133 | 	struct timespec tp;
134 | 	/* CLOCK_MONOTONIC reading converted to seconds; returns 0 (no diagnostic) if clock_gettime fails. */
135 | 	int rv = clock_gettime(CLOCK_MONOTONIC, &tp);
136 | 	if(rv) return 0;
137 | 
138 | 	return tp.tv_nsec/1.0E+9 + (double)tp.tv_sec;
139 | }
140 | 


--------------------------------------------------------------------------------
/optimized/cuBlumeCapel/utils.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Mauro Bisson <maurob@nvidia.com>
 5 |  *
 6 |  * Permission is hereby granted, free of charge, to any person obtaining a
 7 |  * copy of this software and associated documentation files (the "Software"),
 8 |  * to deal in the Software without restriction, including without limitation
 9 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 |  * and/or sell copies of the Software, and to permit persons to whom the
11 |  * Software is furnished to do so, subject to the following conditions:
12 |  *
13 |  * The above copyright notice and this permission notice shall be included in
14 |  * all copies or substantial portions of the Software.
15 |  *
16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 |  * DEALINGS IN THE SOFTWARE.
23 |  */
24 | #ifndef __UTILS_H__
25 | #define __UTILS_H__
26 | #include <stdio.h>      /* FILE, size_t: makes this header usable without a prior include */
27 | #ifdef __cplusplus
28 | #define UTILS_LINKAGE "C"
29 | #else
30 | #define UTILS_LINKAGE
31 | #endif
32 | #include <sys/types.h>  /* off_t, returned by getFsize */
33 | extern UTILS_LINKAGE void *Malloc(size_t sz);
34 | extern UTILS_LINKAGE void Free(void **ptr);
35 | extern UTILS_LINKAGE void *Realloc(void *ptr, size_t sz);
36 | extern UTILS_LINKAGE FILE *Fopen(const char *path, const char *mode);
37 | extern UTILS_LINKAGE size_t Fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
38 | extern UTILS_LINKAGE size_t Fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
39 | extern UTILS_LINKAGE int Remove(const char *pathname);
40 | extern UTILS_LINKAGE off_t getFsize(const char *fpath);
41 | extern UTILS_LINKAGE double Wtime(void);
42 | 
43 | #endif
44 | 


--------------------------------------------------------------------------------
/optimized/cuBlumeCapel/vmm_alloc.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * Permission is hereby granted, free of charge, to any person obtaining a
  5 |  * copy of this software and associated documentation files (the "Software"),
  6 |  * to deal in the Software without restriction, including without limitation
  7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8 |  * and/or sell copies of the Software, and to permit persons to whom the
  9 |  * Software is furnished to do so, subject to the following conditions:
 10 |  *
 11 |  * The above copyright notice and this permission notice shall be included in
 12 |  * all copies or substantial portions of the Software.
 13 |  *
 14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 20 |  * DEALINGS IN THE SOFTWARE.
 21 |  */
 22 | #include <stdio.h>
 23 | #include <stdlib.h>
 24 | #include <cuda.h>
 25 | #include <mpi.h>
 26 | #include "vmm_alloc.h"
 27 | 
 28 | #define MIN(x,y) (((x)<(y))?(x):(y))
 29 | #define MAX(x,y) (((x)>(y))?(x):(y))
 30 | 
 31 | #define DIV_UP(a,b) (((a)+((b)-1))/(b))
 32 | 
 33 | #define MAX_DEVICE_NAME (256)
 34 | 
 35 | #define CHECK_CUDA(call) {                                                   \
 36 |     cudaError_t err = call;                                                    \
 37 |     if( cudaSuccess != err) {                                                \
 38 |         fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
 39 |                 __FILE__, __LINE__, cudaGetErrorString( err) );              \
 40 |         exit(EXIT_FAILURE);                                                  \
 41 |     }}
 42 | 
 43 | #define CHECK_CU(call) {                                                        \
 44 |     CUresult res = call;                                                        \
 45 |     if(CUDA_SUCCESS != res) {                                                   \
 46 | 	const char *errstr=NULL;                                                \
 47 | 	cuGetErrorName(res, &errstr);                                           \
 48 |         fprintf(stderr, "Cuda driver API error in file '%s' in line %d: %s.\n", \
 49 |                 __FILE__, __LINE__, errstr);                                    \
 50 |         exit(EXIT_FAILURE);                                                     \
 51 |     }}
 52 | 
 53 | static void *Malloc(size_t sz) {
 54 | 
 55 | 	void *ptr;
 56 | 
 57 | 	ptr = (void *)malloc(sz);
 58 | 	if (!ptr) {
 59 | 		fprintf(stderr, "Cannot allocate %zu bytes...\n", sz);
 60 | 		exit(EXIT_FAILURE);
 61 | 	}
 62 | 	return ptr;
 63 | }
 64 | 
 65 | size_t vmmFabricGranularity(int device) {
 66 | 
 67 | 	CUmemAllocationProp prop = {};
 68 | 
 69 | 	prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
 70 | 	prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
 71 | 	prop.location.id = device;
 72 | 
 73 | 	// necessary to export the handle for remote memory access via NVLink
 74 | 	prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
 75 | 
 76 | 	size_t granularity = 0;
 77 | 	CHECK_CU(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
 78 | 
 79 | 	return granularity;
 80 | }
 81 | 
 82 | // call to "allocate" physical memory (cuMemCreate() handle) on GPU "device"
 83 | // On entry size contains de desired size of the allocation; on exit the actual
 84 | // size, which must be a multiple of the granularity
 85 | static CUmemGenericAllocationHandle allocatePhysicalMemory(int device, size_t size) {
 86 | 
 87 | 	CUmemAllocationProp prop = {};
 88 | 
 89 | 	prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
 90 | 	prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
 91 | 	prop.location.id = device;
 92 | 
 93 | 	// necessary to export the handle for remote memory access via NVLink
 94 | 	prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
 95 | 
 96 | 	size_t granularity = 0;
 97 | 	CHECK_CU(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
 98 | 
 99 | 	if (size % granularity) {
100 | 	
101 | 		cudaDeviceProp props;
102 | 		CHECK_CUDA(cudaGetDeviceProperties(&props, device));
103 | 
104 | 		int nameLen;
105 | 		char procName[MPI_MAX_PROCESSOR_NAME];
106 | 		MPI_Get_processor_name(procName, &nameLen);	
107 | 	
108 | 		fprintf(stderr,
109 | 			"%s:%d: error, requested allocation size (%zu bytes) is "
110 | 			"not a multiple of minimum supported granularity (%zu bytes) "
111 | 			"for device %d (%s) on node %s!\n",
112 | 			__func__, __LINE__, size, granularity, device, props.name, procName);
113 | 		MPI_Abort(MPI_COMM_WORLD, 0);
114 | 	}
115 | 
116 | 	// Ensure size matches granularity requirements for the allocation
117 | 	//size_t padded_size = DIV_UP(size, granularity)*granularity;
118 | #if 0
119 | 	printf("%s:%d: device %d, padded_size: %zu\n", __func__, __LINE__, device, padded_size);
120 | #endif
121 | 	// Allocate physical memory
122 | 	CUmemGenericAllocationHandle allocHandle;
123 | 
124 | 	//printf("device: %d, size: %zu\n", device, size);
125 | 	CHECK_CU(cuMemCreate(&allocHandle, size, &prop, 0));
126 | 
127 | 	return allocHandle;
128 | }
129 | 
// Grants GPU "device" read/write access to the "size" bytes of virtual
// address space that start at "ptr", through a single-descriptor
// cuMemSetAccess() call. A failing driver call terminates the process.
static void setAccessOnDevice(int device, CUdeviceptr ptr, size_t size) {

	CUmemAccessDesc rwDesc = {};

	rwDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
	rwDesc.location.id = device;
	rwDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;

	// Make the address accessible
	CHECK_CU(cuMemSetAccess(ptr, size, &rwDesc, 1));
}
145 | 
// Collectively allocates sizePerGpu bytes of fabric-exportable memory on every
// GPU visible to every rank of MPI_COMM_WORLD and maps all the allocations,
// in (rank, device) order, into one contiguous virtual address range that is
// made read/write accessible from each local GPU.
//
// Must be called by all ranks, after MPI has been initialized. sizePerGpu must
// be a multiple of the allocation granularity (see vmmFabricGranularity());
// all ranks must see the same number of GPUs and all GPUs must report the same
// device name. A violated requirement or a failing CUDA call terminates the
// job (exit() / MPI_Abort() with a non-zero code).
//
// On return *devPtr holds the base address of the range. The returned context
// must be handed to vmmFabricFree() to release the mappings, the handles and
// the address range.
vmmAllocCtx_t *vmmFabricMalloc(void **devPtr, size_t sizePerGpu) {

	int inited = 0;
	MPI_Initialized(&inited);

	if (!inited) {
		fprintf(stderr,
			"%s:%d: error, MPI must be initialized  before calling this function!\n",
			__func__, __LINE__);
		exit(EXIT_FAILURE);
	}

	int rank, ntask;

	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &ntask);

	// one fixed-size name slot per rank; every rank fills its own slot and
	// rank 0 additionally collects the slots of all the other ranks
	char (*procNames)[MPI_MAX_PROCESSOR_NAME] = (char (*)[MPI_MAX_PROCESSOR_NAME])Malloc(sizeof(*procNames)*ntask);
	int nameLen;
	MPI_Get_processor_name(procNames[rank], &nameLen);

	// On the root the contribution already sits in its slot of the receive
	// buffer: MPI does not allow overlapping send/receive buffers, so the
	// root must use MPI_IN_PLACE instead of sending from procNames[0].
	if (!rank) {
		MPI_Gather(MPI_IN_PLACE, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
			   procNames, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0, MPI_COMM_WORLD);
	} else {
		// receive arguments are not significant on non-root ranks
		MPI_Gather(procNames[rank], MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
			   NULL, 0, MPI_CHAR, 0, MPI_COMM_WORLD);
	}

	int ndev = 0;
	CHECK_CUDA(cudaGetDeviceCount(&ndev));

	// bitwise OR and AND of the per-rank counts coincide only if every rank
	// contributed the same value
	int ndev_or;
	int ndev_and;
	MPI_Allreduce(&ndev, &ndev_or,  1, MPI_INT, MPI_BOR,  MPI_COMM_WORLD);
	MPI_Allreduce(&ndev, &ndev_and, 1, MPI_INT, MPI_BAND, MPI_COMM_WORLD);
	if (ndev_or != ndev_and) {
		if (!rank) {
			fprintf(stderr,
				"%s:%d: error, not all processes have the same number of GPUs!\n",
				__func__, __LINE__);
		}
		MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
	}

	// local GPUs
	cudaDeviceProp *props = (cudaDeviceProp *)Malloc(sizeof(*props)*ndev);
	for(int i = 0; i < ndev; i++) {
		CHECK_CUDA(cudaGetDeviceProperties(props+i, i));
	}

	// check local GPUs support
	for(int i = 0; i < ndev; i++) {

		int deviceSupportsVmm;
		CHECK_CU(cuDeviceGetAttribute(&deviceSupportsVmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, i));
		if (!deviceSupportsVmm) {
			fprintf(stderr,
				"%s:%d: error, device %d (%s) on node %s does NOT support Virtual Memory Management!\n",
				__func__, __LINE__, i, props[i].name, procNames[rank]);
			MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
		}

		// FOR FABRIC
		int deviceSupportsFabricMem;
		CHECK_CU(cuDeviceGetAttribute(&deviceSupportsFabricMem, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, i));
		if (deviceSupportsFabricMem == 0) {
			fprintf(stderr,
				"%s:%d: error, device %d (%s) on node %s does NOT support Fabric Handles!\n",
				__func__, __LINE__, i, props[i].name, procNames[rank]);
			MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
		}
	}

	// check that all GPUs are of the same kind (this may be relaxed)
	cudaDeviceProp *props_all = NULL;
	if (!rank) {
		props_all = (cudaDeviceProp *)Malloc(sizeof(*props)*ntask*ndev);
	}

	// the property structs are shipped as opaque byte blobs
	MPI_Datatype MPI_DEV_PROP;
	MPI_Type_contiguous(sizeof(cudaDeviceProp), MPI_BYTE, &MPI_DEV_PROP);
	MPI_Type_commit(&MPI_DEV_PROP);

	MPI_Gather(props, ndev, MPI_DEV_PROP, props_all, ndev, MPI_DEV_PROP, 0, MPI_COMM_WORLD);

	// the datatype is only needed for the gather above
	MPI_Type_free(&MPI_DEV_PROP);

	if (!rank) {
		// comparing each device name with its predecessor is enough to
		// detect any difference in the whole (rank, device) sequence
		for(int i = 1; i < ntask*ndev; i++) {
			if (strncmp(props_all[i-1].name, props_all[i].name, MAX_DEVICE_NAME)) {
				fprintf(stderr,
					"%s:%d: error, device %d from proc %d (%s) and "
					"device %d from proc %d (%s) are different:\n"
					"\t%s\n\t%s\n",
					__func__, __LINE__,
					(i-1)%ndev, (i-1)/ndev, procNames[(i-1)/ndev],
					 i   %ndev,  i   /ndev, procNames[ i   /ndev],
					props_all[i-1].name, props_all[i].name);
				MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
			}
		}
	}
	free(props);
	free(props_all);

	// allocate local handles; the array has one entry per (rank, device)
	// pair and the entries of remote ranks are filled in by the import below
	CUmemGenericAllocationHandle *handles = (CUmemGenericAllocationHandle *)Malloc(sizeof(*handles)*ntask*ndev);
	memset(handles, 0, sizeof(*handles)*ntask*ndev);

	for(int i = 0; i < ndev; i++) {
		handles[rank*ndev + i] = allocatePhysicalMemory(i, sizePerGpu);
	}

	// export local handles
	CUmemFabricHandle *fabricHandles = (CUmemFabricHandle *)Malloc(sizeof(*fabricHandles)*ntask*ndev);
	memset(fabricHandles, 0, sizeof(*fabricHandles)*ntask*ndev);
	for(int i = 0; i < ndev; i++) {
		CHECK_CU(cuMemExportToShareableHandle(&fabricHandles[ndev*rank + i],
						      handles[ndev*rank + i],
						      CU_MEM_HANDLE_TYPE_FABRIC, 0));
	}

	// distribute local handles: every rank contributes, in place, the ndev
	// entries of its own section of the array
	MPI_Datatype MPI_FABRIC_HANDLE;
	MPI_Type_contiguous(sizeof(CUmemFabricHandle), MPI_BYTE, &MPI_FABRIC_HANDLE);
	MPI_Type_commit(&MPI_FABRIC_HANDLE);

	MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, fabricHandles, ndev, MPI_FABRIC_HANDLE, MPI_COMM_WORLD);

	// the datatype is only needed for the allgather above
	MPI_Type_free(&MPI_FABRIC_HANDLE);

	// import remote handles (the local ones are already in "handles")
	for(int i = 0; i < ntask; i++) {
		if (i == rank) {
			continue;
		}
		for(int d = 0; d < ndev; d++) {
			CHECK_CU(cuMemImportFromShareableHandle(&handles[i*ndev + d],
							        &fabricHandles[i*ndev + d],
								CU_MEM_HANDLE_TYPE_FABRIC));
		}
	}
	// the exported descriptors are not used past this point
	free(fabricHandles);

	// create a (large) Virtual Address range and map local and remote handles
	const size_t totalSize = sizePerGpu*size_t(ntask)*size_t(ndev);

	CUdeviceptr cuptr;
	CHECK_CU(cuMemAddressReserve(&cuptr, totalSize, 0, 0, 0));

	for(int i = 0; i < ntask; i++) {
		for(int d = 0; d < ndev; d++) {
			// position of the (rank i, device d) chunk in both the
			// handle array and the virtual address range
			const size_t slot = size_t(i)*size_t(ndev) + size_t(d);
			CHECK_CU(cuMemMap(cuptr + slot*sizePerGpu,
					  sizePerGpu, 0, handles[slot], 0));
		}
	}

	// every local GPU gets read/write access to the whole range
	for(int d = 0; d < ndev; d++) {
		setAccessOnDevice(d, cuptr, totalSize);
	}

	free(procNames);

	vmmAllocCtx_t *ctx = (vmmAllocCtx_t *)Malloc(sizeof(*ctx));

	ctx->cuptr = cuptr;
	ctx->virtAddrRangeSize = totalSize;

	ctx->handles = handles;

	*devPtr = (void *)cuptr;

	return ctx;
}
313 | 
// Releases everything recorded in a context returned by vmmFabricMalloc():
// unmaps the whole virtual address range, releases the allocation handle of
// every (rank, device) pair, frees the address range and finally the handle
// array and the context itself. Passing NULL is a no-op, as with free().
//
// The number of handles is recomputed as (size of MPI_COMM_WORLD) x (local
// device count), i.e. it is assumed that both are unchanged since the
// allocation. A failing CUDA call terminates the process.
void vmmFabricFree(vmmAllocCtx_t *ctx) {

	if (!ctx) {
		return;
	}

	int ndev = 0;
	CHECK_CUDA(cudaGetDeviceCount(&ndev));

	int rank, ntask;

	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &ntask);

	CHECK_CU(cuMemUnmap(ctx->cuptr, ctx->virtAddrRangeSize));

	// both the locally created and the imported handles are released
	for(int i = 0; i < ntask; i++) {
		for(int d = 0; d < ndev; d++) {
			CHECK_CU(cuMemRelease(ctx->handles[i*ndev + d]));
		}
	}
	CHECK_CU(cuMemAddressFree(ctx->cuptr, ctx->virtAddrRangeSize));

	free(ctx->handles);
	free(ctx);

	return;
}
338 | 


--------------------------------------------------------------------------------
/optimized/cuBlumeCapel/vmm_alloc.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Permission is hereby granted, free of charge, to any person obtaining a
 5 |  * copy of this software and associated documentation files (the "Software"),
 6 |  * to deal in the Software without restriction, including without limitation
 7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 8 |  * and/or sell copies of the Software, and to permit persons to whom the
 9 |  * Software is furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 |  * DEALINGS IN THE SOFTWARE.
21 |  */
22 | #ifndef __VMM_ALLOC_H__
23 | #define __VMM_ALLOC_H__
24 | 
// Bookkeeping for one allocation made by vmmFabricMalloc(); it is what
// vmmFabricFree() needs in order to undo that allocation.
typedef struct {

	// base address of the reserved virtual address range
	CUdeviceptr cuptr;
	// size in bytes of that range
	// (sizePerGpu * number of ranks * number of devices per rank)
	size_t virtAddrRangeSize;

	// heap-allocated array with one allocation handle per (rank, device)
	// pair, stored at index rank*ndev + device
	CUmemGenericAllocationHandle *handles;

} vmmAllocCtx_t;
33 | 
34 | #ifdef __cplusplus
35 | extern "C" {
36 | #endif
37 | 
// Helper to obtain the minimum size (allocation granularity, in bytes)
// for fabric allocations on the given device.
size_t vmmFabricGranularity(int device);

// Allocates sizePerGpu bytes on each device
// visible to each MPI rank and returns to
// each caller, through *devPtr, the starting
// address of a Virtual Address range to which
// all the allocations are mapped. Mappings are
// performed in Rank,DeviceId order:
//
// <Rank   0, Device 0><Rank   0, Device 1>, <Rank   0, Device N-1>,
// <Rank   1, Device 0><Rank   1, Device 1>, <Rank   1, Device N-1>,
// ...
// <Rank M-1, Device 0><Rank M-1, Device 1>, <Rank M-1, Device N-1>,
//
// Remote memories are accessed via FABRIC handles.
//
// The returned context must be passed to
// vmmFabricFree() to release the allocation.
//
// Requirements:
//   * MPI must be initialized and the call must be
//     made by all the ranks of MPI_COMM_WORLD;
//   * sizePerGpu must be a multiple of the value
//     returned by vmmFabricGranularity();
//   * all ranks must have access to the same number of GPUs;
//   * all the GPUs must be the same type;
vmmAllocCtx_t *vmmFabricMalloc(void **devPtr, size_t sizePerGpu);

// Unmaps and releases all the memory described by ctx, frees the
// Virtual Address range and then ctx itself.
void vmmFabricFree(vmmAllocCtx_t *ctx);
61 | 
62 | #ifdef __cplusplus
63 | }
64 | #endif
65 | 
66 | #endif
67 | 


--------------------------------------------------------------------------------
/optimized/cuIsingModel/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 NVIDIA Corporation
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/optimized/cuIsingModel/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_HOME = /opt/cuda-12.8.1
 2 | CUDACC = $(CUDA_HOME)/bin/nvcc
 3 | CC = gcc
 4 | LD = $(CUDACC)
 5 | 
 6 | CFLAGS = -c -O3 -g -I$(CUDA_HOME)/include
 7 | 
 8 | SMS ?= 89
 9 | CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM))
10 | 
11 | CUDACFLAGS = -c -O3 -lineinfo $(CUDA_ARCH) -Xptxas=-v
12 | LDFLAGS    = -Xcompiler=-fopenmp -lquadmath
13 | 
14 | C_SRCS  = utils.c
15 | CU_SRCS = main.cu
16 |                                 
17 | ifdef USE_MNNVL
18 | $(info Compiling with MNNVL support...)
19 | MPI_HOME = /cm/shared/apps/openmpi/4.1.5
20 | 
21 | CUDACFLAGS += -I$(MPI_HOME)/include -DUSE_MNNVL
22 | LDFLAGS    += -L$(MPI_HOME)/lib -lcuda -lmpi
23 | CU_SRCS    += vmm_alloc.cu
24 | endif
25 | 
26 | C_OBJS = $(patsubst %.c, %.o, $(C_SRCS))
27 | CU_OBJS = $(patsubst %.cu, %.o, $(CU_SRCS))
28 | 
29 | all: cuIsing
30 | 
31 | cuIsing: $(CU_OBJS) $(C_OBJS)
32 | 	$(LD) -o cuIsing $(CU_OBJS) $(C_OBJS) $(LDFLAGS)
33 | 
34 | %.o: %.cu
35 | 	$(CUDACC) $(CUDACFLAGS) $<
36 | 
37 | %.o: %.c
38 | 	$(CC) $(CFLAGS) $< -o $@
39 | 
# Removes build products. The redirection is spelled in POSIX form: make runs
# recipes with /bin/sh, where the bash-only "&>" is parsed as "run in the
# background" followed by a separate redirection.
clean:
	-@rm -f *.o cuIsing *.sass > /dev/null 2>&1 || true
42 | 


--------------------------------------------------------------------------------
/optimized/cuIsingModel/README.md:
--------------------------------------------------------------------------------
  1 | # A CUDA implementation for the Ising model supporting Multi-Node NVLink
  2 | 
  3 | A high performance Ising model implementation for GPU. The code can run on
  4 | multiple GPUs connected to the same node or on multiple nodes connected via
  5 | NVLink (MNNVL).
  6 | 
  7 | To compile the code to run on a single node, adjust the Makefile to point to your CUDA
  8 | installation, specify the CUDA architecture you want to compile for and then
  9 | run `make`. That should be enough to produce the ``cuIsing`` binary.
 10 | 
 11 | For multi-node, in addition to the Makefile adjustment above, also modify it to 
 12 | point to your MPI installation and then compile it with `make USE_MNNVL=1`.
 13 | 
 14 | When running on a single node, the code uses managed memory. On multiple nodes
 15 | with MNNVL, it uses [fabric memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#fabric-memory).
 16 | 
 17 | When more than one GPU is used, the spin system is partitioned vertically.
 18 | 
 19 | ## Usage
 20 | 
 21 | <PRE>
 22 | Usage: cuIsing [options]
 23 | options:
 24 |         -x|--x &lt;HORIZ_DIM&gt;
 25 |                 Specifies the horizontal dimension of the entire  lattice  (black+white  spins).
 26 |                 This dimension must be a multiple of 4096.
 27 | 
 28 |         -y|--y &lt;VERT_DIM&gt;
 29 |                 Specifies the vertical dimension of the per-GPU lattice.  This dimension must be
 30 |                 a multiple of 16.
 31 | 
 32 |         -n|--n &lt;NSTEPS&gt;
 33 |                 Specifies the number of iterations to run.
 34 |                 Default: 1
 35 | 
 36 |         -g|--gpus &lt;NUM_DEVICES&gt;
 37 |                 Specifies the number of GPUs to use. Will use devices with ids [0, NUM_DEVS-1].
 38 |                 Default: 1.
 39 | 
 40 |         -s|--seed &lt;SEED&gt;
 41 |                 Specifies the seed used to generate random numbers.
 42 |                 Default: 463463564571
 43 | 
 44 |         -a|--alpha &lt;ALPHA&gt;
 45 |                 Specifies the temperature in T_CRIT units.  If both this  option  and  '-t'  are
 46 |                 specified then the '-t' option is used.
 47 |                 Default: 0.100000
 48 | 
 49 |         -t|--temp &lt;TEMP_0&gt;[[,&lt;IT_1&gt;:&lt;TEMP_1&gt;]...]
 50 |                 Specifies the temperature(s), in absolute  units.   It  is  possible  to  use  a
 51 |                 temperature-changing   protocol   by   specifying   a   sequence   of    couples
 52 |                 &lt;IT_i&gt;:&lt;TEMP_i&gt; after the first temperature &lt;TEMP_0&gt;. The value &lt;IT_i&gt; specifies
 53 |                 the time step at which the temperature  changes  from  &lt;TEMP_i-1&gt;  to  &lt;TEMP_i&gt;.
 54 |                 Temperature &lt;TEMP_0&gt; is the starting temperature and thus  does  not  require  a
 55 |                 time step specification. 
 56 |                 Default: 0.226919
 57 | 
 58 |         -p|--print &lt;STAT_FREQ&gt;
 59 |                 Specifies the frequency, in number of iterations, with which  the  magnetization
 60 |                 statistics are printed. If this option is used with --pexp, this option is ignored.
 61 |                 Default: only at the beginning and at end of the simulation
 62 | 
 63 |         --pexp
 64 |                 Prints statistics every power-of-2 time steps.  This  option  overrides  the  -p
 65 |                 option.
 66 |                 Default: disabled
 67 | 
 68 |         -c|--corr &lt;CORR_FILE_PATH&gt;
 69 |                 Enables correlation and writes to file CORR_FILE_PATH  the  correlation of  each
 70 |                 point with the vertical and horizontal neighbors at distance r &lt;= 256.  Beyond
 71 |                 that, distances are chosen according to an exponential rule, with 32 values  per
 72 |                 power of 2.  The  correlation  is  computed  every  time  the  magnetization  is
 73 |                 printed on screen (based on either the '-p' or the '--pexp' option)  and  it  is
 74 |                 written in the file, one line per measure.
 75 |                 Default: full correlation (see --corrfull option)
 76 | 
 77 |         --corrfull
 78 |                 Compute the correlation for each spin in the system.
 79 | 
 80 |         --corrdiag
 81 |                 Compute the correlation only for diagonal spins.
 82 | 
 83 |         --corrchkb
 84 |                 Computes the correlation for only one spin (the top-left one)  for each block of
 85 |                 16x16 spins (checkerboard pattern).
 86 | 
 87 |         --corrmixd
 88 |                 Computes the correlation using a mix of full and checkerboard modes.   The  full
 89 |                 correlation is used for  all distances  r &lt;= 32. Then,  for each spin in a 16x16
 90 |                 square, it is computed for each r &gt; 32.
 91 | 
 92 |         --writechkp &lt;CHECKPOINT_FILE_PATH&gt;
 93 |                 Enables write of checkpoint file at the end of the simulation.  The file can  be
 94 |                 later used to resume the simulation with the '-r' option.  This option and  '-r'
 95 |                 can be used together to break down a  large  run  into  multiple  smaller  runs.
 96 | 
 97 |         --readchkp &lt;CHECKPOINT_FILE_PATH&gt;
 98 |                 Enables the restart of a simulation from the state in a checkpoint file.  Please
 99 |                 note that in order for that to work, the non-checkpoint  command  lines  options
100 |                 used in the run where the checkpoint file was created must match with those used
101 |                 in the run where the checkpoint file is read.  This option and '-w' can be  used
102 |                 together  to  break   down   a   large   run   into   multiple   smaller   runs.
103 | 
104 |         -o|--o
105 |                 Enables the file dump of  the lattice  every time  the magnetization is printed.
106 |                 Default: off
107 | </PRE>
108 | 
109 | For example, to run 102400 steps on a 16384^2 lattice using one GPU, using temperature 1.5 and
110 | printing the statistics every 10240 steps:
111 | 
112 | <PRE>
113 | $ ./cuIsing -y 16384 -x 16384 -n 102400 -p 10240 -t 1.5
114 | 
115 | Using GPUs:
116 |          0 (NVIDIA RTX 6000 Ada Generation, 48 GB, 142 SMs, 1536 th/SM max, CC 8.9, ECC off)
117 | 
118 | Run configuration:
119 |         word size: 16
120 |         bits per spin: 1 (mask: 0x1)
121 |         spins/word: 128
122 |         spins: 268435456 (~2.68E+08)
123 |         seed: 463463564571
124 |         block size (X, Y): 16, 16
125 |         tile  size (X, Y): 16, 16
126 |         grid  size (X, Y): 4, 1024
127 |         spins per tile (X, Y): 2048, 2048
128 | 
129 |         iterations:
130 |                 beg: 1
131 |                 end: 102400
132 |                 tot: 102400
133 | 
134 |         print stats every 10240 steps
135 |         temp: 1.5 (0.661030190265538*T_crit)
136 | 
137 |         local lattice size:         16384 x    16384 spins
138 |         local lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
139 | 
140 |         total lattice size:         16384 x    16384 spins
141 |         total lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
142 | 
143 |         total memory: 0.03 GB (0.03 GB per GPU)
144 | 
145 |         random-bit table:
146 |                 size of element: 32-bit
147 |                 no. of elements: 16
148 |                 bits per lookup: 4
149 | 
150 | Setting up GPUs:
151 |         GPU  0 done in 0.001597 secs
152 | 
153 | Initializing spin lattice... done in 0.011790 secs
154 | 
155 | Running simulation...
156 | 
157 |         Step          Magn.          N(-1)           N(1)     SD value     flips/ns         GB/s          ERT
158 | 
159 |            0   8.381903E-05      134206478      134228978    16.936372
160 |        10240   5.389421E-02      141451286      126984170     1.000717      1399.28       527.46       19.65s
161 |        20480   6.544993E-02      143002269      125433187     0.999917      1392.13       524.77       19.70s
162 |        30720   7.027917E-02      143650439      124785017     1.000416      1387.92       523.18       19.74s
163 |        40960   7.348213E-02      144080332      124355124     0.998606      1385.64       522.32       19.76s
164 |        51200   7.878675E-02      144792307      123643149     1.000069      1385.46       522.25       19.78s
165 |        61440   8.068839E-02      145047541      123387915     0.997942      1384.75       521.99       19.79s
166 |        71680   7.845285E-02      144747491      123687965     1.000395      1383.86       521.65       19.80s
167 |        81920   7.937136E-02      144870771      123564685     1.000686      1378.90       519.78       19.82s
168 |        92160   7.773913E-02      144651698      123783758     0.998647      1375.31       518.43       19.84s
169 |       102400   8.023911E-02      144987239      123448217     1.000491      1371.54       517.00       19.86s
170 | 
171 | Final energy: -1.949967
172 | 
173 | Done in 1.986138E+04 ms (stats overhead: 0.05%, spins/ns: 1383.98, BW: 521.70 GB/s)
174 | </PRE>
175 | 
176 | Run 307200 steps on a 16384^2 lattice using one GPU, in three distinct runs
177 | each of 102400 steps using checkpointing:
178 | 
179 | <PRE>
180 | $ ./cuIsing -y 16384 -x 16384 -n 102400 -p 10240 -t 1.5 -w chkpfile
181 | 
182 | Using GPUs:
183 |          0 (NVIDIA RTX 6000 Ada Generation, 48 GB, 142 SMs, 1536 th/SM max, CC 8.9, ECC off)
184 | 
185 | Run configuration:
186 |         word size: 16
187 |         bits per spin: 1 (mask: 0x1)
188 |         spins/word: 128
189 |         spins: 268435456 (~2.68E+08)
190 |         seed: 463463564571
191 |         block size (X, Y): 16, 16
192 |         tile  size (X, Y): 16, 16
193 |         grid  size (X, Y): 4, 1024
194 |         spins per tile (X, Y): 2048, 2048
195 | 
196 |         iterations:
197 |                 beg: 1
198 |                 end: 102400
199 |                 tot: 102400
200 | 
201 |         print stats every 10240 steps
202 |         temp: 1.5 (0.661030190265538*T_crit)
203 | 
204 |         local lattice size:         16384 x    16384 spins
205 |         local lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
206 | 
207 |         total lattice size:         16384 x    16384 spins
208 |         total lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
209 | 
210 |         total memory: 0.03 GB (0.03 GB per GPU)
211 | 
212 |         random-bit table:
213 |                 size of element: 32-bit
214 |                 no. of elements: 16
215 |                 bits per lookup: 4
216 | 
217 | Setting up GPUs:
218 |         GPU  0 done in 0.001700 secs
219 | 
220 | Initializing spin lattice... done in 0.012194 secs
221 | 
222 | Running simulation...
223 | 
224 |         Step          Magn.          N(-1)           N(1)     SD value     flips/ns         GB/s          ERT
225 | 
226 |            0   8.381903E-05      134206478      134228978    16.936372
227 |        10240   5.389421E-02      141451286      126984170     1.000717      1351.59       509.49       20.34s
228 |        20480   6.544993E-02      143002269      125433187     0.999917      1352.59       509.86       20.34s
229 |        30720   7.027917E-02      143650439      124785017     1.000416      1347.67       508.01       20.36s
230 |        40960   7.348213E-02      144080332      124355124     0.998606      1349.08       508.54       20.36s
231 |        51200   7.878675E-02      144792307      123643149     1.000069      1351.91       509.61       20.36s
232 |        61440   8.068839E-02      145047541      123387915     0.997942      1355.41       510.93       20.35s
233 |        71680   7.845285E-02      144747491      123687965     1.000395      1353.09       510.05       20.34s
234 |        81920   7.937136E-02      144870771      123564685     1.000686      1352.15       509.70       20.34s
235 |        92160   7.773913E-02      144651698      123783758     0.998647      1347.46       507.93       20.35s
236 |       102400   8.023911E-02      144987239      123448217     1.000491      1345.72       507.27       20.36s
237 | 
238 | Final energy: -1.949967
239 | 
240 | Done in 2.035810E+04 ms (stats overhead: 0.05%, spins/ns: 1350.21, BW: 508.97 GB/s)
241 | 
242 | Writing checkpoint to file chkpfile... done in 0.083085 secs
243 | </PRE>
244 | <PRE>
245 | $ ./cuIsing -y 16384 -x 16384 -n 102400 -p 10240 -t 1.5 -w chkpfile -r chkpfile
246 | 
247 | Using GPUs:
248 |          0 (NVIDIA RTX 6000 Ada Generation, 48 GB, 142 SMs, 1536 th/SM max, CC 8.9, ECC off)
249 | 
250 | Reading checkpoint from file chkpfile... done in 0.010425 secs
251 | 
252 | Run configuration:
253 |         word size: 16
254 |         bits per spin: 1 (mask: 0x1)
255 |         spins/word: 128
256 |         spins: 268435456 (~2.68E+08)
257 |         seed: 463463564571
258 |         block size (X, Y): 16, 16
259 |         tile  size (X, Y): 16, 16
260 |         grid  size (X, Y): 4, 1024
261 |         spins per tile (X, Y): 2048, 2048
262 | 
263 |         iterations:
264 |                 beg: 102401
265 |                 end: 204800
266 |                 tot: 102400
267 | 
268 |         print stats every 10240 steps
269 |         temp: 1.5 (0.661030190265538*T_crit)
270 | 
271 |         local lattice size:         16384 x    16384 spins
272 |         local lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
273 | 
274 |         total lattice size:         16384 x    16384 spins
275 |         total lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
276 | 
277 |         total memory: 0.03 GB (0.03 GB per GPU)
278 | 
279 |         random-bit table:
280 |                 size of element: 32-bit
281 |                 no. of elements: 16
282 |                 bits per lookup: 4
283 | 
284 | Setting up GPUs:
285 |         GPU  0 done in 0.003768 secs
286 | 
287 | Running simulation...
288 | 
289 |         Step          Magn.          N(-1)           N(1)     SD value     flips/ns         GB/s          ERT
290 | 
291 |       102400   8.023911E-02      144987239      123448217     1.000491
292 |       112640   8.427709E-02      145529207      122906249     0.999487      1369.15       516.10       20.08s
293 |       122880   8.961894E-02      146246178      122189278     1.001249      1362.94       513.76       20.13s
294 |       133120   8.933730E-02      146208378      122227078     0.999772      1356.70       511.41       20.17s
295 |       143360   8.894347E-02      146155518      122279938     1.000053      1356.84       511.46       20.20s
296 |       153600   8.961185E-02      146245227      122190229     1.000030      1352.37       509.78       20.22s
297 |       163840   8.997627E-02      146294138      122141318     0.999970      1352.44       509.81       20.24s
298 |       174080   8.834548E-02      146075257      122360199     1.000698      1352.11       509.68       20.26s
299 |       184320   8.784929E-02      146008660      122426796     1.000313      1349.95       508.87       20.27s
300 |       194560   9.042334E-02      146354143      122081313     1.000820      1348.24       508.22       20.28s
301 |       204800   9.108921E-02      146443515      121991941     1.000014      1346.63       507.62       20.30s
302 | 
303 | Final energy: -1.950272
304 | 
305 | Done in 2.029726E+04 ms (stats overhead: 0.05%, spins/ns: 1354.26, BW: 510.49 GB/s)
306 | 
307 | Writing checkpoint to file chkpfile... done in 0.082859 secs
308 | </PRE>
309 | <PRE>
310 | $ ./cuIsing -y 16384 -x 16384 -n 102400 -p 10240 -t 1.5 -r chkpfile
311 | 
312 | Using GPUs:
313 |          0 (NVIDIA RTX 6000 Ada Generation, 48 GB, 142 SMs, 1536 th/SM max, CC 8.9, ECC off)
314 | 
315 | Reading checkpoint from file chkpfile... done in 0.010423 secs
316 | 
317 | Run configuration:
318 |         word size: 16
319 |         bits per spin: 1 (mask: 0x1)
320 |         spins/word: 128
321 |         spins: 268435456 (~2.68E+08)
322 |         seed: 463463564571
323 |         block size (X, Y): 16, 16
324 |         tile  size (X, Y): 16, 16
325 |         grid  size (X, Y): 4, 1024
326 |         spins per tile (X, Y): 2048, 2048
327 | 
328 |         iterations:
329 |                 beg: 204801
330 |                 end: 307200
331 |                 tot: 102400
332 | 
333 |         print stats every 10240 steps
334 |         temp: 1.5 (0.661030190265538*T_crit)
335 | 
336 |         local lattice size:         16384 x    16384 spins
337 |         local lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
338 | 
339 |         total lattice size:         16384 x    16384 spins
340 |         total lattice shape: 2 x    16384 x       64 ull2s (     2097152 total)
341 | 
342 |         total memory: 0.03 GB (0.03 GB per GPU)
343 | 
344 |         random-bit table:
345 |                 size of element: 32-bit
346 |                 no. of elements: 16
347 |                 bits per lookup: 4
348 | 
349 | Setting up GPUs:
350 |         GPU  0 done in 0.003810 secs
351 | 
352 | Running simulation...
353 | 
354 |         Step          Magn.          N(-1)           N(1)     SD value     flips/ns         GB/s          ERT
355 | 
356 |       204800   9.108921E-02      146443515      121991941     1.000014
357 |       215040   8.998523E-02      146295341      122140115     0.999673      1354.50       510.58       20.30s
358 |       225280   8.892218E-02      146152661      122282795     0.999000      1344.27       506.73       20.38s
359 |       235520   9.020317E-02      146324593      122110863     1.000224      1343.46       506.42       20.41s
360 |       245760   9.139725E-02      146484859      121950597     0.999815      1342.54       506.07       20.43s
361 |       256000   9.055272E-02      146371509      122063947     0.999528      1341.84       505.81       20.44s
362 |       266240   8.986650E-02      146279405      122156051     1.000316      1339.66       504.99       20.45s
363 |       276480   9.154957E-02      146505303      121930153     1.001214      1335.71       503.50       20.47s
364 |       286720   9.230582E-02      146606805      121828651     0.999690      1336.05       503.63       20.49s
365 |       296960   9.236395E-02      146614608      121820848     0.998615      1333.45       502.65       20.50s
366 |       307200   9.218215E-02      146590207      121845249     1.000438      1332.56       502.31       20.51s
367 | 
368 | Final energy: -1.950339
369 | 
370 | Done in 2.051432E+04 ms (stats overhead: 0.05%, spins/ns: 1339.93, BW: 505.09 GB/s)
371 | </PRE>
372 | 
373 | 
374 | To run 128 steps on a 2^20x2^20 lattice using 8 H100 GPUs:
375 | 
376 | <PRE>
377 | $ ./cuIsing -y $((2**20 / 8)) -x $((2**20)) -n 128 -p 128 -t 1.5 -g 8
378 | 
379 | Using GPUs:
380 | 	 0 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
381 | 	 1 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
382 | 	 2 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
383 | 	 3 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
384 | 	 4 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
385 | 	 5 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
386 | 	 6 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
387 | 	 7 (NVIDIA H100 80GB HBM3, 80 GB, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
388 | 
389 | Run configuration:
390 | 	word size: 16
391 | 	bits per spin: 1 (mask: 0x1)
392 | 	spins/word: 128
393 | 	spins: 1099511627776 (~1.10E+12)
394 | 	seed: 463463564571
395 | 	block size (X, Y): 16, 16
396 | 	tile  size (X, Y): 16, 16
397 | 	grid  size (X, Y): 256, 8192
398 | 	spins per tile (X, Y): 2048, 2048
399 | 
400 | 	iterations:
401 | 		beg: 1
402 | 		end: 128
403 | 		tot: 128
404 | 
405 | 	print stats every 128 steps
406 | 	temp: 1.5 (0.661030190265538*T_crit)
407 | 
408 | 	local lattice size:        131072 x  1048576 spins
409 | 	local lattice shape: 2 x   131072 x     4096 ull2s (  1073741824 total)
410 | 
411 | 	total lattice size:       1048576 x  1048576 spins
412 | 	total lattice shape: 2 x  1048576 x     4096 ull2s (  8589934592 total)
413 | 
414 | 	total memory: 128.00 GB (16.00 GB per GPU)
415 | 
416 | 	random-bit table:
417 | 		size of element: 32-bit
418 | 		no. of elements: 16
419 | 		bits per lookup: 4
420 | 
421 | Setting up GPUs:
422 | 	GPU  0 done in 0.001748 secs
423 | 	GPU  1 done in 0.166805 secs
424 | 	GPU  2 done in 0.166164 secs
425 | 	GPU  3 done in 0.166996 secs
426 | 	GPU  4 done in 0.186960 secs
427 | 	GPU  5 done in 0.187743 secs
428 | 	GPU  6 done in 0.182130 secs
429 | 	GPU  7 done in 0.192766 secs
430 | 
431 | Initializing spin lattice... done in 3.404245 secs
432 | 
433 | Running simulation...
434 | 
435 |         Step          Magn.        N(-1)         N(1)     SD value     flips/ns         GB/s          ERT
436 | 
437 |            0   7.547405E-07 549755398965 549756228811    16.936123
438 |          128   3.269196E-05 549737841294 549773786482     1.000580     10306.30      3867.38       13.78s
439 | 
440 | Final energy: -1.908699
441 | 
442 | Done in 1.377803E+04 ms (stats overhead: 0.90%, spins/ns: 10214.63, BW: 3832.98 GB/s)
443 | </PRE>
444 | 
445 | ## Contacts
446 | 
447 | For comments, questions or anything related, write to Mauro Bisson at maurob@nvidia.com.
448 | 


--------------------------------------------------------------------------------
/optimized/cuIsingModel/cudamacro.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Permission is hereby granted, free of charge, to any person obtaining a
 5 |  * copy of this software and associated documentation files (the "Software"),
 6 |  * to deal in the Software without restriction, including without limitation
 7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 8 |  * and/or sell copies of the Software, and to permit persons to whom the
 9 |  * Software is furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 |  * DEALINGS IN THE SOFTWARE.
21 |  */
22 | #ifndef __CUDA_MACRO_H__
23 | #define __CUDA_MACRO_H__
24 | 
/* Runs a CUDA runtime API call and, if it does not return cudaSuccess,
 * prints file, line and the runtime's error string to stderr and exits
 * with EXIT_FAILURE. Expands to a brace-enclosed block. */
#define CHECK_CUDA(call) {                                                   \
    cudaError_t err = call;                                                  \
    if (err != cudaSuccess) {                                                \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
                __FILE__, __LINE__, cudaGetErrorString(err));                \
        exit(EXIT_FAILURE);                                                  \
    }}
32 | 
/* Fetches (and clears) the last CUDA runtime error via cudaGetLastError();
 * if one is pending, prints errorMessage together with file, line and the
 * runtime's error string to stderr and exits with EXIT_FAILURE. */
#define CHECK_ERROR(errorMessage) {                                          \
    cudaError_t err = cudaGetLastError();                                    \
    if (err != cudaSuccess) {                                                \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));  \
        exit(EXIT_FAILURE);                                                  \
    }}
40 | #endif
41 | 


--------------------------------------------------------------------------------
/optimized/cuIsingModel/utils.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * Mauro Bisson <maurob@nvidia.com>
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining a
  7 |  * copy of this software and associated documentation files (the "Software"),
  8 |  * to deal in the Software without restriction, including without limitation
  9 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 10 |  * and/or sell copies of the Software, and to permit persons to whom the
 11 |  * Software is furnished to do so, subject to the following conditions:
 12 |  *
 13 |  * The above copyright notice and this permission notice shall be included in
 14 |  * all copies or substantial portions of the Software.
 15 |  *
 16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 19 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 21 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 22 |  * DEALINGS IN THE SOFTWARE.
 23 |  */
 24 | #include <errno.h>
 25 | #include <stdio.h>
 26 | #include <stdlib.h>
 27 | #include <string.h>
 28 | #include <sys/types.h>
 29 | #include <sys/stat.h>
 30 | #include <unistd.h>
 31 | #include <time.h>
 32 | 
 33 | void *Malloc(size_t sz) {
 34 | 
 35 | 	void *ptr;
 36 | 
 37 | 	if (!sz) {
 38 | 		printf("Allocating zero bytes...\n");
 39 | 		exit(EXIT_FAILURE);
 40 | 	}
 41 | 	ptr = (void *)malloc(sz);
 42 | 	if (!ptr) {
 43 | 		fprintf(stderr, "Cannot allocate %zu bytes...\n", sz);
 44 | 		exit(EXIT_FAILURE);
 45 | 	}
 46 | 	memset(ptr, 0, sz);
 47 | 	return ptr;
 48 | }
 49 | 
 50 | void Free(void **ptr) {
 51 | 
 52 | 	if (*ptr) {
 53 | 		free(*ptr);
 54 | 		*ptr = NULL;
 55 | 	}
 56 | 	return;
 57 | }
 58 | 
 59 | void *Realloc(void *ptr, size_t sz) {
 60 | 
 61 |         void *lp;
 62 | 
 63 | 	if (!sz) {
 64 | 		printf("Re-allocating to zero bytes, are you sure you want this?\n");
 65 | 	}
 66 |         lp = (void *)realloc(ptr, sz);
 67 |         if (!lp && sz) {
 68 |                 fprintf(stderr, "Cannot reallocate to %zu bytes...\n", sz);
 69 |                 exit(EXIT_FAILURE);
 70 |         }
 71 |         return lp;
 72 | }
 73 | 
 74 | FILE *Fopen(const char *path, const char *mode) {
 75 | 
 76 |         FILE *fp = NULL;
 77 |         fp = fopen(path, mode);
 78 |         if (!fp) {
 79 |                 fprintf(stderr, "Cannot open file %s...\n", path);
 80 |                 exit(EXIT_FAILURE);
 81 |         }
 82 |         return fp;
 83 | }
 84 | 
 85 | size_t Fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) {
 86 | 
 87 | 	size_t wmemb=0;
 88 | 
 89 | 	wmemb = fwrite(ptr, size, nmemb, stream);
 90 | 	if (wmemb < nmemb) {
 91 | 		fprintf(stderr, "Error while writing to file!\n");
 92 | 		exit(EXIT_FAILURE);
 93 | 	}
 94 | 	return wmemb;
 95 | }
 96 | 
 97 | size_t Fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
 98 | 
 99 | 	size_t rmemb=0;
100 | 
101 | 	rmemb = fread(ptr, size, nmemb, stream);
102 | 	if (rmemb < nmemb && ferror(stream)) {
103 | 		fprintf(stderr, "Error while reading from file, could not read more than %zu elements!\n", rmemb);
104 | 		exit(EXIT_FAILURE);
105 | 	}
106 | 	return rmemb;
107 | }
108 | 
/*
 * remove() wrapper that tolerates a missing file.
 *
 * Returns the value of remove(). Any failure other than ENOENT is fatal
 * (message with strerror(errno) on stderr, exit with EXIT_FAILURE).
 */
int Remove(const char *pathname) {

	const int status = remove(pathname);

	if (status != 0 && errno != ENOENT) {
		fprintf(stderr, "Error removing file %s: %s\n", pathname, strerror(errno));
		exit(EXIT_FAILURE);
	}
	return status;
}
118 | 
/*
 * Returns the st_size reported by stat() for the file at fpath.
 *
 * A stat() failure is fatal (message on stderr, exit with EXIT_FAILURE).
 */
off_t getFsize(const char *fpath) {

	struct stat info;

	if (stat(fpath, &info) != 0) {
		fprintf(stderr, "Cannot stat file %s...\n", fpath);
		exit(EXIT_FAILURE);
	}
	return info.st_size;
}
131 | 
132 | double Wtime(void) {
133 | 	struct timespec tp;
134 | 
135 | 	int rv = clock_gettime(CLOCK_MONOTONIC, &tp);
136 | 	if(rv) return 0;
137 | 
138 | 	return tp.tv_nsec/1.0E+9 + (double)tp.tv_sec;
139 | }
140 | 


--------------------------------------------------------------------------------
/optimized/cuIsingModel/utils.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Mauro Bisson <maurob@nvidia.com>
 5 |  *
 6 |  * Permission is hereby granted, free of charge, to any person obtaining a
 7 |  * copy of this software and associated documentation files (the "Software"),
 8 |  * to deal in the Software without restriction, including without limitation
 9 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 |  * and/or sell copies of the Software, and to permit persons to whom the
11 |  * Software is furnished to do so, subject to the following conditions:
12 |  *
13 |  * The above copyright notice and this permission notice shall be included in
14 |  * all copies or substantial portions of the Software.
15 |  *
16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 |  * DEALINGS IN THE SOFTWARE.
23 |  */
24 | #ifndef __UTILS_H__
25 | #define __UTILS_H__
26 | 
/* Keep the header self-contained: the prototypes below need size_t and FILE
 * (<stdio.h>) and off_t (<sys/types.h>). */
#include <stdio.h>
#include <sys/types.h>

/* The functions are implemented in C; give them C linkage when this header
 * is included from C++/CUDA sources. */
#ifdef __cplusplus
#define UTILS_LINKAGE "C"
#else
#define UTILS_LINKAGE
#endif

/* Returns sz zero-filled bytes; exits on sz == 0 or allocation failure. */
extern UTILS_LINKAGE void *Malloc(size_t sz);
/* Frees *ptr (if set) and resets it to NULL. */
extern UTILS_LINKAGE void Free(void **ptr);
/* realloc() wrapper; exits if a non-zero request cannot be satisfied. */
extern UTILS_LINKAGE void *Realloc(void *ptr, size_t sz);
/* fopen() wrapper; exits if the file cannot be opened. */
extern UTILS_LINKAGE FILE *Fopen(const char *path, const char *mode);
/* fwrite() wrapper; exits on a short write. */
extern UTILS_LINKAGE size_t Fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
/* fread() wrapper; exits on a short read with the stream error flag set. */
extern UTILS_LINKAGE size_t Fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
/* remove() wrapper; exits on any failure other than ENOENT. */
extern UTILS_LINKAGE int Remove(const char *pathname);
/* Returns the size reported by stat() for fpath; exits if stat() fails. */
extern UTILS_LINKAGE off_t getFsize(const char *fpath);
/* Returns the CLOCK_MONOTONIC time in seconds (0 on failure). */
extern UTILS_LINKAGE double Wtime(void);
42 | 
43 | #endif
44 | 


--------------------------------------------------------------------------------
/optimized/cuIsingModel/vmm_alloc.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * Permission is hereby granted, free of charge, to any person obtaining a
  5 |  * copy of this software and associated documentation files (the "Software"),
  6 |  * to deal in the Software without restriction, including without limitation
  7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8 |  * and/or sell copies of the Software, and to permit persons to whom the
  9 |  * Software is furnished to do so, subject to the following conditions:
 10 |  *
 11 |  * The above copyright notice and this permission notice shall be included in
 12 |  * all copies or substantial portions of the Software.
 13 |  *
 14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 20 |  * DEALINGS IN THE SOFTWARE.
 21 |  */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <mpi.h>
#include "vmm_alloc.h"
 27 | 
 28 | #define MIN(x,y) (((x)<(y))?(x):(y))
 29 | #define MAX(x,y) (((x)>(y))?(x):(y))
 30 | 
 31 | #define DIV_UP(a,b) (((a)+((b)-1))/(b))
 32 | 
 33 | #define MAX_DEVICE_NAME (256)
 34 | 
 35 | #define CHECK_CUDA(call) {                                                   \
 36 |     cudaError_t err = call;                                                    \
 37 |     if( cudaSuccess != err) {                                                \
 38 |         fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
 39 |                 __FILE__, __LINE__, cudaGetErrorString( err) );              \
 40 |         exit(EXIT_FAILURE);                                                  \
 41 |     }}
 42 | 
 43 | #define CHECK_CU(call) {                                                        \
 44 |     CUresult res = call;                                                        \
 45 |     if(CUDA_SUCCESS != res) {                                                   \
 46 | 	const char *errstr=NULL;                                                \
 47 | 	cuGetErrorName(res, &errstr);                                           \
 48 |         fprintf(stderr, "Cuda driver API error in file '%s' in line %d: %s.\n", \
 49 |                 __FILE__, __LINE__, errstr);                                    \
 50 |         exit(EXIT_FAILURE);                                                     \
 51 |     }}
 52 | 
 53 | static void *Malloc(size_t sz) {
 54 | 
 55 | 	void *ptr;
 56 | 
 57 | 	ptr = (void *)malloc(sz);
 58 | 	if (!ptr) {
 59 | 		fprintf(stderr, "Cannot allocate %zu bytes...\n", sz);
 60 | 		exit(EXIT_FAILURE);
 61 | 	}
 62 | 	return ptr;
 63 | }
 64 | 
 65 | size_t vmmFabricGranularity(int device) {
 66 | 
 67 | 	CUmemAllocationProp prop = {};
 68 | 
 69 | 	prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
 70 | 	prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
 71 | 	prop.location.id = device;
 72 | 
 73 | 	// necessary to export the handle for remote memory access via NVLink
 74 | 	prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
 75 | 
 76 | 	size_t granularity = 0;
 77 | 	CHECK_CU(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
 78 | 
 79 | 	return granularity;
 80 | }
 81 | 
 82 | // call to "allocate" physical memory (cuMemCreate() handle) on GPU "device"
 83 | // On entry size contains de desired size of the allocation; on exit the actual
 84 | // size, which must be a multiple of the granularity
 85 | static CUmemGenericAllocationHandle allocatePhysicalMemory(int device, size_t size) {
 86 | 
 87 | 	CUmemAllocationProp prop = {};
 88 | 
 89 | 	prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
 90 | 	prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
 91 | 	prop.location.id = device;
 92 | 
 93 | 	// necessary to export the handle for remote memory access via NVLink
 94 | 	prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
 95 | 
 96 | 	size_t granularity = 0;
 97 | 	CHECK_CU(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
 98 | 
 99 | 	if (size % granularity) {
100 | 	
101 | 		cudaDeviceProp props;
102 | 		CHECK_CUDA(cudaGetDeviceProperties(&props, device));
103 | 
104 | 		int nameLen;
105 | 		char procName[MPI_MAX_PROCESSOR_NAME];
106 | 		MPI_Get_processor_name(procName, &nameLen);	
107 | 	
108 | 		fprintf(stderr,
109 | 			"%s:%d: error, requested allocation size (%zu bytes) is "
110 | 			"not a multiple of minimum supported granularity (%zu bytes) "
111 | 			"for device %d (%s) on node %s!\n",
112 | 			__func__, __LINE__, size, granularity, device, props.name, procName);
113 | 		MPI_Abort(MPI_COMM_WORLD, 0);
114 | 	}
115 | 
116 | 	// Ensure size matches granularity requirements for the allocation
117 | 	//size_t padded_size = DIV_UP(size, granularity)*granularity;
118 | #if 0
119 | 	printf("%s:%d: device %d, padded_size: %zu\n", __func__, __LINE__, device, padded_size);
120 | #endif
121 | 	// Allocate physical memory
122 | 	CUmemGenericAllocationHandle allocHandle;
123 | 
124 | 	//printf("device: %d, size: %zu\n", device, size);
125 | 	CHECK_CU(cuMemCreate(&allocHandle, size, &prop, 0));
126 | 
127 | 	return allocHandle;
128 | }
129 | 
// Grants GPU "device" read/write access to the "size" bytes of virtual
// address space starting at "ptr" (single-descriptor cuMemSetAccess() call).
static void setAccessOnDevice(int device, CUdeviceptr ptr, size_t size) {

	CUmemAccessDesc desc = {};

	desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
	desc.location.id = device;
	desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;

	// Make the address accessible
	CHECK_CU(cuMemSetAccess(ptr, size, &desc, 1));
}
145 | 
// Allocates sizePerGpu bytes on every GPU visible to every rank of
// MPI_COMM_WORLD and maps all the allocations, in <rank, device> order, into
// a single Virtual Address range reserved on the calling process.
//
// Must be called by all ranks of MPI_COMM_WORLD (it uses collective
// operations) after MPI has been initialized.
//
// Parameters:
//   devPtr     - on return, *devPtr holds the base address of the range;
//   sizePerGpu - bytes to allocate on each GPU; must be a multiple of the
//                granularity returned by vmmFabricGranularity().
//
// Returns a heap-allocated context to be passed to vmmFabricFree().
//
// Any failed requirement (different GPU counts or GPU names across ranks,
// missing VMM or FABRIC handle support, bad size) aborts the MPI job with a
// non-zero error code; driver/runtime API failures exit the process.
vmmAllocCtx_t *vmmFabricMalloc(void **devPtr, size_t sizePerGpu) {

	int inited = 0;
	MPI_Initialized(&inited);

	if (!inited) {
		fprintf(stderr,
			"%s:%d: error, MPI must be initialized  before calling this function!\n",
			__func__, __LINE__);
		exit(EXIT_FAILURE);
	}

	int rank, ntask;

	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &ntask);

	// collect the processor names on rank 0 (used only in error messages)
	char (*procNames)[MPI_MAX_PROCESSOR_NAME] = (char (*)[MPI_MAX_PROCESSOR_NAME])Malloc(sizeof(*procNames)*ntask);
	// zero the table so that no uninitialized bytes are sent in the gather
	memset(procNames, 0, sizeof(*procNames)*ntask);

	int nameLen;
	MPI_Get_processor_name(procNames[rank], &nameLen);

	if (!rank) {
		// on the root procNames[0] IS the start of the receive buffer and
		// MPI forbids aliased send/receive buffers: gather in place
		MPI_Gather(MPI_IN_PLACE, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
			   procNames, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0, MPI_COMM_WORLD);
	} else {
		MPI_Gather(procNames[rank], MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
			   procNames, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0, MPI_COMM_WORLD);
	}

	int ndev = 0;
	CHECK_CUDA(cudaGetDeviceCount(&ndev));

	// bitwise OR and AND of all the counts coincide only if every rank
	// reported the same value
	int ndev_or;
	int ndev_and;
	MPI_Allreduce(&ndev, &ndev_or,  1, MPI_INT, MPI_BOR,  MPI_COMM_WORLD);
	MPI_Allreduce(&ndev, &ndev_and, 1, MPI_INT, MPI_BAND, MPI_COMM_WORLD);
	if (ndev_or != ndev_and) {
		if (!rank) {
			fprintf(stderr,
				"%s:%d: error, not all processes have the same number of GPUs!\n",
				__func__, __LINE__);
		}
		MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
	}

	// local GPUs
	cudaDeviceProp *props = (cudaDeviceProp *)Malloc(sizeof(*props)*ndev);
	for(int i = 0; i < ndev; i++) {
		CHECK_CUDA(cudaGetDeviceProperties(props+i, i));
	}

	// check local GPUs support
	for(int i = 0; i < ndev; i++) {

		int deviceSupportsVmm;
		CHECK_CU(cuDeviceGetAttribute(&deviceSupportsVmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, i));
		if (!deviceSupportsVmm) {
			fprintf(stderr,
				"%s:%d: error, device %d (%s) on node %s does NOT support Virtual Memory Management!\n",
				__func__, __LINE__, i, props[i].name, procNames[rank]);
			MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
		}

		// FOR FABRIC
		int deviceSupportsFabricMem;
		CHECK_CU(cuDeviceGetAttribute(&deviceSupportsFabricMem, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, i));
		if (deviceSupportsFabricMem == 0) {
			fprintf(stderr,
				"%s:%d: error, device %d (%s) on node %s does NOT support Fabric Handles!\n",
				__func__, __LINE__, i, props[i].name, procNames[rank]);
			MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
		}
	}

	// check that all GPUs are of the same kind (this may be relaxed)
	cudaDeviceProp *props_all = NULL;
	if (!rank) {
		props_all = (cudaDeviceProp *)Malloc(sizeof(*props)*ntask*ndev);
	}

	MPI_Datatype MPI_DEV_PROP;
	MPI_Type_contiguous(sizeof(cudaDeviceProp), MPI_BYTE, &MPI_DEV_PROP);
	MPI_Type_commit(&MPI_DEV_PROP);

	MPI_Gather(props, ndev, MPI_DEV_PROP, props_all, ndev, MPI_DEV_PROP, 0, MPI_COMM_WORLD);

	// the committed datatype is not needed anymore
	MPI_Type_free(&MPI_DEV_PROP);

	if (!rank) {
		// comparing each entry with its predecessor is enough to
		// establish that all the names are equal
		for(int i = 1; i < ntask*ndev; i++) {
			if (strncmp(props_all[i-1].name, props_all[i].name, MAX_DEVICE_NAME)) {
				fprintf(stderr,
					"%s:%d: error, device %d from proc %d (%s) and "
					"device %d from proc %d (%s) are different:\n"
					"\t%s\n\t%s\n",
					__func__, __LINE__,
					(i-1)%ndev, (i-1)/ndev, procNames[(i-1)/ndev],
					 i   %ndev,  i   /ndev, procNames[ i   /ndev],
					props_all[i-1].name, props_all[i].name);
				MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
			}
		}
	}
	free(props);
	free(props_all);

	// allocate local handles; the table has one slot per <rank, device>
	// pair, stored at index rank*ndev + device
	CUmemGenericAllocationHandle *handles = (CUmemGenericAllocationHandle *)Malloc(sizeof(*handles)*ntask*ndev);
	memset(handles, 0, sizeof(*handles)*ntask*ndev);

	for(int i = 0; i < ndev; i++) {
		handles[rank*ndev + i] = allocatePhysicalMemory(i, sizePerGpu);
	}

	// export local handles
	CUmemFabricHandle *fabricHandles = (CUmemFabricHandle *)Malloc(sizeof(*fabricHandles)*ntask*ndev);
	memset(fabricHandles, 0, sizeof(*fabricHandles)*ntask*ndev);
	for(int i = 0; i < ndev; i++) {
		CHECK_CU(cuMemExportToShareableHandle(&fabricHandles[ndev*rank + i],
						      handles[ndev*rank + i],
						      CU_MEM_HANDLE_TYPE_FABRIC, 0));
	}

	// distribute local handles: every rank contributes, in place, the ndev
	// entries of its own section of the table
	MPI_Datatype MPI_FABRIC_HANDLE;
	MPI_Type_contiguous(sizeof(CUmemFabricHandle), MPI_BYTE, &MPI_FABRIC_HANDLE);
	MPI_Type_commit(&MPI_FABRIC_HANDLE);

	MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, fabricHandles, ndev, MPI_FABRIC_HANDLE, MPI_COMM_WORLD);

	MPI_Type_free(&MPI_FABRIC_HANDLE);

	// import remote handles (the local ones are already in the table)
	for(int i = 0; i < ntask; i++) {
		if (i == rank) {
			continue;
		}
		for(int d = 0; d < ndev; d++) {
			CHECK_CU(cuMemImportFromShareableHandle(&handles[i*ndev + d],
							        &fabricHandles[i*ndev + d],
								CU_MEM_HANDLE_TYPE_FABRIC));
		}
	}
	// the exported handles are not used past this point
	free(fabricHandles);

	// create a (large) Virtual Address range and map local and remote handles
	const size_t totalSize = sizePerGpu*size_t(ntask)*size_t(ndev);

	CUdeviceptr cuptr;
	CHECK_CU(cuMemAddressReserve(&cuptr, totalSize, 0, 0, 0));

	for(int i = 0; i < ntask; i++) {
		for(int d = 0; d < ndev; d++) {
			// slot of <rank i, device d>, in units of sizePerGpu
			const size_t slot = size_t(i)*size_t(ndev) + size_t(d);
			CHECK_CU(cuMemMap(cuptr + slot*sizePerGpu,
					  sizePerGpu, 0, handles[slot], 0));
		}
	}

	// every local GPU gets read/write access to the whole range
	for(int d = 0; d < ndev; d++) {
		setAccessOnDevice(d, cuptr, totalSize);
	}

	free(procNames);

	vmmAllocCtx_t *ctx = (vmmAllocCtx_t *)Malloc(sizeof(*ctx));

	ctx->cuptr = cuptr;
	ctx->virtAddrRangeSize = totalSize;

	ctx->handles = handles;

	*devPtr = (void *)cuptr;

	return ctx;
}
313 | 
// Tears down an allocation created by vmmFabricMalloc(): unmaps the whole
// Virtual Address range, releases every handle of the <rank, device> table,
// frees the address reservation and finally the context itself.
//
// The table size is recomputed from the current device count and the size of
// MPI_COMM_WORLD. A NULL ctx is a no-op.
void vmmFabricFree(vmmAllocCtx_t *ctx) {

	// nothing to release
	if (!ctx) {
		return;
	}

	int ndev = 0;
	CHECK_CUDA(cudaGetDeviceCount(&ndev));

	int ntask;
	MPI_Comm_size(MPI_COMM_WORLD, &ntask);

	CHECK_CU(cuMemUnmap(ctx->cuptr, ctx->virtAddrRangeSize));

	// local and imported handles are released alike
	for(int i = 0; i < ntask*ndev; i++) {
		CHECK_CU(cuMemRelease(ctx->handles[i]));
	}
	CHECK_CU(cuMemAddressFree(ctx->cuptr, ctx->virtAddrRangeSize));

	free(ctx->handles);
	free(ctx);

	return;
}
338 | 


--------------------------------------------------------------------------------
/optimized/cuIsingModel/vmm_alloc.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Permission is hereby granted, free of charge, to any person obtaining a
 5 |  * copy of this software and associated documentation files (the "Software"),
 6 |  * to deal in the Software without restriction, including without limitation
 7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 8 |  * and/or sell copies of the Software, and to permit persons to whom the
 9 |  * Software is furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 |  * DEALINGS IN THE SOFTWARE.
21 |  */
22 | #ifndef __VMM_ALLOC_H__
23 | #define __VMM_ALLOC_H__
24 | 
// Bookkeeping for one allocation made by vmmFabricMalloc(); it is returned
// to the caller and consumed by vmmFabricFree().
typedef struct {

	// base address of the reserved Virtual Address range
	CUdeviceptr cuptr;
	// size in bytes of the reserved Virtual Address range
	size_t virtAddrRangeSize;

	// heap-allocated table of allocation handles, one per <rank, device>
	// pair, stored at index rank*ndev + device
	CUmemGenericAllocationHandle *handles;

} vmmAllocCtx_t;
33 | 
#ifdef __cplusplus
extern "C" {
#endif

// Helper to obtain the minimum allocation size (granularity, in bytes) for
// fabric allocations on GPU "device".
size_t vmmFabricGranularity(int device);

// Allocates sizePerGpu bytes on each device
// visible to each MPI rank and returns to
// each caller, in *devPtr, the starting
// address of a Virtual Address range to which
// all the allocations are mapped. Mappings are
// performed in Rank,DeviceId order:
//
// <Rank   0, Device 0><Rank   0, Device 1>...<Rank   0, Device N-1>
// <Rank   1, Device 0><Rank   1, Device 1>...<Rank   1, Device N-1>
// ...
// <Rank M-1, Device 0><Rank M-1, Device 1>...<Rank M-1, Device N-1>
//
// Remote memories are accessed via FABRIC handles.
//
// Requirements:
//   * MPI must be initialized and all the ranks of MPI_COMM_WORLD must
//     call the function (it uses collective operations);
//   * all ranks must have access to the same number of GPUs;
//   * all the GPUs must be the same type;
//   * sizePerGpu must be a multiple of vmmFabricGranularity().
//
// The returned context must be released with vmmFabricFree().
vmmAllocCtx_t *vmmFabricMalloc(void **devPtr, size_t sizePerGpu);

// Unmaps and releases everything allocated by vmmFabricMalloc() for "ctx",
// and frees "ctx" itself.
void vmmFabricFree(vmmAllocCtx_t *ctx);

#ifdef __cplusplus
}
#endif
65 | 
66 | #endif
67 | 


--------------------------------------------------------------------------------
/optimized/old/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_HOME=/usr/local/cuda
 2 | CUDACC=$(CUDA_HOME)/bin/nvcc
 3 | CC=gcc
 4 | LD=$(CUDACC)
 5 | CFLAGS=-c -O3 -g -I$(CUDA_HOME)/include
 6 | CUDACFLAGS=-c -O3 --use_fast_math -lineinfo -arch=sm_70 -Xptxas=-v
 7 | LDFLAGS= -Xcompiler=-fopenmp
 8 | .PHONY: all clean
 9 | all: cuIsing
10 | # Link step: LD is nvcc, so the objects are linked through the CUDA toolchain.
11 | cuIsing: main.o utils.o
12 | 	$(LD) -o cuIsing main.o utils.o $(LDFLAGS)
13 | # CUDA sources; the output name is explicit, as in the C rule below.
14 | %.o: %.cu
15 | 	$(CUDACC) $(CUDACFLAGS) $< -o $@
16 | # Plain C sources are compiled with $(CC).
17 | %.o: %.c
18 | 	$(CC) $(CFLAGS) $< -o $@
19 | # -f: do not fail when there is nothing to remove.
20 | clean:
21 | 	rm -f *.o cuIsing
22 | 


--------------------------------------------------------------------------------
/optimized/old/README.md:
--------------------------------------------------------------------------------
  1 | # Optimized CUDA implementation
  2 | 
  3 | To compile the code, simply adjust the Makefile to point to your CUDA
  4 | installation and specify the CUDA architecture you want to compile for. A
  5 | simple `make` should be enough to produce the ``cuIsing`` binary.
  6 | 
  7 | ## Usage
  8 | 
  9 | <PRE>
 10 | Usage: cuIsing [options]
 11 | options:
 12 |         -x|--x &lt;HORIZ_DIM&gt;
 13 |                 Specifies the horizontal dimension of the entire  lattice  (black+white  spins),
 14 |                 per GPU. This dimension must be a multiple of 2048.
 15 | 
 16 |         -y|--y &lt;VERT_DIM&gt;
 17 |                 Specifies the vertical dimension of the entire lattice (black+white spins),  per
 18 |                 GPU. This dimension must be a multiple of 16.
 19 | 
 20 |         -n|--n &lt;NSTEPS&gt;
 21 |                 Specifies the number of iterations to run.
 22 |                 Default: 1
 23 | 
 24 |         -d|--devs &lt;NUM_DEVICES&gt;
 25 |                 Specifies the number of GPUs to use. Will use devices with ids [0, NUM_DEVICES-1].
 26 |                 Default: 1.
 27 | 
 28 |         -s|--seed &lt;SEED&gt;
 29 |                 Specifies the seed used to generate random numbers.
 30 |                 Default: 463463564571
 31 | 
 32 |         -a|--alpha &lt;ALPHA&gt;
 33 |                 Specifies the temperature in T_CRIT units.  If both this  option  and  '-t'  are
 34 |                 specified then the '-t' option is used.
 35 |                 Default: 0.100000
 36 | 
 37 |         -t|--temp &lt;TEMP&gt;
 38 |                 Specifies the temperature in absolute units.  If both this option and  '-a'  are
 39 |                 specified then this option is used.
 40 |                 Default: 0.226919
 41 | 
 42 |         -p|--print &lt;STAT_FREQ&gt;
 43 |                 Specifies the frequency, in number of iterations, with which  the  magnetization
 44 |                 statistics are printed.  If this option is used together with the  '-e'  option,
 45 |                 this option is ignored.
 46 |                 Default: only at the beginning and at end of the simulation
 47 | 
 48 |         -e|--exppr
 49 |                 Prints the magnetization at time steps in the series 0 &lt;= 2^(x/4) &lt; NSTEPS.   If
 50 |                 this option is used together with  the  '-p'  option,  the  latter  is  ignored.
 51 |                 Default: disabled
 52 | 
 53 |         -c|--corr
 54 |                 Dumps to a  file  named  corr_{X}x{Y}_T_{TEMP}  the  correlation  of each  point
 55 |                 with the  128 points on the right and below.  The correlation is computed  every
 56 |                 time the magnetization is printed on screen (based on either the  '-p'  or  '-e'
 57 |                 option) and it is written in the file one line per measure.
 58 |                 Default: disabled
 59 | 
 60 |         -m|--magn &lt;TGT_MAGN&gt;
 61 |                 Specifies the magnetization value at which the simulation is  interrupted.   The
 62 |                 magnetization of the system is checked against TGT_MAGN every STAT_FREQ, if  the
 63 |                 '-p' option is specified, or according to the exponential  timestep  series,  if
 64 |                 the '-e' option is specified.  If neither '-p' nor '-e' is specified, then  this
 65 |                 option is ignored.
 66 |                 Default: unset
 67 | 
 68 |         -J|--J &lt;PROB&gt;
 69 |                 Specifies the probability [0.0-1.0] that links  connecting  any  two  spins  are
 70 |                 anti-ferromagnetic. 
 71 |                 Default: 0.0
 72 | 
 73 |            --xsl &lt;HORIZ_SUB_DIM&gt;
 74 |                 Specifies the horizontal dimension of each sub-lattice (black+white spins),  per
 75 |                 GPU.  This dimension must be a divisor of the horizontal dimension of the entire
 76 |                 lattice per  GPU  (specified  with  the  '-x'  option) and a multiple of 2048.
 77 |                 Default: sub-lattices are disabled.
 78 | 
 79 |            --ysl &lt;VERT_SUB_DIM&gt;
 80 |                 Specifies the vertical  dimension of each  sub-lattice (black+white spins),  per
 81 |                 GPU.  This dimension must be a divisor of the vertical dimension of  the  entire
 82 |                 lattice per  GPU  (specified  with  the  '-y'  option) and a multiple of 16.
 83 | 
 84 |         -o|--o
 85 |                 Enables the file dump of  the lattice  every time  the magnetization is printed.
 86 |                 Default: off
 87 | </PRE>
 88 | 
 89 | For example, to run 128 update steps on a 65536^2 lattice using two V100 GPUs
 90 | connected via NVLink and printing the magnetization every 16 steps:
 91 | 
 92 | <PRE>
 93 | # 2xV100
 94 | $ ./cuIsing -y 32768 -x 65536 -n 128 -p 16 -d 2 -t 1.5  
 95 | 
 96 | Using GPUs:
 97 |          0 (Tesla V100-DGXS-16GB, 80 SMs, 2048 th/SM max, CC 7.0, ECC on)
 98 |          1 (Tesla V100-DGXS-16GB, 80 SMs, 2048 th/SM max, CC 7.0, ECC on)
 99 | 
100 | GPUs direct access matrix:
101 |           0   1
102 | GPU  0:   V   V
103 | GPU  1:   V   V
104 | 
105 | Run configuration:
106 |         spin/word: 16
107 |         spins: 4294967296
108 |         seed: 463463564571
109 |         iterations: 128
110 |         block (X, Y): 16, 16
111 |         tile  (X, Y): 32, 16
112 |         grid  (X, Y): 32, 2048
113 |         print magn. every 16 steps
114 |         temp: 1.500000 (0.661030*T_crit)
115 |         temp update not set
116 |         not using Hamiltonian buffer
117 | 
118 |         local lattice size:         32768 x    65536
119 |         total lattice size:         65536 x    65536
120 |         local lattice shape: 2 x    32768 x     2048 (   134217728 ulls)
121 |         total lattice shape: 2 x    65536 x     2048 (   268435456 ulls)
122 |         memory: 2048.00 MB (1024.00 MB per GPU)
123 | 
124 | Setting up multi-gpu configuration:
125 |         GPU  0 done
126 |         GPU  1 done
127 | 
128 | Initial magnetization:  0.000000, up_s:   2147484090, dw_s:   2147483206
129 |         magnetization:  0.000043, up_s:   2147575418, dw_s:   2147391878 (iter:       16)
130 |         magnetization:  0.000074, up_s:   2147641872, dw_s:   2147325424 (iter:       32)
131 |         magnetization:  0.000057, up_s:   2147605659, dw_s:   2147361637 (iter:       48)
132 |         magnetization:  0.000101, up_s:   2147701147, dw_s:   2147266149 (iter:       64)
133 |         magnetization:  0.000035, up_s:   2147558546, dw_s:   2147408750 (iter:       80)
134 |         magnetization:  0.000006, up_s:   2147471275, dw_s:   2147496021 (iter:       96)
135 |         magnetization:  0.000060, up_s:   2147612509, dw_s:   2147354787 (iter:      112)
136 |         magnetization:  0.000091, up_s:   2147678887, dw_s:   2147288409 (iter:      128)
137 | Final   magnetization:  0.000091, up_s:   2147678887, dw_s:   2147288409 (iter:      128)
138 | 
139 | Kernel execution time for 128 update steps: 7.174555E+02 ms, 766.26 flips/ns (BW: 1150.32 GB/s)
140 | 
141 | </PRE>
142 | 
143 | Or, to run concurrently 1024 independent sub-lattices of size 2048^2 using two
144 | V100 GPUs connected via NVLink and printing the magnetization every 16 steps:
145 | 
146 | <PRE>
147 | # 2xV100
148 | $ ./cuIsing -y 32768 -x 65536 -n 128 -p 16 -d 2 -t 1.5 --xsl 2048 --ysl 2048
149 | 
150 | Using GPUs:
151 |          0 (Tesla V100-DGXS-16GB, 80 SMs, 2048 th/SM max, CC 7.0, ECC on)
152 |          1 (Tesla V100-DGXS-16GB, 80 SMs, 2048 th/SM max, CC 7.0, ECC on)
153 | 
154 | GPUs direct access matrix:
155 |           0   1
156 | GPU  0:   V   V
157 | GPU  1:   V   V
158 | 
159 | Run configuration:
160 |         spin/word: 16
161 |         spins: 4294967296
162 |         seed: 463463564571
163 |         iterations: 128
164 |         block (X, Y): 16, 16
165 |         tile  (X, Y): 32, 16
166 |         grid  (X, Y): 32, 2048
167 |         print magn. every 16 steps
168 |         temp: 1.500000 (0.661030*T_crit)
169 |         temp update not set
170 |         not using Hamiltonian buffer
171 | 
172 |         using sub-lattices:
173 |                 no. of sub-lattices per GPU:      512
174 |                 no. of sub-lattices (total):     1024
175 |                 sub-lattices size:              2048 x    2048
176 | 
177 |         local lattice size:         32768 x    65536
178 |         total lattice size:         65536 x    65536
179 |         local lattice shape: 2 x    32768 x     2048 (   134217728 ulls)
180 |         total lattice shape: 2 x    65536 x     2048 (   268435456 ulls)
181 |         memory: 2048.00 MB (1024.00 MB per GPU)
182 | 
183 | Setting up multi-gpu configuration:
184 |         GPU  0 done
185 |         GPU  1 done
186 | 
187 | Initial magnetization:  0.000000, up_s:   2147484090, dw_s:   2147483206
188 |         magnetization:  0.000052, up_s:   2147594634, dw_s:   2147372662 (iter:       16)
189 |         magnetization:  0.000069, up_s:   2147631783, dw_s:   2147335513 (iter:       32)
190 |         magnetization:  0.000031, up_s:   2147550893, dw_s:   2147416403 (iter:       48)
191 |         magnetization:  0.000068, up_s:   2147630364, dw_s:   2147336932 (iter:       64)
192 |         magnetization:  0.000008, up_s:   2147500244, dw_s:   2147467052 (iter:       80)
193 |         magnetization:  0.000059, up_s:   2147357073, dw_s:   2147610223 (iter:       96)
194 |         magnetization:  0.000000, up_s:   2147482936, dw_s:   2147484360 (iter:      112)
195 |         magnetization:  0.000010, up_s:   2147461873, dw_s:   2147505423 (iter:      128)
196 | Final   magnetization:  0.000010, up_s:   2147461873, dw_s:   2147505423 (iter:      128)
197 | 
198 | Kernel execution time for 128 update steps: 7.147521E+02 ms, 769.16 flips/ns (BW: 1154.67 GB/s)
199 | </PRE>
200 | 
201 | To run 128 update steps with a 65536x65536 lattice per GPU (131072x65536 in total on 2 GPUs,
202 | 524288x65536 on 8) using A100 GPUs connected via NVLink and printing the magnetization every 16 steps:
203 | 
204 | <PRE>
205 | # 2xA100
206 | $ ./cuIsing -y $((32*2048)) -x $((32*2048)) -n 128 -p 16 -d 2 -t 1.5
207 | 
208 | Using GPUs:
209 |          0 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
210 |          1 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
211 | 
212 | GPUs direct access matrix:
213 |           0   1
214 | GPU  0:   V   V
215 | GPU  1:   V   V
216 | 
217 | Run configuration:
218 |         spin/word: 16
219 |         spins: 8589934592
220 |         seed: 463463564571
221 |         iterations: 128
222 |         block (X, Y): 16, 16
223 |         tile  (X, Y): 32, 16
224 |         grid  (X, Y): 32, 4096
225 |         print magn. every 16 steps
226 |         temp: 1.500000 (0.661030*T_crit)
227 |         temp update not set
228 |         not using Hamiltonian buffer
229 | 
230 |         local lattice size:         65536 x    65536
231 |         total lattice size:        131072 x    65536
232 |         local lattice shape: 2 x    65536 x     2048 (   268435456 ulls)
233 |         total lattice shape: 2 x   131072 x     2048 (   536870912 ulls)
234 |         memory: 4096.00 MB (2048.00 MB per GPU)
235 | 
236 | Setting up multi-gpu configuration:
237 |         GPU  0 done
238 |         GPU  1 done
239 | 
240 | Initial magnetization:  0.000005, up_s:   4294989182, dw_s:   4294945410
241 |         magnetization:  0.000082, up_s:   4294617248, dw_s:   4295317344 (iter:       16)
242 |         magnetization:  0.000249, up_s:   4293898346, dw_s:   4296036246 (iter:       32)
243 |         magnetization:  0.000503, up_s:   4292806461, dw_s:   4297128131 (iter:       48)
244 |         magnetization:  0.000725, up_s:   4291852263, dw_s:   4298082329 (iter:       64)
245 |         magnetization:  0.000904, up_s:   4291086016, dw_s:   4298848576 (iter:       80)
246 |         magnetization:  0.001097, up_s:   4290256223, dw_s:   4299678369 (iter:       96)
247 |         magnetization:  0.001245, up_s:   4289621029, dw_s:   4300313563 (iter:      112)
248 |         magnetization:  0.001418, up_s:   4288877118, dw_s:   4301057474 (iter:      128)
249 | Final   magnetization:  0.001418, up_s:   4288877118, dw_s:   4301057474 (iter:      128)
250 | 
251 | Kernel execution time for 128 update steps: 1.055835E+03 ms, 1041.37 flips/ns (BW: 1563.32 GB/s)
252 | 
253 | 
254 | # 8xA100
255 | $ ./cuIsing -y $((32*2048)) -x $((32*2048)) -n 128 -p 16 -d 8 -t 1.5
256 | 
257 | Using GPUs:
258 |          0 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
259 |          1 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
260 |          2 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
261 |          3 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
262 |          4 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
263 |          5 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
264 |          6 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
265 |          7 (NVIDIA A100-SXM4-80GB, 108 SMs, 2048 th/SM max, CC 8.0, ECC on)
266 | 
267 | GPUs direct access matrix:
268 |           0   1   2   3   4   5   6   7
269 | GPU  0:   V   V   V   V   V   V   V   V
270 | GPU  1:   V   V   V   V   V   V   V   V
271 | GPU  2:   V   V   V   V   V   V   V   V
272 | GPU  3:   V   V   V   V   V   V   V   V
273 | GPU  4:   V   V   V   V   V   V   V   V
274 | GPU  5:   V   V   V   V   V   V   V   V
275 | GPU  6:   V   V   V   V   V   V   V   V
276 | GPU  7:   V   V   V   V   V   V   V   V
277 | 
278 | Run configuration:
279 |         spin/word: 16
280 |         spins: 34359738368
281 |         seed: 463463564571
282 |         iterations: 128
283 |         block (X, Y): 16, 16
284 |         tile  (X, Y): 32, 16
285 |         grid  (X, Y): 32, 4096
286 |         print magn. every 16 steps
287 |         temp: 1.500000 (0.661030*T_crit)
288 |         temp update not set
289 |         not using Hamiltonian buffer
290 | 
291 |         local lattice size:         65536 x    65536
292 |         total lattice size:        524288 x    65536
293 |         local lattice shape: 2 x    65536 x     2048 (   268435456 ulls)
294 |         total lattice shape: 2 x   524288 x     2048 (  2147483648 ulls)
295 |         memory: 16384.00 MB (2048.00 MB per GPU)
296 | 
297 | Setting up multi-gpu configuration:
298 |         GPU  0 done
299 |         GPU  1 done
300 |         GPU  2 done
301 |         GPU  3 done
302 |         GPU  4 done
303 |         GPU  5 done
304 |         GPU  6 done
305 |         GPU  7 done
306 | 
307 | Initial magnetization:  0.000010, up_s:  17179689306, dw_s:  17180049062
308 |         magnetization:  0.000203, up_s:  17176389528, dw_s:  17183348840 (iter:       16)
309 |         magnetization:  0.000402, up_s:  17172963073, dw_s:  17186775295 (iter:       32)
310 |         magnetization:  0.000539, up_s:  17170610910, dw_s:  17189127458 (iter:       48)
311 |         magnetization:  0.000642, up_s:  17168843228, dw_s:  17190895140 (iter:       64)
312 |         magnetization:  0.000749, up_s:  17167009008, dw_s:  17192729360 (iter:       80)
313 |         magnetization:  0.000865, up_s:  17165014291, dw_s:  17194724077 (iter:       96)
314 |         magnetization:  0.000941, up_s:  17163708078, dw_s:  17196030290 (iter:      112)
315 |         magnetization:  0.001023, up_s:  17162287230, dw_s:  17197451138 (iter:      128)
316 | Final   magnetization:  0.001023, up_s:  17162287230, dw_s:  17197451138 (iter:      128)
317 | 
318 | Kernel execution time for 128 update steps: 1.063368E+03 ms, 4135.96 flips/ns (BW: 6205.20 GB/s)
319 | </PRE>
320 | 
321 | To run 128 update steps with a 65536x65536 lattice per GPU (131072x65536 in total on 2 GPUs,
322 | 524288x65536 on 8) using H100 GPUs connected via NVLink and printing the magnetization every 16 steps:
323 | 
324 | <PRE>
325 | 
326 | # 2xH100
327 | $ ./cuIsing -y $((32*2048)) -x $((32*2048)) -n 128 -p 16 -d 2 -t 1.5
328 | 
329 | Using GPUs:
330 |          0 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
331 |          1 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
332 | 
333 | GPUs direct access matrix:
334 |           0   1
335 | GPU  0:   V   V
336 | GPU  1:   V   V
337 | 
338 | Run configuration:
339 |         spin/word: 16
340 |         spins: 8589934592
341 |         seed: 463463564571
342 |         iterations: 128
343 |         block (X, Y): 16, 16
344 |         tile  (X, Y): 32, 16
345 |         grid  (X, Y): 32, 4096
346 |         print magn. every 16 steps
347 |         temp: 1.500000 (0.661030*T_crit)
348 |         temp update not set
349 |         not using Hamiltonian buffer
350 | 
351 |         local lattice size:         65536 x    65536
352 |         total lattice size:        131072 x    65536
353 |         local lattice shape: 2 x    65536 x     2048 (   268435456 ulls)
354 |         total lattice shape: 2 x   131072 x     2048 (   536870912 ulls)
355 |         memory: 4096.00 MB (2048.00 MB per GPU)
356 | 
357 | Setting up multi-gpu configuration:
358 |         GPU  0 done
359 |         GPU  1 done
360 | 
361 | Initial magnetization:  0.000005, up_s:   4294989182, dw_s:   4294945410
362 |         magnetization:  0.000082, up_s:   4294617248, dw_s:   4295317344 (iter:       16)
363 |         magnetization:  0.000249, up_s:   4293898346, dw_s:   4296036246 (iter:       32)
364 |         magnetization:  0.000503, up_s:   4292806461, dw_s:   4297128131 (iter:       48)
365 |         magnetization:  0.000725, up_s:   4291852263, dw_s:   4298082329 (iter:       64)
366 |         magnetization:  0.000904, up_s:   4291086016, dw_s:   4298848576 (iter:       80)
367 |         magnetization:  0.001097, up_s:   4290256223, dw_s:   4299678369 (iter:       96)
368 |         magnetization:  0.001245, up_s:   4289621029, dw_s:   4300313563 (iter:      112)
369 |         magnetization:  0.001418, up_s:   4288877118, dw_s:   4301057474 (iter:      128)
370 | Final   magnetization:  0.001418, up_s:   4288877118, dw_s:   4301057474 (iter:      128)
371 | 
372 | Kernel execution time for 128 update steps: 6.105666E+02 ms, 1800.81 flips/ns (BW: 2703.41 GB/s)
373 | 
374 | # 8xH100
375 | $ ./cuIsing -y $((32*2048)) -x $((32*2048)) -n 128 -p 16 -d 8 -t 1.5
376 | 
377 | Using GPUs:
378 |          0 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
379 |          1 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
380 |          2 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
381 |          3 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
382 |          4 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
383 |          5 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
384 |          6 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
385 |          7 (NVIDIA H100 80GB HBM3, 132 SMs, 2048 th/SM max, CC 9.0, ECC on)
386 | 
387 | GPUs direct access matrix:
388 |           0   1   2   3   4   5   6   7
389 | GPU  0:   V   V   V   V   V   V   V   V
390 | GPU  1:   V   V   V   V   V   V   V   V
391 | GPU  2:   V   V   V   V   V   V   V   V
392 | GPU  3:   V   V   V   V   V   V   V   V
393 | GPU  4:   V   V   V   V   V   V   V   V
394 | GPU  5:   V   V   V   V   V   V   V   V
395 | GPU  6:   V   V   V   V   V   V   V   V
396 | GPU  7:   V   V   V   V   V   V   V   V
397 | 
398 | Run configuration:
399 |         spin/word: 16
400 |         spins: 34359738368
401 |         seed: 463463564571
402 |         iterations: 128
403 |         block (X, Y): 16, 16
404 |         tile  (X, Y): 32, 16
405 |         grid  (X, Y): 32, 4096
406 |         print magn. every 16 steps
407 |         temp: 1.500000 (0.661030*T_crit)
408 |         temp update not set
409 |         not using Hamiltonian buffer
410 | 
411 |         local lattice size:         65536 x    65536
412 |         total lattice size:        524288 x    65536
413 |         local lattice shape: 2 x    65536 x     2048 (   268435456 ulls)
414 |         total lattice shape: 2 x   524288 x     2048 (  2147483648 ulls)
415 |         memory: 16384.00 MB (2048.00 MB per GPU)
416 | 
417 | Setting up multi-gpu configuration:
418 |         GPU  0 done
419 |         GPU  1 done
420 |         GPU  2 done
421 |         GPU  3 done
422 |         GPU  4 done
423 |         GPU  5 done
424 |         GPU  6 done
425 |         GPU  7 done
426 | 
427 | Initial magnetization:  0.000010, up_s:  17179689306, dw_s:  17180049062
428 |         magnetization:  0.000203, up_s:  17176389528, dw_s:  17183348840 (iter:       16)
429 |         magnetization:  0.000402, up_s:  17172963073, dw_s:  17186775295 (iter:       32)
430 |         magnetization:  0.000539, up_s:  17170610910, dw_s:  17189127458 (iter:       48)
431 |         magnetization:  0.000642, up_s:  17168843228, dw_s:  17190895140 (iter:       64)
432 |         magnetization:  0.000749, up_s:  17167009008, dw_s:  17192729360 (iter:       80)
433 |         magnetization:  0.000865, up_s:  17165014291, dw_s:  17194724077 (iter:       96)
434 |         magnetization:  0.000941, up_s:  17163708078, dw_s:  17196030290 (iter:      112)
435 |         magnetization:  0.001023, up_s:  17162287230, dw_s:  17197451138 (iter:      128)
436 | Final   magnetization:  0.001023, up_s:  17162287230, dw_s:  17197451138 (iter:      128)
437 | 
438 | Kernel execution time for 128 update steps: 6.158027E+02 ms, 7141.97 flips/ns (BW: 10715.14 GB/s)
439 | </PRE>
440 | 
441 | ## Visualizing results
442 | 
443 | Running the code with the '-o' option enables the lattice dump at every timestep in which the
444 | magnetization is printed on screen (as determined by the '-p' or '-e' option). The file name
445 | has the following format:
446 | 
447 | <PRE>
448 | lattice_&lt;LOCAL_Y&gt;x&lt;LOCAL_X&gt;_T_&lt;TEMP&gt;_IT_&lt;IT_NUMBER&gt;_&lt;GPU_ID&gt;.txt
449 | </PRE>
450 | 
451 | The included `plotLattice.py` script can be used to create an image from those output files. For example,
452 | the following command:
453 | 
454 | <PRE>
455 | $ ./plotLattice.py lattice_8192x8192_T_1.500000_IT_00001024_0.txt
456 | </PRE>
457 | 
458 | will generate an image file named `lattice_8192x8192_T_1.500000_IT_00001024_0.txt.png` like:
459 | 
460 | ![image_1](images/lattice_8192x8192_T_1.500000_IT_00001024_0.txt.png)
461 | 
462 | ## Contacts
463 | 
464 | For comments, questions or anything related, write to Mauro Bisson at maurob@nvidia.com.
465 | 
466 | 


--------------------------------------------------------------------------------
/optimized/old/cudamacro.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Permission is hereby granted, free of charge, to any person obtaining a
 5 |  * copy of this software and associated documentation files (the "Software"),
 6 |  * to deal in the Software without restriction, including without limitation
 7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 8 |  * and/or sell copies of the Software, and to permit persons to whom the
 9 |  * Software is furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 |  * DEALINGS IN THE SOFTWARE.
21 |  */
22 | #ifndef __CUDA_MACRO_H__
23 | #define __CUDA_MACRO_H__
24 | /* Evaluates `call`; unless the result is cudaSuccess, prints file, line and cudaGetErrorString() to stderr and exits with EXIT_FAILURE. Expands to a bare { } block; this header includes nothing itself, so fprintf/exit must already be declared by the includer. */
25 | #define CHECK_CUDA(call) {                                                   \
26 |     cudaError_t err = call;                                                  \
27 |     if( cudaSuccess != err) {                                                \
28 |         fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
29 |                 __FILE__, __LINE__, cudaGetErrorString( err) );              \
30 |         exit(EXIT_FAILURE);                                                  \
31 |     }}
32 | /* Reads cudaGetLastError(); on any result other than cudaSuccess, prints errorMessage plus file, line and cudaGetErrorString() to stderr and exits with EXIT_FAILURE. Expands to a bare { } block. */
33 | #define CHECK_ERROR(errorMessage) {                                          \
34 |     cudaError_t err = cudaGetLastError();                                    \
35 |     if( cudaSuccess != err) {                                                \
36 |         fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
37 |                 errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
38 |         exit(EXIT_FAILURE);                                                  \
39 |     }}
40 | #endif
41 | 


--------------------------------------------------------------------------------
/optimized/old/images/lattice_8192x8192_T_1.500000_IT_00001024_0.txt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ising-gpu/935796d7a26016670363af7a0dced8a9ebcd4714/optimized/old/images/lattice_8192x8192_T_1.500000_IT_00001024_0.txt.png


--------------------------------------------------------------------------------
/optimized/old/plotLattice.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import numpy as np
 5 | from matplotlib import pyplot as plt
 6 | 
 7 | data = []
 8 | f=open(sys.argv[1])
 9 | for l in f:
10 |     data.append([int(c) for c in l.strip(" \n\r")])
11 | 
12 | print len(data), 'x', len(data[0])
13 | 
14 | plt.imshow(data, interpolation='nearest')
15 | 
16 | outFile = sys.argv[1]+".png"
17 | plt.savefig(outFile)
18 | 


--------------------------------------------------------------------------------
/optimized/old/utils.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * Mauro Bisson <maurob@nvidia.com>
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining a
  7 |  * copy of this software and associated documentation files (the "Software"),
  8 |  * to deal in the Software without restriction, including without limitation
  9 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 10 |  * and/or sell copies of the Software, and to permit persons to whom the
 11 |  * Software is furnished to do so, subject to the following conditions:
 12 |  *
 13 |  * The above copyright notice and this permission notice shall be included in
 14 |  * all copies or substantial portions of the Software.
 15 |  *
 16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 19 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 21 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 22 |  * DEALINGS IN THE SOFTWARE.
 23 |  */
 24 | #include <errno.h>
 25 | #include <stdio.h>
 26 | #include <stdlib.h>
 27 | #include <string.h>
 28 | #include <sys/types.h>
 29 | #include <sys/stat.h>
 30 | #include <unistd.h>
 31 | #include <time.h>
 32 | 
 33 | void *Malloc(size_t sz) {
 34 | 	/* Returns sz zero-filled bytes; terminates the program if sz is 0 or malloc() fails. */
 35 | 	void *ptr;
 36 | 
 37 | 	if (!sz) {
 38 | 		fprintf(stderr, "Allocating zero bytes...\n"); /* stderr, like the other diagnostics in this file */
 39 | 		exit(EXIT_FAILURE);
 40 | 	}
 41 | 	ptr = (void *)malloc(sz);
 42 | 	if (!ptr) {
 43 | 		fprintf(stderr, "Cannot allocate %zu bytes...\n", sz);
 44 | 		exit(EXIT_FAILURE);
 45 | 	}
 46 | 	memset(ptr, 0, sz);
 47 | 	return ptr;
 48 | }
 49 | 
 50 | void Free(void **ptr) {
 51 | 	/* Releases the buffer *ptr refers to and resets *ptr to NULL; does nothing if *ptr is already NULL. */
 52 | 	void *buf = *ptr;
 53 | 
 54 | 	if (buf == NULL) return;
 55 | 	free(buf);
 56 | 	*ptr = NULL;
 57 | }
 58 | 
 59 | void *Realloc(void *ptr, size_t sz) {
 60 | 	/* realloc() wrapper: exits on failure when sz > 0; a zero-size request is only warned about. */
 61 | 	void *lp;
 62 | 
 63 | 	if (!sz) {
 64 | 		fprintf(stderr, "Re-allocating to zero bytes, are you sure you want this?\n");
 65 | 	}
 66 | 	lp = (void *)realloc(ptr, sz);
 67 | 	if (!lp && sz) { /* a NULL result counts as an error only when sz > 0 */
 68 | 		fprintf(stderr, "Cannot reallocate to %zu bytes...\n", sz);
 69 | 		exit(EXIT_FAILURE);
 70 | 	}
 71 | 	return lp;
 72 | }
 73 | 
 74 | FILE *Fopen(const char *path, const char *mode) {
 75 | 	/* fopen() that never returns NULL: a failed open is reported on stderr and ends the program. */
 76 | 	FILE *stream = fopen(path, mode);
 77 | 
 78 | 	if (stream != NULL) {
 79 | 		return stream;
 80 | 	}
 81 | 	fprintf(stderr, "Cannot open file %s...\n", path);
 82 | 	exit(EXIT_FAILURE);
 83 | }
 84 | 
 85 | size_t Fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) {
 86 | 	/* fwrite() wrapper: a short write (fewer than nmemb items) is fatal. */
 87 | 	const size_t written = fwrite(ptr, size, nmemb, stream);
 88 | 
 89 | 	if (written >= nmemb) {
 90 | 		return written;
 91 | 	}
 92 | 
 93 | 	fprintf(stderr, "Error while writing to file!\n");
 94 | 	exit(EXIT_FAILURE);
 95 | }
 96 | 
 97 | size_t Fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
 98 | 	/* fread() wrapper: a short read is fatal only when the stream's error indicator is set. */
 99 | 	const size_t got = fread(ptr, size, nmemb, stream);
100 | 
101 | 	if (got >= nmemb || !ferror(stream)) {
102 | 		return got; /* full read, or a short read without a stream error */
103 | 	}
104 | 
105 | 	fprintf(stderr, "Error while reading from file, could not read more than %zu elements!\n", got);
106 | 	exit(EXIT_FAILURE);
107 | }
108 | 
109 | int Remove(const char *pathname) {
110 | 	/* remove() wrapper: a missing file (ENOENT) is tolerated, any other failure is fatal. */
111 | 	const int status = remove(pathname);
112 | 	if (status == 0 || errno == ENOENT) {
113 | 		return status;
114 | 	}
115 | 	fprintf(stderr, "Error removing file %s: %s\n", pathname, strerror(errno));
116 | 	exit(EXIT_FAILURE);
117 | }
118 | 
119 | off_t getFsize(const char *fpath) {
120 | 	/* Returns the st_size that stat() reports for fpath; exits if stat() fails. */
121 | 	struct stat info;
122 | 
123 | 	if (stat(fpath, &info) != 0) {
124 | 		fprintf(stderr, "Cannot stat file %s...\n", fpath);
125 | 		exit(EXIT_FAILURE);
126 | 	}
127 | 
128 | 	/* Reached only when stat() succeeded and filled in info. */
129 | 	return info.st_size;
130 | }
131 | 
132 | double Wtime(void) {
133 | 	/* CLOCK_MONOTONIC reading in (fractional) seconds; 0 if clock_gettime() fails. */
134 | 	struct timespec now;
135 | 
136 | 	if (clock_gettime(CLOCK_MONOTONIC, &now) != 0) return 0;
137 | 
138 | 	return (double)now.tv_sec + now.tv_nsec/1.0E+9;
139 | }
140 | 


--------------------------------------------------------------------------------
/optimized/old/utils.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Mauro Bisson <maurob@nvidia.com>
 5 |  *
 6 |  * Permission is hereby granted, free of charge, to any person obtaining a
 7 |  * copy of this software and associated documentation files (the "Software"),
 8 |  * to deal in the Software without restriction, including without limitation
 9 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 |  * and/or sell copies of the Software, and to permit persons to whom the
11 |  * Software is furnished to do so, subject to the following conditions:
12 |  *
13 |  * The above copyright notice and this permission notice shall be included in
14 |  * all copies or substantial portions of the Software.
15 |  *
16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 |  * DEALINGS IN THE SOFTWARE.
23 |  */
#ifndef __UTILS_H__
#define __UTILS_H__

/*
 * Declarations of small wrappers around C library / POSIX calls.
 *
 * The header pulls in the definitions of every type it mentions so that it
 * can be included first in a translation unit.
 */
#include <stddef.h>    /* size_t */
#include <stdio.h>     /* FILE   */
#include <sys/types.h> /* off_t  */

/* Give the functions C linkage when the header is consumed from C++/CUDA. */
#ifdef __cplusplus
#define UTILS_LINKAGE "C"
#else
#define UTILS_LINKAGE
#endif

/* Memory helpers; their definitions are not documented here (see utils.c). */
extern UTILS_LINKAGE void *Malloc(size_t sz);
extern UTILS_LINKAGE void Free(void **ptr);
extern UTILS_LINKAGE void *Realloc(void *ptr, size_t sz);

/* fopen() that exits the process instead of returning NULL. */
extern UTILS_LINKAGE FILE *Fopen(const char *path, const char *mode);
/* fwrite() that exits the process when fewer than nmemb items are written. */
extern UTILS_LINKAGE size_t Fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
/* fread() that exits the process when a short read comes with a stream error. */
extern UTILS_LINKAGE size_t Fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
/* remove() that exits the process on any failure other than ENOENT. */
extern UTILS_LINKAGE int Remove(const char *pathname);
/* Size in bytes of the file at fpath; exits the process if stat() fails. */
extern UTILS_LINKAGE off_t getFsize(const char *fpath);
/* CLOCK_MONOTONIC time in seconds; 0 if the clock cannot be read. */
extern UTILS_LINKAGE double Wtime(void);

#endif
44 | 


--------------------------------------------------------------------------------
/tensorcore/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_HOME=/usr/local/cuda
 2 | CUDACC=$(CUDA_HOME)/bin/nvcc
 3 | CC=gcc
 4 | LD=$(CUDACC)
 5 | CFLAGS=-c -O3 -g -I$(CUDA_HOME)/include
 6 | CUDACFLAGS= -std=c++11 -c -O3 -lineinfo -arch=sm_70 -Xptxas=-v -I../external/cub
 7 | LDFLAGS= -lcurand -lcublas
 8 | 
 9 | all: ising_tensorcore
10 | 
11 | ising_tensorcore: main.o
12 | 	$(LD) -o ising_tensorcore main.o $(LDFLAGS)
13 | 
14 | %.o: %.cu
15 | 	nvcc -c $(CUDACFLAGS) $<
16 | 
17 | clean:
18 | 	rm *.o ising_tensorcore
19 | 


--------------------------------------------------------------------------------
/tensorcore/README.md:
--------------------------------------------------------------------------------
 1 | ### Tensor Core implementation using CUDA C
 2 | 
 3 | ### Basic Usage
 4 | Compile binary with `make`.
 5 | 
 6 | Example run command:
 7 | 
 8 | `./ising_tensorcore -g <number of GPUs> -x <rows / 256> -y <columns / 256> -n <number of iterations> `
 9 | 
10 | Run `./ising_tensorcore --help` for more options.
11 | 
12 | ### Visualizing Results
13 | `-o` flag enables output of final lattice configuration to text file `final.txt`. Use provided `plot_ising.py` to visualize output.
14 | 
15 | For example:
16 | ```
17 | $ ./ising_tensorcore -g 2 -x 8 -y 8 -n 100 -a 0.5 -o
18 | ...
19 | Writing lattice to final.txt...
20 | 
21 | $ python plot_ising.py
22 | ```
23 | 
24 | This will produce the following output:
25 | 
26 | ![sample_plot.png](sample_plot.png)
27 | 


--------------------------------------------------------------------------------
/tensorcore/cudamacro.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Permission is hereby granted, free of charge, to any person obtaining a
 5 |  * copy of this software and associated documentation files (the "Software"),
 6 |  * to deal in the Software without restriction, including without limitation
 7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 8 |  * and/or sell copies of the Software, and to permit persons to whom the
 9 |  * Software is furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 |  * DEALINGS IN THE SOFTWARE.
21 |  */
#ifndef __CUDA_MACRO_H__
#define __CUDA_MACRO_H__

#include <stdio.h>  /* fprintf, stderr */
#include <stdlib.h> /* exit, EXIT_FAILURE */

/*
 * Error-checking helpers. Each macro evaluates its argument exactly once and,
 * on failure, prints a diagnostic to stderr and terminates the process with
 * EXIT_FAILURE.
 *
 * The result is stored in a local with a macro-specific name so that it
 * cannot shadow an identifier used inside the checked expression (a local
 * called `err` or `status` would turn CHECK_CUDA(f(err)) into a
 * self-initialisation).
 *
 * The macros expand to a plain brace block, as before, so existing call sites
 * written with or without a trailing semicolon keep compiling. Do not use
 * them as the unbraced body of an `if` that has an `else`.
 */

/* Checks a CUDA runtime call returning cudaError_t. */
#define CHECK_CUDA(call) {                                                   \
    const cudaError_t check_cuda_err_ = (call);                              \
    if( cudaSuccess != check_cuda_err_) {                                    \
        fprintf(stderr, "CUDA error in file '%s' in line %i : %s.\n",        \
                __FILE__, __LINE__, cudaGetErrorString( check_cuda_err_) );  \
        exit(EXIT_FAILURE);                                                  \
    }}

/* Checks a cuBLAS call returning cublasStatus_t. */
#define CHECK_CUBLAS(call) {                                                 \
    const cublasStatus_t check_cublas_status_ = (call);                      \
    if( CUBLAS_STATUS_SUCCESS != check_cublas_status_) {                     \
        fprintf(stderr, "CUBLAS error: %s = %d at (%s:%d)\n", #call,         \
                (int) check_cublas_status_, __FILE__, __LINE__);             \
        exit(EXIT_FAILURE);                                                  \
    }}

/* Checks a cuRAND call returning curandStatus_t. */
#define CHECK_CURAND(call) {                                                 \
    const curandStatus_t check_curand_status_ = (call);                      \
    if( CURAND_STATUS_SUCCESS != check_curand_status_) {                     \
        fprintf(stderr, "CURAND error: %s = %d at (%s:%d)\n", #call,         \
                (int) check_curand_status_, __FILE__, __LINE__);             \
        exit(EXIT_FAILURE);                                                  \
    }}

/*
 * Checks the value of cudaGetLastError(), e.g. right after a kernel launch.
 * errorMessage is a C string that is included in the diagnostic.
 */
#define CHECK_ERROR(errorMessage) {                                          \
    const cudaError_t check_error_err_ = cudaGetLastError();                 \
    if( cudaSuccess != check_error_err_) {                                   \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
                (errorMessage), __FILE__, __LINE__,                          \
                cudaGetErrorString( check_error_err_) );                     \
        exit(EXIT_FAILURE);                                                  \
    }}
#endif
57 | 


--------------------------------------------------------------------------------
/tensorcore/main.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * Permission is hereby granted, free of charge, to any person obtaining a
  5 |  * copy of this software and associated documentation files (the "Software"),
  6 |  * to deal in the Software without restriction, including without limitation
  7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8 |  * and/or sell copies of the Software, and to permit persons to whom the
  9 |  * Software is furnished to do so, subject to the following conditions:
 10 |  *
 11 |  * The above copyright notice and this permission notice shall be included in
 12 |  * all copies or substantial portions of the Software.
 13 |  *
 14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 20 |  * DEALINGS IN THE SOFTWARE.
 21 |  */
 22 | 
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <getopt.h>
#include <iostream>
#include <string>

#include <cuda_fp16.h>
#include <curand_kernel.h>
#include <cublas_v2.h>

#include <cub/cub.cuh>

#include "cudamacro.h"
 36 | 
// Edge length, in spins, of one square "super block" of the lattice.
#define LATTICE_SUP_N (256)
// Edge length of one "sub block"; a super block is made of 2 x 2 sub blocks.
#define LATTICE_SUB_N (LATTICE_SUP_N / 2)
// Critical temperature constant; main() uses alpha * TCRIT as the temperature.
#define TCRIT 2.26918531421f
// Threads per block used by the kernel launches below.
#define THREADS  (LATTICE_SUB_N)

// Element offset of super block (i,j) in a lattice that is nbx super blocks
// wide. Each super block occupies LATTICE_SUP_N * LATTICE_SUP_N consecutive
// elements; the cast keeps the arithmetic in long long.
#define SUP_OFFSET(i,j,nbx) (((j)*(long long)(nbx) + (i))*LATTICE_SUP_N*LATTICE_SUP_N)
// Element offset of sub block (i,j), i and j in {0,1}, inside its super block.
// Equals (2*j + i) * LATTICE_SUB_N * LATTICE_SUB_N, i.e. the four sub blocks
// are stored one after another.
#define SUB_OFFSET(i,j) (((j)*LATTICE_SUP_N + (i)*LATTICE_SUB_N)*LATTICE_SUB_N)
// Element offset of entry (i,j) inside a sub block; i is the fastest index.
#define SUB_ELEM(i,j) ((j)*LATTICE_SUB_N + (i))

// 2^31 - 2^28. NOTE(review): not referenced in this part of the file;
// presumably a per-call element limit for CUB - confirm at the use site.
#define CUB_CHUNK_SIZE ((1ll<<31) - (1ll<<28))
 47 | 
 48 | __global__ void set_k(__half* k, __half* kT) {
 49 |   const int tid = blockDim.x * blockIdx.x + threadIdx.x;
 50 |   const int i = tid % LATTICE_SUB_N;
 51 |   const int j = tid / LATTICE_SUB_N;
 52 |   if (j >= LATTICE_SUB_N) return;
 53 | 
 54 |   __half val = __float2half(0.0f);
 55 |   if (i == j || i + 1 == j) {
 56 |     val = __float2half(1.0f);
 57 |   }
 58 | 
 59 |   k[j*LATTICE_SUB_N + i] = val;
 60 |   kT[i*LATTICE_SUB_N + j] = val;
 61 | }
 62 | 
 63 | __global__ void init_spins(__half* lattice,
 64 |                            const unsigned long long seed,
 65 |                            const int nbx,
 66 |                            const int nby,
 67 |                            const long long offset) {
 68 |   const long long tid = static_cast<long long>(blockDim.x) * blockIdx.x + threadIdx.x + offset;
 69 |   const long long nx = nbx * LATTICE_SUP_N;
 70 |   const long long ny = nby * LATTICE_SUP_N;
 71 |   if (tid >= nx * ny) return;
 72 | 
 73 |   curandStatePhilox4_32_10_t state;
 74 |   curand_init(seed, tid, 0, &state);
 75 |   float randval = curand_uniform(&state);
 76 |   __half val = (randval < 0.5f) ? __float2half(-1.0f) : __float2half(1.0f);
 77 | 
 78 |   lattice[tid] = val;
 79 | }
 80 | 
// Pack of N __half values whose alignment equals its total size
// (sizeof(__half) * N bytes). update_spins reads and writes a whole pack
// through a single reinterpret_cast'ed pointer dereference.
template <int N>
struct __align__(sizeof(__half)*N) halfn {
  __half val[N];
};
 85 | 
 86 | #define NLOOPS 2
 87 | #define SPINSPERTHREAD 8
 88 | template<bool is_black>
 89 | __global__ void update_spins(__half* lattice,
 90 |                              float inv_temp,
 91 |                              const __half* __restrict__ nn_sums,
 92 |                              const unsigned long long seed,
 93 |                              const unsigned long long iter,
 94 |                              const int nbx,
 95 |                              const int nby,
 96 |                              const long long  offset) {
 97 |   const long long tid = static_cast<long long>(blockDim.x) * blockIdx.x + threadIdx.x + offset;
 98 | 
 99 |   const int threads_per_subblock = LATTICE_SUB_N * LATTICE_SUB_N / (NLOOPS * SPINSPERTHREAD);
100 | 
101 |   int bi = tid / threads_per_subblock % (2 * nbx);
102 |   int bj = tid / (threads_per_subblock * 2 * nbx);
103 | 
104 |   // subblock local thread idx
105 |   int tl = tid % threads_per_subblock;
106 | 
107 |   if (bj >= nby) return;
108 | 
109 |   // Offset threads depending on parity and color
110 |   if (is_black) {
111 |     if (bi % 2) {
112 |       bj = 2*bj + 1;
113 |     } else {
114 |       bj = 2*bj;
115 |     }
116 |   } else {
117 |     if (bi % 2) {
118 |       bj = 2*bj;
119 |     } else {
120 |       bj = 2*bj + 1;
121 |     }
122 |   }
123 | 
124 |   curandStatePhilox4_32_10_t state;
125 |   curand_init(seed, tid, iter, &state);
126 | 
127 |   #pragma unroll
128 |   for (int n = 0; n < NLOOPS; n++) {
129 |     size_t elem_offset = SUP_OFFSET(bi/2, bj/2, nbx) + SUB_OFFSET(bi%2, bj%2) + (tl + n * threads_per_subblock) * SPINSPERTHREAD;
130 | 
131 |     halfn<SPINSPERTHREAD> lij = *(reinterpret_cast<halfn<SPINSPERTHREAD>*>(lattice + elem_offset));
132 |     const halfn<SPINSPERTHREAD> nn = *(reinterpret_cast<const halfn<SPINSPERTHREAD>*>(nn_sums + elem_offset));
133 | 
134 |     #pragma unroll
135 |     for (int m = 0; m < SPINSPERTHREAD; m++) {
136 |       float randval = curand_uniform(&state);
137 |       float accept = exp(-2.0f * inv_temp * __half2float(nn.val[m] * lij.val[m]));
138 |       if (randval < accept) {
139 |         lij.val[m] = -lij.val[m];
140 |       }
141 |     }
142 | 
143 |     *reinterpret_cast<halfn<SPINSPERTHREAD>*>(lattice + elem_offset) = lij;
144 | 
145 |   }
146 | }
147 | 
// Adds to nn_sums the neighbor contributions that come from adjacent sub
// blocks, for the sub blocks of one color.
//
// lattice : spin values (read only)
// nn_sums : neighbor sums, same layout as lattice; updated in place with +=
// nbx/nby : lattice size in super blocks
// offset  : added to the thread index (used to split the work across launches)
//
// LATTICE_SUB_N consecutive thread indices handle one sub block; each thread
// performs two accumulations, one per direction. Neighbor sub block indices
// wrap around at the lattice edges.
// NOTE(review): the __syncthreads() below is executed after an early return;
// this assumes blockDim.x == LATTICE_SUB_N and a block-aligned offset so that
// all threads of a block share the same bj - confirm against the launch code.
template<bool is_black>
__global__ void add_boundaries(const __half* __restrict__ lattice,
                               __half* nn_sums,
                               const int nbx,
                               const int nby,
                               const long long offset) {
  const long long tid = static_cast<long long>(blockDim.x) * blockIdx.x + threadIdx.x + offset;

  // subblock i,j (1 thread block per subblock)
  int bi = tid / LATTICE_SUB_N % (2 * nbx);
  int bj = tid / (LATTICE_SUB_N * 2 * nbx);

  // subblock local i
  int il = tid % LATTICE_SUB_N;

  if (bj >= nby) return;

  // Offset threads depending on parity and color
  // jl: index of the local line that receives the contribution,
  // jb: index of the line read from the neighboring sub block.
  int jl, jb;
  if (is_black) {
    if (bi % 2) {
      bj = 2*bj + 1;
      jl = LATTICE_SUB_N - 1;
      jb = 0;
    } else {
      bj = 2*bj;
      jl = 0;
      jb = LATTICE_SUB_N - 1;
    }
  } else {
    if (bi % 2) {
      bj = 2*bj;
      jl = 0;
      jb = LATTICE_SUB_N - 1;
    } else {
      bj = 2*bj + 1;
      jl = LATTICE_SUB_N - 1;
      jb = 0;
    }
  }

  // Neighboring sub block indices in both directions, wrapping around at the
  // edges of the 2*nbx by 2*nby grid of sub blocks.
  int bn = 2*nbx;
  int bm = 2*nby;
  int bin = (bi - 1 >= 0) ? bi - 1 : bn - 1;
  int bip = (bi + 1 < bn) ? bi + 1 : 0;
  int bjn = (bj - 1 >= 0) ? bj - 1 : bm - 1;
  int bjp = (bj + 1 < bm) ? bj + 1 : 0;

  // Update LR
  // The neighbor is the previous sub block in j when the local line is 0 and
  // the next one otherwise.
  size_t boundary_offset;
  if (jl == 0) {
    boundary_offset = SUP_OFFSET(bi/2, bjn/2, nbx) + SUB_OFFSET(bi%2, bjn%2);
  } else {
    boundary_offset = SUP_OFFSET(bi/2, bjp/2, nbx) + SUB_OFFSET(bi%2, bjp%2);
  }

  size_t local_offset = SUP_OFFSET(bi/2, bj/2, nbx) + SUB_OFFSET(bi%2, bj%2);
  *(nn_sums + local_offset + SUB_ELEM(il, jl)) += *(lattice + boundary_offset + SUB_ELEM(il, jb));


  // Update UD
  // For white the roles of the first and last line are swapped for this
  // direction.
  if (!is_black) {
    jl = (jl == 0) ? LATTICE_SUB_N - 1 : 0;
    jb = (jb == 0) ? LATTICE_SUB_N - 1 : 0;
  }

  if (jl == 0) {
    boundary_offset = SUP_OFFSET(bin/2, bj/2, nbx) + SUB_OFFSET(bin%2, bj%2);
  } else {
    boundary_offset = SUP_OFFSET(bip/2, bj/2, nbx) + SUB_OFFSET(bip%2, bj%2);
  }

  __half bval = *(lattice + boundary_offset + SUB_ELEM(jb, il));

  // The two accumulations of different threads can target the same nn_sums
  // element (where the two boundary lines cross), so the second one is issued
  // only after the barrier.
  __syncthreads();

  *(nn_sums + local_offset + SUB_ELEM(jl, il)) += bval;

}
227 | 
// Blocks the host until devices 0 .. nGPUs-1 have finished all queued work.
// Leaves device nGPUs-1 selected as the current device (when nGPUs > 0).
void sync(int nGPUs) {
  int dev = 0;
  while (dev < nGPUs) {
    CHECK_CUDA(cudaSetDevice(dev));
    CHECK_CUDA(cudaDeviceSynchronize());
    ++dev;
  }
}
235 | 
236 | 
// Updates all spins of one color on every GPU and waits for completion.
//
// A0/B0/A1/B1/C : per-color pointer arrays for the two batched GEMMs
//                 (C = A0 x B0, then C += A1 x B1), 2 * nbx * nby entries each
// rng_offset    : curand sequence offset handed to update_spins
//
// Every device handles an equal, contiguous share of the batch entries and of
// the kernel thread range; the caller must make sure the counts are divisible
// by nGPUs.
template <bool is_black>
static void update_color(__half **A0, __half **B0, __half **A1, __half **B1, __half **C,
                         __half *lattice, float inv_temp, __half *nn_sums,
                         cublasHandle_t *cublas_handles, unsigned long long rng_offset,
                         int nbx, int nby, unsigned long long seed, int nGPUs) {

  const int batchCountPerGPU = (2 * nbx * nby) / nGPUs;

  const __half alpha = __float2half(1.0f);
  const __half beta0 = __float2half(0.0f);  // first GEMM overwrites C
  const __half beta1 = __float2half(1.0f);  // second GEMM accumulates into C

  // One block per sub block for the boundary kernel; the spin kernel needs
  // NLOOPS * SPINSPERTHREAD times fewer threads.
  const int boundaryBlocksPerGPU = (2 * nbx * nby) / nGPUs;
  const int spinBlocksPerGPU = ((2 * nbx * nby * LATTICE_SUB_N) / (NLOOPS * SPINSPERTHREAD)) / nGPUs;

  for (int dev = 0; dev < nGPUs; dev++) {
    CHECK_CUDA(cudaSetDevice(dev));

    CHECK_CUBLAS(cublasGemmBatchedEx(cublas_handles[dev], CUBLAS_OP_N, CUBLAS_OP_N, LATTICE_SUB_N, LATTICE_SUB_N, LATTICE_SUB_N,
                                     &alpha, (void**) &A0[dev * batchCountPerGPU], CUDA_R_16F, LATTICE_SUB_N,
                                     (void**) &B0[dev * batchCountPerGPU], CUDA_R_16F, LATTICE_SUB_N, &beta0,
                                     (void**) &C[dev * batchCountPerGPU], CUDA_R_16F, LATTICE_SUB_N, batchCountPerGPU,
                                     CUDA_R_16F, CUBLAS_GEMM_ALGO0_TENSOR_OP));

    CHECK_CUBLAS(cublasGemmBatchedEx(cublas_handles[dev], CUBLAS_OP_N, CUBLAS_OP_N, LATTICE_SUB_N, LATTICE_SUB_N, LATTICE_SUB_N,
                                     &alpha, (void**) &A1[dev * batchCountPerGPU], CUDA_R_16F, LATTICE_SUB_N,
                                     (void**) &B1[dev * batchCountPerGPU], CUDA_R_16F, LATTICE_SUB_N, &beta1,
                                     (void**) &C[dev * batchCountPerGPU], CUDA_R_16F, LATTICE_SUB_N, batchCountPerGPU,
                                     CUDA_R_16F, CUBLAS_GEMM_ALGO0_TENSOR_OP));

    add_boundaries<is_black><<<boundaryBlocksPerGPU, THREADS>>>(lattice, nn_sums, nbx, nby,
                                                                dev * ((long long)boundaryBlocksPerGPU * THREADS));
    // Kernel launches do not return a status; query it explicitly.
    CHECK_ERROR("add_boundaries launch failed");

    update_spins<is_black><<<spinBlocksPerGPU, THREADS>>>(lattice, inv_temp, nn_sums, seed, rng_offset, nbx, nby,
                                                          dev * ((long long)spinBlocksPerGPU * THREADS));
    CHECK_ERROR("update_spins launch failed");
  }

  // The other color reads the spins written here, so wait for every device.
  sync(nGPUs);
}

// Performs one full lattice sweep: all black spins, then all white spins.
//
// Ab0..Cb  : batched-GEMM pointer arrays for the black pass
// Aw0..Cw  : batched-GEMM pointer arrays for the white pass
// lattice  : spin values, updated in place
// inv_temp : inverse temperature forwarded to update_spins
// nn_sums  : scratch array for neighbor sums, same size as lattice
// cublas_handles : one cuBLAS handle per device, indexed by device id
// iter     : sweep counter (expected to be non-negative); the two passes use
//            curand offsets 2*iter and 2*iter+1, each multiplied by
//            NLOOPS * SPINSPERTHREAD
// nbx/nby  : lattice size in super blocks
// seed     : curand seed
// nGPUs    : number of devices (ids 0 .. nGPUs-1) sharing the work
//
// Returns after all devices have finished the white pass. Any CUDA or cuBLAS
// error terminates the process through the CHECK_* macros.
void update(__half **Ab0, __half **Bb0, __half **Ab1, __half **Bb1, __half **Cb,
            __half **Aw0, __half **Bw0, __half **Aw1, __half **Bw1, __half **Cw,
            __half *lattice, float inv_temp, __half *nn_sums, cublasHandle_t *cublas_handles, int iter,
            int nbx, int nby, unsigned long long seed, int nGPUs) {

  // Computed in 64 bits so that long runs cannot overflow int.
  const unsigned long long draws_per_pass = NLOOPS * SPINSPERTHREAD;
  const unsigned long long black_offset = (2ULL * iter) * draws_per_pass;
  const unsigned long long white_offset = (2ULL * iter + 1ULL) * draws_per_pass;

  // Update black
  update_color<true>(Ab0, Bb0, Ab1, Bb1, Cb, lattice, inv_temp, nn_sums,
                     cublas_handles, black_offset, nbx, nby, seed, nGPUs);

  // Update white
  update_color<false>(Aw0, Bw0, Aw1, Bw1, Cw, lattice, inv_temp, nn_sums,
                      cublas_handles, white_offset, nbx, nby, seed, nGPUs);
}
299 | 
// Writes the lattice to a text file, one lattice row per line with the values
// separated (and followed) by a single space.
//
// lattice  : spin values in the blocked layout described by SUP_OFFSET /
//            SUB_OFFSET / SUB_ELEM
// filename : output path
// nbx/nby  : lattice size in super blocks
// nGPUs    : number of devices; the lattice is copied to the host in nGPUs
//            equal parts, so nby must be a multiple of nGPUs
//
// Inside each super block the four sub blocks are interleaved: entry (i,j) of
// sub block (a,b) lands at column 2*i + a, row 2*j + b of the super block.
//
// Exits the process if the host buffers cannot be allocated or a CUDA call
// fails. If the file cannot be opened or written, an error is printed to
// stderr and the function returns without terminating the process.
void write_lattice(__half *lattice, std::string filename, int nbx, int nby, int nGPUs) {
  printf("Writing lattice to %s...\n", filename.c_str());

  // Widen before multiplying so the dimensions cannot overflow int.
  const long long nx = static_cast<long long>(nbx) * LATTICE_SUP_N;
  const long long ny = static_cast<long long>(nby) * LATTICE_SUP_N;
  const size_t nspins = static_cast<size_t>(nx) * static_cast<size_t>(ny);

  __half* lattice_h = (__half*) malloc(nspins * sizeof(*lattice_h));
  float* lattice_true_h = (float*) malloc(nspins * sizeof(*lattice_true_h));
  if (!lattice_h || !lattice_true_h) {
    fprintf(stderr, "ERROR: cannot allocate host buffers for a %lld x %lld lattice.\n", nx, ny);
    free(lattice_h);
    free(lattice_true_h);
    exit(EXIT_FAILURE);
  }

  const long long spinsPerGPU = nx * (ny/nGPUs);
  // Copy out full lattice to host
  for (int dev = 0; dev < nGPUs; dev++) {
    CHECK_CUDA(cudaSetDevice(dev));
    CHECK_CUDA(cudaMemcpy(&lattice_h[dev * spinsPerGPU], &lattice[dev * spinsPerGPU], spinsPerGPU * sizeof(*lattice_h), cudaMemcpyDeviceToHost));
  }

  // Convert from the blocked layout to a plain row-major float image
  for (int bj = 0; bj < nby; bj++) {
    for (int bi = 0; bi < nbx; bi++) {
      const __half* sup = lattice_h + SUP_OFFSET(bi, bj, nbx);
      const __half* l00 = sup + SUB_OFFSET(0, 0);
      const __half* l01 = sup + SUB_OFFSET(0, 1);
      const __half* l10 = sup + SUB_OFFSET(1, 0);
      const __half* l11 = sup + SUB_OFFSET(1, 1);

      const long long offset = (bj * LATTICE_SUP_N) * nx + (bi * LATTICE_SUP_N);
      for(int j = 0; j < LATTICE_SUB_N; j++) {
        float* even_row = lattice_true_h + offset + (2*j) * nx;
        float* odd_row = lattice_true_h + offset + (2*j + 1) * nx;
        for(int i = 0; i < LATTICE_SUB_N; i++) {
          even_row[2*i] = __half2float(*(l00 + SUB_ELEM(i, j)));
          odd_row[2*i + 1] = __half2float(*(l11 + SUB_ELEM(i, j)));
          even_row[2*i + 1] = __half2float(*(l10 + SUB_ELEM(i, j)));
          odd_row[2*i] = __half2float(*(l01 + SUB_ELEM(i, j)));
        }
      }
    }
  }

  // Write file
  std::ofstream f;
  f.open(filename);
  if (f.is_open()) {
    for (long long j = 0; j < ny; j++) {
      for (long long i = 0; i < nx; i++) {
         f << lattice_true_h[j * nx + i] << " ";
      }
      // '\n' instead of std::endl: no flush after every row
      f << '\n';
    }
    f.close();
    if (f.fail()) {
      fprintf(stderr, "ERROR: failure while writing %s.\n", filename.c_str());
    }
  } else {
    fprintf(stderr, "ERROR: cannot open %s for writing.\n", filename.c_str());
  }

  free(lattice_h);
  free(lattice_true_h);
}
353 | 
// Prints the command line help to stdout and exits with EXIT_SUCCESS.
//
// pname: program path as received in argv[0]; only the part after the last
//        '/' is shown.
//
// The option names listed here mirror the short option string and the
// long_options table used by main(): the iteration count is --niter, and -a
// has no long form.
static void usage(const char *pname) {

  const char *slash = strrchr(pname, '/');
  const char *bname = slash ? slash + 1 : pname;

  fprintf(stdout,
          "Usage: %s [options]\n"
          "options:\n"
          "\t-x|--lattice-nbx <LATTICE_NBX>\n"
          "\t\tnumber of blocks along lattice rows (number of rows / 256)\n"
          "\n"
          "\t-y|--lattice-nby <LATTICE_NBY>\n"
          "\t\tnumber of blocks along lattice columns (number of columns / 256)\n"
          "\n"
          "\t-g|--ngpus <NGPUS>\n"
          "\t\tnumber of GPUs to use for simulation\n"
          "\n"
          "\t-w|--nwarmup <NWARMUP>\n"
          "\t\tnumber of warmup iterations\n"
          "\n"
          "\t-n|--niter <NITERS>\n"
          "\t\tnumber of trial iterations\n"
          "\n"
          "\t-a <ALPHA>\n"
          "\t\tcoefficient of critical temperature\n"
          "\n"
          "\t-s|--seed <SEED>\n"
          "\t\tseed for random number generation\n"
          "\n"
          "\t-o|--write-lattice\n"
          "\t\twrite final lattice configuration to file\n"
          "\n"
          "\t-h|--help\n"
          "\t\tprint this help message and exit\n\n",
          bname);
  exit(EXIT_SUCCESS);
}
389 | 
390 | int main(int argc, char **argv) {
391 | 
392 |   // Defaults
393 |   int nbx = 10; // Lattice rows dimension (in number of super blocks)
394 |   int nby = 10; // Lattice columns dimension (in number of super blocks)
395 |   float alpha = 0.1f; // coefficient of critical temperature
396 |   int niter = 1000;
397 |   int nwarmup = 100;
398 |   bool write = false;
399 |   int nGPUs = 1;
400 |   unsigned long long seed = 1234ULL;
401 | 
402 |   while (1) {
403 |     static struct option long_options[] = {
404 |         {   "lattice-nbx", required_argument, 0, 'x'},
405 |         {   "lattice-nby", required_argument, 0, 'y'},
406 |         {         "ngpus", required_argument, 0, 'g'},
407 |         {          "seed", required_argument, 0, 's'},
408 |         {       "nwarmup", required_argument, 0, 'w'},
409 |         {         "niter", required_argument, 0, 'n'},
410 |         { "write-lattice",       no_argument, 0, 'o'},
411 |         {          "help",       no_argument, 0, 'h'},
412 |         {               0,                 0, 0,   0}
413 |     };
414 | 
415 |     int option_index = 0;
416 |     int ch = getopt_long(argc, argv, "x:y:g:a:s:w:n:oh", long_options, &option_index);
417 |     if (ch == -1) break;
418 | 
419 |     switch(ch) {
420 |       case 0:
421 |         break;
422 |       case 'x':
423 |         nbx = atoi(optarg); break;
424 |       case 'y':
425 |         nby = atoi(optarg); break;
426 |       case 'g':
427 |         nGPUs = atoi(optarg); break;
428 |       case 'a':
429 |         alpha = atof(optarg); break;
430 |       case 's':
431 |         seed = atoll(optarg); break;
432 |       case 'w':
433 |         nwarmup = atoi(optarg); break;
434 |       case 'n':
435 |         niter = atoi(optarg); break;
436 |       case 'o':
437 |         write = true; break;
438 |       case 'h':
439 |         usage(argv[0]); break;
440 |       case '?':
441 |         exit(EXIT_FAILURE);
442 |       default:
443 |         fprintf(stderr, "unknown option: %c\n", ch);
444 |         exit(EXIT_FAILURE);
445 |     }
446 |   }
447 | 
448 |   if (nby % nGPUs != 0) {
449 |     fprintf(stderr, "ERROR: Number of super blocks in y dimension must be multiple of number of gpus.\n");
450 |     exit(EXIT_FAILURE);
451 |   }
452 | 
453 |   long long nx = nbx * LATTICE_SUP_N;
454 |   long long ny = nby * LATTICE_SUP_N;
455 | 
456 |   __half* lattice;
457 |   __half* nn_sums;
458 |   __half* k;
459 |   __half* kT;
460 |   CHECK_CUDA(cudaMallocManaged(&lattice, nx * ny * sizeof(*lattice)));
461 |   CHECK_CUDA(cudaMallocManaged(&nn_sums, nx * ny * sizeof(*nn_sums)));
462 |   CHECK_CUDA(cudaMallocManaged(&k, LATTICE_SUB_N * LATTICE_SUB_N * sizeof(*k)));
463 |   CHECK_CUDA(cudaMallocManaged(&kT, LATTICE_SUB_N * LATTICE_SUB_N * sizeof(*kT)));
464 | 
465 |   for (int dev = 0; dev < nGPUs; dev++) {
466 |     CHECK_CUDA(cudaMemAdvise(k, LATTICE_SUB_N * LATTICE_SUB_N * sizeof(*k), cudaMemAdviseSetReadMostly, dev));
467 |     CHECK_CUDA(cudaMemAdvise(k, LATTICE_SUB_N * LATTICE_SUB_N * sizeof(*k), cudaMemAdviseSetAccessedBy, dev));
468 |     CHECK_CUDA(cudaMemAdvise(kT, LATTICE_SUB_N * LATTICE_SUB_N * sizeof(*kT), cudaMemAdviseSetReadMostly, dev));
469 |     CHECK_CUDA(cudaMemAdvise(kT, LATTICE_SUB_N * LATTICE_SUB_N * sizeof(*kT), cudaMemAdviseSetAccessedBy, dev));
470 |   }
471 | 
472 |   long long spinsPerGPU = nx * ny / nGPUs;
473 |   for (int dev = 0; dev < nGPUs; dev++) {
474 |     CHECK_CUDA(cudaMemAdvise(&lattice[dev * spinsPerGPU], spinsPerGPU * sizeof(*lattice), cudaMemAdviseSetPreferredLocation, dev));
475 |     CHECK_CUDA(cudaMemAdvise(&nn_sums[dev * spinsPerGPU], spinsPerGPU * sizeof(*nn_sums), cudaMemAdviseSetPreferredLocation, dev));
476 |   }
477 | 
478 |   cublasHandle_t* cublas_handles;
479 |   cublas_handles = (cublasHandle_t*) malloc(nGPUs * sizeof(cublasHandle_t));
480 |   for (int dev = 0; dev < nGPUs; dev++) {
481 |     CHECK_CUDA(cudaSetDevice(dev));
482 |     CHECK_CUBLAS(cublasCreate(&cublas_handles[dev]));
483 |     CHECK_CUBLAS(cublasSetMathMode(cublas_handles[dev], CUBLAS_TENSOR_OP_MATH));
484 |   }
485 | 
486 |   // Setup k and k transpose matrices
487 |   CHECK_CUDA(cudaSetDevice(0));
488 |   int blocks = (LATTICE_SUB_N * LATTICE_SUB_N +  THREADS - 1) / THREADS;
489 |   set_k<<<blocks, THREADS>>>(k, kT);
490 | 
491 |   // Initialize lattice spins randomly
492 |   for (int dev = 0; dev < nGPUs; dev++) {
493 |     CHECK_CUDA(cudaSetDevice(dev));
494 |     blocks = (nx * ny + THREADS - 1) / THREADS;
495 |     int blocksPerGPU = blocks/nGPUs;
496 |     init_spins<<<blocksPerGPU, THREADS>>>(lattice, seed, nbx, nby, dev * nx * (ny/nGPUs));
497 |   }
498 | 
499 |   sync(nGPUs);
500 | 
501 |   // Setup pointers for batched GEMMS
502 |   __half **Ab0, **Bb0;
503 |   __half **Ab1, **Bb1;
504 |   __half **Aw0, **Bw0;
505 |   __half **Aw1, **Bw1;
506 |   __half **Cb, **Cw;
507 | 
508 |   int batchCount = 2 * (nbx * nby);
509 |   int batchCountPerGPU = batchCount / nGPUs;
510 |   CHECK_CUDA(cudaMallocManaged(&Ab0, batchCount * sizeof(*Ab0)));
511 |   CHECK_CUDA(cudaMallocManaged(&Bb0, batchCount * sizeof(*Bb0)));
512 |   CHECK_CUDA(cudaMallocManaged(&Ab1, batchCount * sizeof(*Ab1)));
513 |   CHECK_CUDA(cudaMallocManaged(&Bb1, batchCount * sizeof(*Bb1)));
514 |   CHECK_CUDA(cudaMallocManaged(&Aw0, batchCount * sizeof(*Aw0)));
515 |   CHECK_CUDA(cudaMallocManaged(&Bw0, batchCount * sizeof(*Bw0)));
516 |   CHECK_CUDA(cudaMallocManaged(&Aw1, batchCount * sizeof(*Aw1)));
517 |   CHECK_CUDA(cudaMallocManaged(&Bw1, batchCount * sizeof(*Bw1)));
518 |   CHECK_CUDA(cudaMallocManaged(&Cb, batchCount * sizeof(*Cb)));
519 |   CHECK_CUDA(cudaMallocManaged(&Cw, batchCount * sizeof(*Cw)));
520 | 
521 |   for (int dev = 0; dev < nGPUs; dev++) {
522 |     CHECK_CUDA(cudaMemAdvise(&Ab0[dev * batchCountPerGPU], batchCountPerGPU * sizeof(*Ab0), cudaMemAdviseSetPreferredLocation, dev));
523 |     CHECK_CUDA(cudaMemAdvise(&Bb0[dev * batchCountPerGPU], batchCountPerGPU * sizeof(*Bb0), cudaMemAdviseSetPreferredLocation, dev));
524 |     CHECK_CUDA(cudaMemAdvise(&Ab1[dev * batchCountPerGPU], batchCountPerGPU * sizeof(*Ab1), cudaMemAdviseSetPreferredLocation, dev));
525 |     CHECK_CUDA(cudaMemAdvise(&Bb1[dev * batchCountPerGPU], batchCountPerGPU * sizeof(*Bb1), cudaMemAdviseSetPreferredLocation, dev));
526 |     CHECK_CUDA(cudaMemAdvise(&Aw0[dev * batchCountPerGPU], batchCountPerGPU * sizeof(*Aw0), cudaMemAdviseSetPreferredLocation, dev));
527 |     CHECK_CUDA(cudaMemAdvise(&Bw0[dev * batchCountPerGPU], batchCountPerGPU * sizeof(*Bw0), cudaMemAdviseSetPreferredLocation, dev));
528 |     CHECK_CUDA(cudaMemAdvise(&Aw1[dev * batchCountPerGPU], batchCountPerGPU * sizeof(*Aw1), cudaMemAdviseSetPreferredLocation, dev));
529 |     CHECK_CUDA(cudaMemAdvise(&Bw1[dev * batchCountPerGPU], batchCountPerGPU * sizeof(*Bw1), cudaMemAdviseSetPreferredLocation, dev));
530 |     CHECK_CUDA(cudaMemAdvise(&Cb[dev * batchCountPerGPU], batchCountPerGPU * sizeof(*Cb), cudaMemAdviseSetPreferredLocation, dev));
531 |     CHECK_CUDA(cudaMemAdvise(&Cw[dev * batchCountPerGPU], batchCountPerGPU * sizeof(*Cw), cudaMemAdviseSetPreferredLocation, dev));
532 |   }
533 | 
534 |   int idx = 0;
535 | 
536 |   for (int bj = 0; bj < nby; bj++) {
537 |     for (int bi = 0; bi < nbx; bi++) {
538 |       __half* nn_sums00 = nn_sums + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(0, 0);
539 |       __half* nn_sums11 = nn_sums + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(1, 1);
540 |       __half* nn_sums01 = nn_sums + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(0, 1);
541 |       __half* nn_sums10 = nn_sums + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(1, 0);
542 |       __half* lat00 = lattice + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(0, 0);
543 |       __half* lat11 = lattice + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(1, 1);
544 |       __half* lat01 = lattice + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(0, 1);
545 |       __half* lat10 = lattice + SUP_OFFSET(bi, bj, nbx) + SUB_OFFSET(1, 0);
546 | 
547 |       // Black:
548 |       //nn_sum(0,0) = lattice(0,1) x K   + K^T x lattice(1,0)
549 |       //nn_sum(1,1) = lattice(1,0) x K^T + K x lattice(0,1)
550 |       Ab0[idx  ] = lat01; Bb0[idx  ] = k;
551 |       Ab0[idx+1] = lat10; Bb0[idx+1] = kT;
552 | 
553 |       Ab1[idx  ] = kT; Bb1[idx  ] = lat10;
554 |       Ab1[idx+1] = k;  Bb1[idx+1] = lat01;
555 | 
556 |       Cb[idx  ] = nn_sums00;
557 |       Cb[idx+1] = nn_sums11;
558 | 
559 |       // White:
560 |       //nn_sum(1,0) = lattice(1,1) x K   + K x lattice(0,0)
561 |       //nn_sum(0,1) = lattice(0,0) x K^T + K^T x lattice(1,1)
562 |       Aw0[idx  ] = lat00 ; Bw0[idx  ] = kT;
563 |       Aw0[idx+1] = lat11 ; Bw0[idx+1] = k;
564 | 
565 |       Aw1[idx  ] = kT; Bw1[idx  ] = lat11;
566 |       Aw1[idx+1] = k;  Bw1[idx+1] = lat00;
567 | 
568 |       Cw[idx  ] = nn_sums01;
569 |       Cw[idx+1] = nn_sums10;
570 | 
571 |       idx += 2;
572 | 
573 |     }
574 |   }
575 | 
576 |   sync(nGPUs);
577 | 
578 |   float inv_temp = 1.0f / (alpha*TCRIT);
579 | 
580 |   // Warmup
581 |   printf("Starting warmup...\n");
582 |   for (int n = 0; n < nwarmup; n++) {
583 |     update(Ab0, Bb0, Ab1, Bb1, Cb, Aw0, Bw0, Aw1, Bw1, Cw,
584 |            lattice, inv_temp, nn_sums, cublas_handles, n+1, nbx, nby, seed, nGPUs);
585 |   }
586 | 
587 |   sync(nGPUs);
588 |   printf("Starting trial iterations...\n");
589 |   auto t0 = std::chrono::high_resolution_clock::now();
590 | 
591 |   for (int n = nwarmup; n < niter + nwarmup; n++) {
592 |     update(Ab0, Bb0, Ab1, Bb1, Cb, Aw0, Bw0, Aw1, Bw1, Cw,
593 |            lattice, inv_temp, nn_sums, cublas_handles, n+1, nbx, nby, seed, nGPUs);
594 |     if ((n - nwarmup) % 1000 == 0) printf("Completed %d/%d iterations...\n", n - nwarmup + 1, niter);
595 |   }
596 | 
597 |   sync(nGPUs);
598 |   auto t1 = std::chrono::high_resolution_clock::now();
599 | 
600 |   double duration = (double) std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
601 |   printf("REPORT:\n");
602 |   printf("\tnGPUs: %d\n", nGPUs);
603 |   printf("\ttemperature: %f * %f\n", alpha, TCRIT);
604 |   printf("\tseed: %llu\n", seed);
605 |   printf("\twarmup iterations: %d\n", nwarmup);
606 |   printf("\ttrial iterations: %d\n", niter);
607 |   printf("\tlattice dimensions: %lld x %lld\n", nx, ny);
608 |   printf("\telapsed time: %f sec\n", duration * 1e-6);
609 |   printf("\tupdates per ns: %f\n", (double) (nx * ny) * niter / duration * 1e-3);
610 | 
611 |   // Compute average magnetism
612 |   double* devsums;
613 |   int nchunks = (spinsPerGPU + CUB_CHUNK_SIZE - 1)/ CUB_CHUNK_SIZE;
614 |   CHECK_CUDA(cudaMallocManaged(&devsums, nGPUs * nchunks * sizeof(*devsums)));
615 |   for (int dev = 0 ; dev < nGPUs; dev++) {
616 |     CHECK_CUDA(cudaSetDevice(dev));
617 |     size_t cub_workspace_bytes = 0;
618 |     void* workspace = NULL;
619 | 
620 |     CHECK_CUDA(cub::DeviceReduce::Sum(workspace, cub_workspace_bytes, &lattice[dev * spinsPerGPU], &devsums[dev*nchunks], CUB_CHUNK_SIZE));
621 |     CHECK_CUDA(cudaMalloc(&workspace, cub_workspace_bytes));
622 | 
623 |     for (int n = 0; n < nchunks; n++) {
624 |       CHECK_CUDA(cub::DeviceReduce::Sum(workspace, cub_workspace_bytes, &lattice[dev * spinsPerGPU + n*CUB_CHUNK_SIZE],
625 |                              &devsums[dev * nchunks + n], std::min((long long) CUB_CHUNK_SIZE, spinsPerGPU - n * CUB_CHUNK_SIZE)));
626 |     }
627 |     CHECK_CUDA(cudaFree(workspace));
628 |   }
629 | 
630 |   sync(nGPUs);
631 | 
632 |   double hostsum = 0;
633 |   for (int n = 0; n < nGPUs * nchunks; n++) {
634 |     hostsum += devsums[n];
635 |   }
636 |   std::cout << "\taverage magnetism (absolute): " << abs(hostsum / (nx * ny)) << std::endl;
637 | 
638 |   CHECK_CUDA(cudaFree(devsums));
639 | 
640 |   if (write) write_lattice(lattice, "final.txt", nbx, nby, nGPUs);
641 | 
642 |   return 0;
643 | }
644 | 


--------------------------------------------------------------------------------
/tensorcore/plot_ising.py:
--------------------------------------------------------------------------------
 1 | import numpy as np  # parses the lattice text file into an array
 2 | import matplotlib.pyplot as plt  # renders the array as an image
 3 | # Script: load the lattice stored in "final.txt" and display it as a color-mapped image.
 4 | lattice = np.loadtxt("final.txt", dtype=np.int32)  # path is relative to the current working directory; values are read as int32. Presumably one text row per lattice row -- TODO confirm against the code that writes final.txt
 5 | plt.imshow(lattice)  # draw the 2D array, one cell per array element
 6 | plt.title('Final Lattice Configuration')  # caption shown above the image
 7 | plt.colorbar()  # legend mapping colors to the stored values
 8 | plt.show()  # display the figure
 9 | 
10 | 


--------------------------------------------------------------------------------
/tensorcore/sample_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ising-gpu/935796d7a26016670363af7a0dced8a9ebcd4714/tensorcore/sample_plot.png


--------------------------------------------------------------------------------