├── .gitignore
├── README.md
├── ep11-racecheck-analysis
│   ├── Makefile
│   └── conway.cu
├── ep17-openacc2-data
│   ├── LICENSE
│   ├── Makefile
│   ├── hotspot.c
│   └── orig_hotspot.c
├── ep19-nvvp-analysis
│   └── transpose.cu
├── ep2-first-cuda-c-program
│   └── kernel.cu
└── ep3-first-openacc-program
    ├── laplace2d.c
    └── timer.h

/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled Object files
2 | *.slo
3 | *.lo
4 | *.o
5 |
6 | # Compiled Dynamic libraries
7 | *.so
8 | *.dylib
9 |
10 | # Compiled Static libraries
11 | *.lai
12 | *.la
13 | *.a
14 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | cudacasts
2 | =========
3 |
4 | Source code from NVIDIA CUDACasts
5 |
--------------------------------------------------------------------------------
/ep11-racecheck-analysis/Makefile:
--------------------------------------------------------------------------------
1 |
2 | .PHONY: build clean clobber run
3 |
4 | all: build
5 |
6 | build: conway
7 |
8 | conway: conway.cu
9 | 	nvcc -I. -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=compute_30 -gencode arch=compute_35,code=sm_35 -lineinfo -O3 -o conway conway.cu
10 |
11 | clean:
12 | 	-rm conway
13 |
14 | clobber: clean
15 |
16 |
17 |
--------------------------------------------------------------------------------
/ep11-racecheck-analysis/conway.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #define CHECK(x) { \
4 |     cudaError_t result = x; \
5 |     if (result != cudaSuccess) { \
6 |         printf("%d:unexpected error:%s, expecting:%s\n", \
7 |                __LINE__, \
8 |                cudaGetErrorString(result), \
9 |                cudaGetErrorString(cudaSuccess)); \
10 |         exit(0); \
11 |     } \
12 | }
13 |
14 |
15 | #define ARRXY(arr,x,y) arr[(x) + ( (y) * (max_x) )]
16 | #define P_X(x) ((x + max_x - 1) % max_x)
17 | #define N_X(x) ((x + 1) % max_x)
18 | #define P_Y(y) ((y + max_y - 1) % max_y)
19 | #define N_Y(y) ((y + 1) % max_y)
20 |
21 | __host__ __device__ void
22 | printArray(char *arr, int max_y, int max_x)
23 | {
24 |     int x, y;
25 |
26 |     if (!arr)
27 |         return;
28 |
29 |     printf("\n");
30 |     for (y = max_y - 1; y >= 0; --y) {
31 |         for (x = 0; x < max_x; ++x) {
32 |             printf("%s", ARRXY(arr,x,y) ? "X":".");
33 |         }
34 |         printf("\n");
35 |     }
36 | }
37 |
38 | __device__ __forceinline__ int
39 | getNeighborCount(int base_offset, const int max_y, const int max_x, const int x, const int y)
40 | {
41 |     int nborcount = 0;
42 |     extern __shared__ char buf[];
43 |     char *cur;
44 |
45 |     cur = buf + base_offset;
46 |
47 |     nborcount += ARRXY(cur, P_X(x), y);
48 |     nborcount += ARRXY(cur, P_X(x), P_Y(y));
49 |     nborcount += ARRXY(cur, P_X(x), N_Y(y));
50 |
51 |     nborcount += ARRXY(cur, x, P_Y(y));
52 |     nborcount += ARRXY(cur, x, N_Y(y));
53 |
54 |     nborcount += ARRXY(cur, N_X(x), y);
55 |     nborcount += ARRXY(cur, N_X(x), P_Y(y));
56 |     nborcount += ARRXY(cur, N_X(x), N_Y(y));
57 |
58 |     return nborcount;
59 | }
60 |
61 | __device__ __forceinline__ void
62 | updateCell(int cur_offset, int next_offset, const int max_y, const int max_x, const int x, const int y, const int singlethread)
63 | {
64 |     int nborcount = 0;
65 |     extern __shared__ char buf[];
66 |     char *cur, *next;
67 |
68 |     cur = buf + cur_offset;
69 |     next = buf + next_offset;
70 |
71 |     nborcount = getNeighborCount(cur_offset, max_y, max_x, x, y);
72 |
73 |     // Compute the next in the next buffer
74 |     // 1.
Any live cell with <2 neighbors dies 75 | // 2. Any live cell with 2 || 3 neighbors lives 76 | // 3. Any live cell with >3 neigbors dies 77 | // 4. Any dead cell with =3 neighbors becomes alive 78 | 79 | if (ARRXY(cur,x, y) && 80 | (nborcount < 2 || nborcount > 3)) 81 | ARRXY(next, x,y) = 0; 82 | else if (!ARRXY(cur, x, y) && 83 | nborcount == 3) 84 | ARRXY(next, x, y) = 1; 85 | else 86 | ARRXY(next, x, y) = ARRXY(cur, x, y); 87 | } 88 | 89 | __global__ void 90 | gameLoop(char *raw_in, char *raw_out, const int max_y, const int max_x, int num_iter, int print_interval, int singlethread) 91 | { 92 | extern __shared__ char buf[]; 93 | char *cur, *next, *tmp; 94 | size_t arraysize = 0; 95 | int iter, x, y, i, j; 96 | 97 | // Sanity checks 98 | arraysize = (max_x) * (max_y); 99 | 100 | // Skip threads we dont care about 101 | if (singlethread) { 102 | if (threadIdx.x > 0 || threadIdx.y > 0) 103 | return; 104 | } 105 | 106 | if (threadIdx.x >= max_x) 107 | return; 108 | 109 | if (threadIdx.y >= max_y) 110 | return; 111 | 112 | cur = buf; 113 | next = (cur + arraysize); 114 | 115 | x = threadIdx.x + 0; 116 | y = threadIdx.y + 0; 117 | 118 | // Reset Shmem 119 | if (threadIdx.x == 0 && threadIdx.y == 0) { 120 | for (i = 0; i < max_x; ++i) { 121 | for (j = 0; j < max_y; ++j) { 122 | ARRXY(cur, i, j) = 0; 123 | ARRXY(next, i, j) = 0; 124 | } 125 | } 126 | } 127 | __syncthreads(); 128 | 129 | // Populate the shmem buffer 130 | if (singlethread) { 131 | for (x = 0; x < max_x; ++x) 132 | for (y = 0; y < max_y; ++y) 133 | ARRXY(cur, x, y) = ARRXY(raw_in, x, y); 134 | } 135 | else 136 | ARRXY(cur, x, y) = ARRXY(raw_in, threadIdx.x, threadIdx.y); 137 | 138 | __syncthreads(); 139 | 140 | // Start the iteration loop 141 | for (iter = 0; iter < num_iter; ++iter) { 142 | // Compute the neighbor count in the current state 143 | 144 | if (singlethread) { 145 | for (x = 0; x < max_x; ++x) 146 | for (y = 0; y < max_y; ++y) 147 | updateCell(cur - buf, next - buf, max_y, max_x, x, y, singlethread); 148 | 149 | if ((threadIdx.x == 0 && threadIdx.y == 0) && 150 | print_interval && 151 | !(iter % print_interval)) { 152 | printArray(cur, max_y, max_x); 153 | } 154 | } 155 | else { 156 | updateCell(cur - buf, next - buf, max_y, max_x, x, y, singlethread); 157 | 158 | if ((threadIdx.x == 0 && threadIdx.y == 0) && 159 | print_interval && 160 | !(iter % print_interval)) { 161 | printArray(cur, max_y, max_x); 162 | } 163 | } 164 | 165 | // Swap the next and current states : 166 | tmp = cur; 167 | cur = next; 168 | next = tmp; 169 | 170 | } 171 | 172 | // Copy data out 173 | if (singlethread) { 174 | for (x = 0; x < max_x; ++x) 175 | for (y = 0; y < max_y; ++y) 176 | ARRXY(raw_out,x, y) = ARRXY(cur, x, y); 177 | } 178 | else 179 | ARRXY(raw_out,threadIdx.x, threadIdx.y) = ARRXY(cur, x, y); 180 | } 181 | 182 | void 183 | initArray(char *arr, int max_y, int max_x, unsigned int seed, float bias) 184 | { 185 | int x, y; 186 | 187 | if (!arr) 188 | return; 189 | 190 | if (bias >= 1 || bias <= 0) 191 | return; 192 | 193 | for (y = 0; y < max_y; ++y) { 194 | for (x = 0; x < max_x; ++x) { 195 | ARRXY(arr,x,y) = (rand() >= (RAND_MAX * (bias ))) ? 
0 : 1; 196 | } 197 | } 198 | } 199 | 200 | bool 201 | compareArrays(char *arr1, char *arr2, int max_y, int max_x) 202 | { 203 | int x, y; 204 | for (y = 0; y < max_y; ++y) { 205 | for (x = 0; x < max_x; ++x) { 206 | if(ARRXY(arr1,x,y) != ARRXY(arr2,x,y)) { 207 | printf("Mismatch at x:%d, y:%d\n", x, y); 208 | return false; 209 | } 210 | } 211 | } 212 | return true; 213 | } 214 | 215 | float 216 | getLiveness(char *arr, int max_y, int max_x) 217 | { 218 | int size = max_y * max_x; 219 | int sum = 0; 220 | int x,y; 221 | 222 | for (y = 0; y < max_y; ++y) 223 | for (x = 0; x < max_x; ++x) 224 | sum += ARRXY(arr,x,y); 225 | 226 | return (1.0*sum)/size; 227 | } 228 | 229 | int 230 | main(int argc, char **argv) 231 | { 232 | char *array = NULL, *array2 = NULL; 233 | char *d_in, *d_out, *d_out2; 234 | bool mismatch = false; 235 | 236 | int max_x = (argc > 1) ? atol(argv[1]) : 7; 237 | int max_y = (argc > 2) ? atol(argv[2]) : 7; 238 | int dev_iter = (argc > 3) ? atol(argv[3]) : 10; 239 | int print_interval = (argc > 4) ? atol(argv[4]) : 0; 240 | float bias = (argc > 5) ? atof(argv[5]) : 1.0/3; 241 | unsigned int seed = (argc > 6) ? atol(argv[6]) : 129; 242 | 243 | float initial_liveness = 1.0; 244 | size_t bufsize = max_y * max_x * sizeof(char); 245 | size_t shmemsize = ((max_y)*(max_x)) * 2; 246 | 247 | array = (char*)calloc(1, bufsize); 248 | array2 = (char*)calloc(1, bufsize); 249 | if (!array || !array2) { 250 | printf("Failed to allocate memory\n"); 251 | return -1; 252 | } 253 | 254 | CHECK(cudaMalloc(&d_in, bufsize)); 255 | CHECK(cudaMalloc(&d_out, bufsize)); 256 | CHECK(cudaMalloc(&d_out2, bufsize)); 257 | 258 | printf (" Generating random array (%dx%d) Seed:%u Target Liveness:%f\n", 259 | max_y, max_x, seed, bias); 260 | initArray(array, max_y, max_x, seed, bias); 261 | initial_liveness = getLiveness(array, max_y, max_x); 262 | 263 | CHECK(cudaMemset(d_out, 0x0, bufsize)); 264 | CHECK(cudaMemset(d_out2, 0x0, bufsize)); 265 | 266 | dim3 threads(max_x, max_y, 1); 267 | 268 | CHECK(cudaMemcpy(d_in, array, bufsize, cudaMemcpyHostToDevice)); 269 | gameLoop<<<1, threads, shmemsize>>> (d_in, d_out, max_y, max_x, dev_iter, print_interval, 0); 270 | gameLoop<<<1, threads, shmemsize>>> (d_in, d_out2, max_y, max_x, dev_iter, print_interval, 1); 271 | CHECK(cudaMemcpy(array, d_out, bufsize, cudaMemcpyDeviceToHost)); 272 | CHECK(cudaMemcpy(array2, d_out2, bufsize, cudaMemcpyDeviceToHost)); 273 | 274 | printf(" Array %dx%d (Shmem :%u) Iterations: %u Initial Liveness:%f Final Liveness:%f\n", 275 | max_y, max_x, shmemsize, dev_iter, initial_liveness, getLiveness(array, max_y, max_x)); 276 | 277 | if (!compareArrays(array, array2, max_y, max_x)) { 278 | printf("Mismatch !!\n"); 279 | mismatch = true; 280 | } 281 | 282 | printf(" Final Array : "); 283 | printArray(array, max_y, max_x); 284 | if (mismatch) { 285 | printf(" \n Final Array : (single threaded)"); 286 | printArray(array2, max_y, max_x); 287 | } 288 | 289 | free(array); 290 | free(array2); 291 | CHECK(cudaFree(d_in)); 292 | CHECK(cudaFree(d_out)); 293 | CHECK(cudaFree(d_out2)); 294 | CHECK(cudaDeviceReset()); 295 | return 0; 296 | } 297 | -------------------------------------------------------------------------------- /ep17-openacc2-data/LICENSE: -------------------------------------------------------------------------------- 1 | LICENSE TERMS 2 | 3 | Copyright (c)2008-2011 University of Virginia 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without modification, are permitted without royalty fees or other restrictions, provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | * Neither the name of the University of Virginia, the Dept. of Computer Science, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF VIRGINIA OR THE SOFTWARE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 13 | 14 | If you use this software or a modified version of it, please cite the most relevant among the following papers: 15 | 16 | - M. A. Goodrum, M. J. Trotter, A. Aksel, S. T. Acton, and K. Skadron. Parallelization of Particle Filter Algorithms. In Proceedings 17 | of the 3rd Workshop on Emerging Applications and Many-core Architecture (EAMA), in conjunction with the IEEE/ACM International 18 | Symposium on Computer Architecture (ISCA), June 2010. 19 | 20 | - S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, Sang-Ha Lee and K. Skadron. 21 | "Rodinia: A Benchmark Suite for Heterogeneous Computing". IEEE International Symposium 22 | on Workload Characterization, Oct 2009. 23 | 24 | - J. Meng and K. Skadron. "Performance Modeling and Automatic Ghost Zone Optimization 25 | for Iterative Stencil Loops on GPUs." In Proceedings of the 23rd Annual ACM International 26 | Conference on Supercomputing (ICS), June 2009. 27 | 28 | - L.G. Szafaryn, K. Skadron and J. Saucerman. "Experiences Accelerating MATLAB Systems 29 | Biology Applications." in Workshop on Biomedicine in Computing (BiC) at the International 30 | Symposium on Computer Architecture (ISCA), June 2009. 31 | 32 | - M. Boyer, D. Tarjan, S. T. Acton, and K. Skadron. "Accelerating Leukocyte Tracking using CUDA: 33 | A Case Study in Leveraging Manycore Coprocessors." In Proceedings of the International Parallel 34 | and Distributed Processing Symposium (IPDPS), May 2009. 35 | 36 | - S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, and K. Skadron. "A Performance 37 | Study of General Purpose Applications on Graphics Processors using CUDA" Journal of 38 | Parallel and Distributed Computing, Elsevier, June 2008. 
39 | -------------------------------------------------------------------------------- /ep17-openacc2-data/Makefile: -------------------------------------------------------------------------------- 1 | # C compiler 2 | CC = pgcc 3 | OPT = -fast 4 | MP = -mp 5 | OMP_THRDS = 6 6 | ACC = -acc -Minfo=accel 7 | TIME = /usr/bin/time --verbose 8 | DIFF = diff --brief 9 | 10 | all: build run 11 | 12 | build: hotspot_omp hotspot_acc 13 | 14 | run: small medium large 15 | 16 | small: small_omp small_acc 17 | medium: medium_omp medium_acc 18 | large: large_omp large_acc 19 | 20 | hotspot_acc: hotspot.c 21 | $(CC) $(OPT) $(ACC) hotspot.c -o hotspot_acc 22 | 23 | hotspot_omp: hotspot.c 24 | $(CC) $(OPT) $(MP) hotspot.c -o hotspot_omp 25 | 26 | small_omp: hotspot_omp 27 | $(TIME) ./hotspot_omp 64 64 10000 $(OMP_THRDS) data/temp_64 data/power_64 > output_sm_omp.log 28 | $(DIFF) output_sm_omp.log data/sm_output.log 29 | 30 | medium_omp: hotspot_omp 31 | $(TIME) ./hotspot_omp 512 512 10000 $(OMP_THRDS) data/temp_512 data/power_512 > output_md_omp.log 32 | $(DIFF) output_md_omp.log data/md_output.log 33 | 34 | large_omp: hotspot_omp 35 | $(TIME) ./hotspot_omp 1024 1024 10000 $(OMP_THRDS) data/temp_1024 data/power_1024 > output_lg_omp.log 36 | $(DIFF) output_lg_omp.log data/lg_output.log 37 | 38 | small_acc: hotspot_acc 39 | $(TIME) ./hotspot_acc 64 64 10000 1 data/temp_64 data/power_64 > output_sm_acc.log 40 | $(DIFF) output_sm_acc.log data/sm_output.log 41 | 42 | medium_acc: hotspot_acc 43 | $(TIME) ./hotspot_acc 512 512 10000 1 data/temp_512 data/power_512 > output_md_acc.log 44 | $(DIFF) output_md_acc.log data/md_output.log 45 | 46 | large_acc: hotspot_acc 47 | $(TIME) ./hotspot_acc 1024 1024 10000 1 data/temp_1024 data/power_1024 > output_lg_acc.log 48 | $(DIFF) output_lg_acc.log data/lg_output.log 49 | 50 | clean: 51 | rm -f hotspot_acc hotspot_omp *.o *.log 52 | 53 | -------------------------------------------------------------------------------- /ep17-openacc2-data/hotspot.c: -------------------------------------------------------------------------------- 1 | /**************************************************************************** 2 | * This example of using OpenACC unstructured data regions was derived from 3 | * the Rodinia Hotspot benchmark. Please see the enclosed LICENSE file. 4 | * Copyright (c)2008-2011 University of Virginia 5 | * All rights reserved. 6 | ***************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #ifdef _OPENACC 13 | #include 14 | #endif 15 | #define STR_SIZE 256 16 | 17 | /* maximum power density possible (say 300W for a 10mm x 10mm chip) */ 18 | #define MAX_PD (3.0e6) 19 | /* required precision in degrees */ 20 | #define PRECISION 0.001 21 | #define SPEC_HEAT_SI 1.75e6 22 | #define K_SI 100 23 | /* capacitance fitting factor */ 24 | #define FACTOR_CHIP 0.5 25 | #define OUTPUT 26 | 27 | // global data 28 | double *temp, *power, *result; 29 | char *tfile, *pfile; 30 | 31 | /* chip parameters */ 32 | double t_chip = 0.0005; 33 | double chip_height = 0.016; 34 | double chip_width = 0.016; 35 | /* ambient temperature, assuming no package at all */ 36 | double amb_temp = 80.0; 37 | 38 | int num_omp_threads; 39 | 40 | /* Single iteration of the transient solver in the grid model. 
41 | * advances the solution of the discretized difference equations 42 | * by one time step 43 | */ 44 | void single_iteration(int row, int col, 45 | double Cap, double Rx, double Ry, double Rz, 46 | double step) 47 | { 48 | double delta; 49 | int r, c; 50 | 51 | #pragma acc declare deviceptr(result), present_or_copy(temp[0:row*col]), pcopyin(power[0:row*col]) 52 | #ifdef _OPENMP 53 | omp_set_num_threads(num_omp_threads); 54 | #pragma omp parallel for shared(power, temp,result) private(r, c, delta) firstprivate(row, col) schedule(static) 55 | #endif 56 | 57 | #pragma acc kernels loop independent 58 | for (r = 0; r < row; r++) { 59 | #pragma acc loop independent 60 | for (c = 0; c < col; c++) { 61 | /* Corner 1 */ 62 | if ( (r == 0) && (c == 0) ) { 63 | delta = (step / Cap) * (power[0] + 64 | (temp[1] - temp[0]) / Rx + 65 | (temp[col] - temp[0]) / Ry + 66 | (amb_temp - temp[0]) / Rz); 67 | } /* Corner 2 */ 68 | else if ((r == 0) && (c == col-1)) { 69 | delta = (step / Cap) * (power[c] + 70 | (temp[c-1] - temp[c]) / Rx + 71 | (temp[c+col] - temp[c]) / Ry + 72 | (amb_temp - temp[c]) / Rz); 73 | } /* Corner 3 */ 74 | else if ((r == row-1) && (c == col-1)) { 75 | delta = (step / Cap) * (power[r*col+c] + 76 | (temp[r*col+c-1] - temp[r*col+c]) / Rx + 77 | (temp[(r-1)*col+c] - temp[r*col+c]) / Ry + 78 | (amb_temp - temp[r*col+c]) / Rz); 79 | } /* Corner 4 */ 80 | else if ((r == row-1) && (c == 0)) { 81 | delta = (step / Cap) * (power[r*col] + 82 | (temp[r*col+1] - temp[r*col]) / Rx + 83 | (temp[(r-1)*col] - temp[r*col]) / Ry + 84 | (amb_temp - temp[r*col]) / Rz); 85 | } /* Edge 1 */ 86 | else if (r == 0) { 87 | delta = (step / Cap) * (power[c] + 88 | (temp[c+1] + temp[c-1] - 2.0*temp[c]) / Rx + 89 | (temp[col+c] - temp[c]) / Ry + 90 | (amb_temp - temp[c]) / Rz); 91 | } /* Edge 2 */ 92 | else if (c == col-1) { 93 | delta = (step / Cap) * (power[r*col+c] + 94 | (temp[(r+1)*col+c] + temp[(r-1)*col+c] - 2.0*temp[r*col+c]) / Ry + 95 | (temp[r*col+c-1] - temp[r*col+c]) / Rx + 96 | (amb_temp - temp[r*col+c]) / Rz); 97 | } /* Edge 3 */ 98 | else if (r == row-1) { 99 | delta = (step / Cap) * (power[r*col+c] + 100 | (temp[r*col+c+1] + temp[r*col+c-1] - 2.0*temp[r*col+c]) / Rx + 101 | (temp[(r-1)*col+c] - temp[r*col+c]) / Ry + 102 | (amb_temp - temp[r*col+c]) / Rz); 103 | } /* Edge 4 */ 104 | else if (c == 0) { 105 | delta = (step / Cap) * (power[r*col] + 106 | (temp[(r+1)*col] + temp[(r-1)*col] - 2.0*temp[r*col]) / Ry + 107 | (temp[r*col+1] - temp[r*col]) / Rx + 108 | (amb_temp - temp[r*col]) / Rz); 109 | } /* Inside the chip */ 110 | else { 111 | delta = (step / Cap) * (power[r*col+c] + 112 | (temp[(r+1)*col+c] + temp[(r-1)*col+c] - 2.0*temp[r*col+c]) / Ry + 113 | (temp[r*col+c+1] + temp[r*col+c-1] - 2.0*temp[r*col+c]) / Rx + 114 | (amb_temp - temp[r*col+c]) / Rz); 115 | } 116 | 117 | /* Update Temperatures */ 118 | result[r*col+c] =temp[r*col+c]+ delta; 119 | 120 | 121 | } 122 | } 123 | 124 | #ifdef _OPENMP 125 | omp_set_num_threads(num_omp_threads); 126 | #pragma omp parallel for shared(result, temp) private(r, c) schedule(static) 127 | #endif 128 | #pragma acc kernels loop independent 129 | for (r = 0; r < row; r++) { 130 | #pragma acc loop independent 131 | for (c = 0; c < col; c++) { 132 | temp[r*col+c]=result[r*col+c]; 133 | } 134 | } 135 | } 136 | 137 | /* Transient solver driver routine: simply converts the heat 138 | * transfer differential equations to difference equations 139 | * and solves the difference equations by iterating 140 | */ 141 | void compute_tran_temp(int num_iterations, int row, 
int col) 142 | { 143 | #ifdef VERBOSE 144 | int i = 0; 145 | #endif 146 | 147 | double grid_height = chip_height / row; 148 | double grid_width = chip_width / col; 149 | 150 | double Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height; 151 | double Rx = grid_width / (2.0 * K_SI * t_chip * grid_height); 152 | double Ry = grid_height / (2.0 * K_SI * t_chip * grid_width); 153 | double Rz = t_chip / (K_SI * grid_height * grid_width); 154 | 155 | double max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI); 156 | double step = PRECISION / max_slope; 157 | double t; 158 | 159 | #ifdef VERBOSE 160 | fprintf(stdout, "total iterations: %d s\tstep size: %g s\n", num_iterations, step); 161 | fprintf(stdout, "Rx: %g\tRy: %g\tRz: %g\tCap: %g\n", Rx, Ry, Rz, Cap); 162 | #endif 163 | 164 | for (int i = 0; i < num_iterations ; i++) 165 | { 166 | #ifdef VERBOSE 167 | fprintf(stdout, "iteration %d\n", i++); 168 | #endif 169 | single_iteration(row, col, Cap, Rx, Ry, Rz, step); 170 | } 171 | 172 | #ifdef VERBOSE 173 | fprintf(stdout, "iteration %d\n", i++); 174 | #endif 175 | } 176 | 177 | void fatal(char *s) 178 | { 179 | fprintf(stderr, "error: %s\n", s); 180 | exit(1); 181 | } 182 | 183 | void read_input(double *vect, int grid_rows, int grid_cols, char *file) 184 | { 185 | int i, index; 186 | FILE *fp; 187 | char str[STR_SIZE]; 188 | double val; 189 | 190 | fp = fopen (file, "r"); 191 | if (!fp) 192 | fatal ("file could not be opened for reading"); 193 | 194 | for (i=0; i < grid_rows * grid_cols; i++) { 195 | fgets(str, STR_SIZE, fp); 196 | if (feof(fp)) 197 | fatal("not enough lines in file"); 198 | if ((sscanf(str, "%lf", &val) != 1) ) 199 | fatal("invalid file format"); 200 | vect[i] = val; 201 | } 202 | 203 | fclose(fp); 204 | } 205 | 206 | void init_data(int grid_rows, int grid_cols) { 207 | 208 | /* allocate memory for the temperature and power arrays */ 209 | temp = (double *) calloc (grid_rows * grid_cols, sizeof(double)); 210 | power = (double *) calloc (grid_rows * grid_cols, sizeof(double)); 211 | #ifdef _OPENACC 212 | result = (double *) acc_malloc (grid_rows * grid_cols * sizeof(double)); 213 | #else 214 | result = (double *) malloc (grid_rows * grid_cols * sizeof(double)); 215 | #endif 216 | 217 | if(!temp || !power) 218 | fatal("unable to allocate memory"); 219 | 220 | /* read initial temperatures and input power */ 221 | read_input(temp, grid_rows, grid_cols, tfile); 222 | read_input(power, grid_rows, grid_cols, pfile); 223 | #pragma acc enter data copyin(temp[0:grid_rows*grid_cols],power[0:grid_rows*grid_cols]) 224 | } 225 | 226 | 227 | void usage(int argc, char **argv) 228 | { 229 | fprintf(stderr, "Usage: %s \n", argv[0]); 230 | fprintf(stderr, "\t - number of rows in the grid (positive integer)\n"); 231 | fprintf(stderr, "\t - number of columns in the grid (positive integer)\n"); 232 | fprintf(stderr, "\t - number of iterations\n"); 233 | fprintf(stderr, "\t - number of threads\n"); 234 | fprintf(stderr, "\t - name of the file containing the initial temperature values of each cell\n"); 235 | fprintf(stderr, "\t - name of the file containing the dissipated power values of each cell\n"); 236 | exit(1); 237 | } 238 | 239 | int main(int argc, char **argv) 240 | { 241 | int grid_rows, grid_cols, sim_time, i; 242 | int size; 243 | 244 | /* check validity of inputs */ 245 | if (argc != 7) 246 | usage(argc, argv); 247 | if ((grid_rows = atoi(argv[1])) <= 0 || 248 | (grid_cols = atoi(argv[2])) <= 0 || 249 | (sim_time = atoi(argv[3])) <= 0 || 250 | (num_omp_threads = 
atoi(argv[4])) <= 0 251 | ) 252 | usage(argc, argv); 253 | tfile = argv[5]; 254 | pfile = argv[6]; 255 | 256 | size = grid_rows*grid_cols; 257 | init_data(grid_rows, grid_cols); 258 | printf("Start computing the transient temperature\n"); 259 | compute_tran_temp(sim_time, grid_rows, grid_cols); 260 | printf("Ending simulation\n"); 261 | /* output results */ 262 | #ifdef VERBOSE 263 | fprintf(stdout, "Final Temperatures:\n"); 264 | #endif 265 | 266 | #ifdef OUTPUT 267 | #pragma acc update host(temp[0:size]) 268 | for(i=0; i < grid_rows * grid_cols; i++) 269 | fprintf(stdout, "%d\t%g\n", i, temp[i]); 270 | #endif 271 | 272 | /* cleanup */ 273 | #pragma acc exit data delete(temp[0:size],power[0:size]) 274 | free(temp); 275 | free(power); 276 | #ifdef _OPENACC 277 | acc_free(result); 278 | #else 279 | free(result); 280 | #endif 281 | return 0; 282 | } 283 | 284 | 285 | -------------------------------------------------------------------------------- /ep17-openacc2-data/orig_hotspot.c: -------------------------------------------------------------------------------- 1 | /**************************************************************************** 2 | * This example of using OpenACC unstructured data regions was derived from 3 | * the Rodinia Hotspot benchmark. Please see the enclosed LICENSE file. 4 | * Copyright (c)2008-2011 University of Virginia 5 | * All rights reserved. 6 | ***************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #ifdef _OPENACC 13 | #include 14 | #endif 15 | #define STR_SIZE 256 16 | 17 | /* maximum power density possible (say 300W for a 10mm x 10mm chip) */ 18 | #define MAX_PD (3.0e6) 19 | /* required precision in degrees */ 20 | #define PRECISION 0.001 21 | #define SPEC_HEAT_SI 1.75e6 22 | #define K_SI 100 23 | /* capacitance fitting factor */ 24 | #define FACTOR_CHIP 0.5 25 | #define OUTPUT 26 | 27 | // global data 28 | double *temp, *power, *result; 29 | char *tfile, *pfile; 30 | 31 | /* chip parameters */ 32 | double t_chip = 0.0005; 33 | double chip_height = 0.016; 34 | double chip_width = 0.016; 35 | /* ambient temperature, assuming no package at all */ 36 | double amb_temp = 80.0; 37 | 38 | int num_omp_threads; 39 | 40 | /* Single iteration of the transient solver in the grid model. 
41 | * advances the solution of the discretized difference equations 42 | * by one time step 43 | */ 44 | void single_iteration(int row, int col, 45 | double Cap, double Rx, double Ry, double Rz, 46 | double step) 47 | { 48 | double delta; 49 | int r, c; 50 | 51 | #ifdef _OPENMP 52 | omp_set_num_threads(num_omp_threads); 53 | #pragma omp parallel for shared(power, temp,result) private(r, c, delta) firstprivate(row, col) schedule(static) 54 | #endif 55 | 56 | #pragma acc kernels loop independent 57 | for (r = 0; r < row; r++) { 58 | #pragma acc loop independent 59 | for (c = 0; c < col; c++) { 60 | /* Corner 1 */ 61 | if ( (r == 0) && (c == 0) ) { 62 | delta = (step / Cap) * (power[0] + 63 | (temp[1] - temp[0]) / Rx + 64 | (temp[col] - temp[0]) / Ry + 65 | (amb_temp - temp[0]) / Rz); 66 | } /* Corner 2 */ 67 | else if ((r == 0) && (c == col-1)) { 68 | delta = (step / Cap) * (power[c] + 69 | (temp[c-1] - temp[c]) / Rx + 70 | (temp[c+col] - temp[c]) / Ry + 71 | (amb_temp - temp[c]) / Rz); 72 | } /* Corner 3 */ 73 | else if ((r == row-1) && (c == col-1)) { 74 | delta = (step / Cap) * (power[r*col+c] + 75 | (temp[r*col+c-1] - temp[r*col+c]) / Rx + 76 | (temp[(r-1)*col+c] - temp[r*col+c]) / Ry + 77 | (amb_temp - temp[r*col+c]) / Rz); 78 | } /* Corner 4 */ 79 | else if ((r == row-1) && (c == 0)) { 80 | delta = (step / Cap) * (power[r*col] + 81 | (temp[r*col+1] - temp[r*col]) / Rx + 82 | (temp[(r-1)*col] - temp[r*col]) / Ry + 83 | (amb_temp - temp[r*col]) / Rz); 84 | } /* Edge 1 */ 85 | else if (r == 0) { 86 | delta = (step / Cap) * (power[c] + 87 | (temp[c+1] + temp[c-1] - 2.0*temp[c]) / Rx + 88 | (temp[col+c] - temp[c]) / Ry + 89 | (amb_temp - temp[c]) / Rz); 90 | } /* Edge 2 */ 91 | else if (c == col-1) { 92 | delta = (step / Cap) * (power[r*col+c] + 93 | (temp[(r+1)*col+c] + temp[(r-1)*col+c] - 2.0*temp[r*col+c]) / Ry + 94 | (temp[r*col+c-1] - temp[r*col+c]) / Rx + 95 | (amb_temp - temp[r*col+c]) / Rz); 96 | } /* Edge 3 */ 97 | else if (r == row-1) { 98 | delta = (step / Cap) * (power[r*col+c] + 99 | (temp[r*col+c+1] + temp[r*col+c-1] - 2.0*temp[r*col+c]) / Rx + 100 | (temp[(r-1)*col+c] - temp[r*col+c]) / Ry + 101 | (amb_temp - temp[r*col+c]) / Rz); 102 | } /* Edge 4 */ 103 | else if (c == 0) { 104 | delta = (step / Cap) * (power[r*col] + 105 | (temp[(r+1)*col] + temp[(r-1)*col] - 2.0*temp[r*col]) / Ry + 106 | (temp[r*col+1] - temp[r*col]) / Rx + 107 | (amb_temp - temp[r*col]) / Rz); 108 | } /* Inside the chip */ 109 | else { 110 | delta = (step / Cap) * (power[r*col+c] + 111 | (temp[(r+1)*col+c] + temp[(r-1)*col+c] - 2.0*temp[r*col+c]) / Ry + 112 | (temp[r*col+c+1] + temp[r*col+c-1] - 2.0*temp[r*col+c]) / Rx + 113 | (amb_temp - temp[r*col+c]) / Rz); 114 | } 115 | 116 | /* Update Temperatures */ 117 | result[r*col+c] =temp[r*col+c]+ delta; 118 | 119 | 120 | } 121 | } 122 | 123 | #ifdef _OPENMP 124 | omp_set_num_threads(num_omp_threads); 125 | #pragma omp parallel for shared(result, temp) private(r, c) schedule(static) 126 | #endif 127 | #pragma acc kernels loop independent 128 | for (r = 0; r < row; r++) { 129 | #pragma acc loop independent 130 | for (c = 0; c < col; c++) { 131 | temp[r*col+c]=result[r*col+c]; 132 | } 133 | } 134 | } 135 | 136 | /* Transient solver driver routine: simply converts the heat 137 | * transfer differential equations to difference equations 138 | * and solves the difference equations by iterating 139 | */ 140 | void compute_tran_temp(int num_iterations, int row, int col) 141 | { 142 | #ifdef VERBOSE 143 | int i = 0; 144 | #endif 145 | 146 | double grid_height = 
chip_height / row; 147 | double grid_width = chip_width / col; 148 | 149 | double Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height; 150 | double Rx = grid_width / (2.0 * K_SI * t_chip * grid_height); 151 | double Ry = grid_height / (2.0 * K_SI * t_chip * grid_width); 152 | double Rz = t_chip / (K_SI * grid_height * grid_width); 153 | 154 | double max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI); 155 | double step = PRECISION / max_slope; 156 | double t; 157 | 158 | #ifdef VERBOSE 159 | fprintf(stdout, "total iterations: %d s\tstep size: %g s\n", num_iterations, step); 160 | fprintf(stdout, "Rx: %g\tRy: %g\tRz: %g\tCap: %g\n", Rx, Ry, Rz, Cap); 161 | #endif 162 | 163 | for (int i = 0; i < num_iterations ; i++) 164 | { 165 | #ifdef VERBOSE 166 | fprintf(stdout, "iteration %d\n", i++); 167 | #endif 168 | single_iteration(row, col, Cap, Rx, Ry, Rz, step); 169 | } 170 | 171 | #ifdef VERBOSE 172 | fprintf(stdout, "iteration %d\n", i++); 173 | #endif 174 | } 175 | 176 | void fatal(char *s) 177 | { 178 | fprintf(stderr, "error: %s\n", s); 179 | exit(1); 180 | } 181 | 182 | void read_input(double *vect, int grid_rows, int grid_cols, char *file) 183 | { 184 | int i, index; 185 | FILE *fp; 186 | char str[STR_SIZE]; 187 | double val; 188 | 189 | fp = fopen (file, "r"); 190 | if (!fp) 191 | fatal ("file could not be opened for reading"); 192 | 193 | for (i=0; i < grid_rows * grid_cols; i++) { 194 | fgets(str, STR_SIZE, fp); 195 | if (feof(fp)) 196 | fatal("not enough lines in file"); 197 | if ((sscanf(str, "%lf", &val) != 1) ) 198 | fatal("invalid file format"); 199 | vect[i] = val; 200 | } 201 | 202 | fclose(fp); 203 | } 204 | 205 | void init_data(int grid_rows, int grid_cols) { 206 | 207 | /* allocate memory for the temperature and power arrays */ 208 | temp = (double *) calloc (grid_rows * grid_cols, sizeof(double)); 209 | power = (double *) calloc (grid_rows * grid_cols, sizeof(double)); 210 | result = (double *) malloc (grid_rows * grid_cols * sizeof(double)); 211 | 212 | if(!temp || !power) 213 | fatal("unable to allocate memory"); 214 | 215 | /* read initial temperatures and input power */ 216 | read_input(temp, grid_rows, grid_cols, tfile); 217 | read_input(power, grid_rows, grid_cols, pfile); 218 | 219 | } 220 | 221 | 222 | void usage(int argc, char **argv) 223 | { 224 | fprintf(stderr, "Usage: %s \n", argv[0]); 225 | fprintf(stderr, "\t - number of rows in the grid (positive integer)\n"); 226 | fprintf(stderr, "\t - number of columns in the grid (positive integer)\n"); 227 | fprintf(stderr, "\t - number of iterations\n"); 228 | fprintf(stderr, "\t - number of threads\n"); 229 | fprintf(stderr, "\t - name of the file containing the initial temperature values of each cell\n"); 230 | fprintf(stderr, "\t - name of the file containing the dissipated power values of each cell\n"); 231 | exit(1); 232 | } 233 | 234 | int main(int argc, char **argv) 235 | { 236 | int grid_rows, grid_cols, sim_time, i; 237 | int size; 238 | 239 | /* check validity of inputs */ 240 | if (argc != 7) 241 | usage(argc, argv); 242 | if ((grid_rows = atoi(argv[1])) <= 0 || 243 | (grid_cols = atoi(argv[2])) <= 0 || 244 | (sim_time = atoi(argv[3])) <= 0 || 245 | (num_omp_threads = atoi(argv[4])) <= 0 246 | ) 247 | usage(argc, argv); 248 | tfile = argv[5]; 249 | pfile = argv[6]; 250 | 251 | size = grid_rows*grid_cols; 252 | init_data(grid_rows, grid_cols); 253 | printf("Start computing the transient temperature\n"); 254 | compute_tran_temp(sim_time, grid_rows, grid_cols); 255 | printf("Ending 
simulation\n"); 256 | /* output results */ 257 | #ifdef VERBOSE 258 | fprintf(stdout, "Final Temperatures:\n"); 259 | #endif 260 | 261 | #ifdef OUTPUT 262 | #pragma acc update host(temp[0:size]) 263 | for(i=0; i < grid_rows * grid_cols; i++) 264 | fprintf(stdout, "%d\t%g\n", i, temp[i]); 265 | #endif 266 | 267 | /* cleanup */ 268 | free(temp); 269 | free(power); 270 | free(result); 271 | return 0; 272 | } 273 | 274 | 275 | -------------------------------------------------------------------------------- /ep19-nvvp-analysis/transpose.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2012 NVIDIA Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | 18 | // Convenience function for checking CUDA runtime API results 19 | // can be wrapped around any runtime API call. No-op in release builds. 20 | inline 21 | cudaError_t checkCuda(cudaError_t result) 22 | { 23 | #if defined(DEBUG) || defined(_DEBUG) 24 | if (result != cudaSuccess) { 25 | fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); 26 | assert(result == cudaSuccess); 27 | } 28 | #endif 29 | return result; 30 | } 31 | 32 | const int TILE_DIM = 8; 33 | const int BLOCK_ROWS = 4; 34 | const int NUM_REPS = 100; 35 | 36 | // Check errors and print GB/s 37 | void postprocess(const float *ref, const float *res, int n, float ms) 38 | { 39 | bool passed = true; 40 | for (int i = 0; i < n; i++) 41 | if (res[i] != ref[i]) { 42 | printf("%d %f %f\n", i, res[i], ref[i]); 43 | printf("%25s\n", "*** FAILED ***"); 44 | passed = false; 45 | break; 46 | } 47 | if (passed) 48 | printf("%20.2f\n", 2 * n * sizeof(float) * 1e-6 * NUM_REPS / ms ); 49 | } 50 | 51 | // simple copy kernel 52 | // Used as reference case representing best effective bandwidth. 53 | __global__ void copy(float *odata, const float *idata) 54 | { 55 | int x = blockIdx.x * TILE_DIM + threadIdx.x; 56 | int y = blockIdx.y * TILE_DIM + threadIdx.y; 57 | int width = gridDim.x * TILE_DIM; 58 | 59 | for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS) 60 | odata[(y+j)*width + x] = idata[(y+j)*width + x]; 61 | } 62 | 63 | // copy kernel using shared memory 64 | // Also used as reference case, demonstrating effect of using shared memory. 65 | __global__ void copySharedMem(float *odata, const float *idata) 66 | { 67 | __shared__ float tile[TILE_DIM * TILE_DIM]; 68 | 69 | int x = blockIdx.x * TILE_DIM + threadIdx.x; 70 | int y = blockIdx.y * TILE_DIM + threadIdx.y; 71 | int width = gridDim.x * TILE_DIM; 72 | 73 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 74 | tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x] = idata[(y+j)*width + x]; 75 | 76 | __syncthreads(); 77 | 78 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 79 | odata[(y+j)*width + x] = tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x]; 80 | } 81 | 82 | // naive transpose 83 | // Simplest transpose; doesn't use shared memory. 84 | // Global memory reads are coalesced but writes are not. 
85 | __global__ void transposeNaive(float *odata, const float *idata) 86 | { 87 | int x = blockIdx.x * TILE_DIM + threadIdx.x; 88 | int y = blockIdx.y * TILE_DIM + threadIdx.y; 89 | int width = gridDim.x * TILE_DIM; 90 | 91 | for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS) 92 | odata[x*width + (y+j)] = idata[(y+j)*width + x]; 93 | } 94 | 95 | // coalesced transpose 96 | // Uses shared memory to achieve coalesing in both reads and writes 97 | // Tile width == #banks causes shared memory bank conflicts. 98 | __global__ void transposeCoalesced(float *odata, const float *idata) 99 | { 100 | __shared__ float tile[TILE_DIM][TILE_DIM]; 101 | 102 | int x = blockIdx.x * TILE_DIM + threadIdx.x; 103 | int y = blockIdx.y * TILE_DIM + threadIdx.y; 104 | int width = gridDim.x * TILE_DIM; 105 | 106 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 107 | tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; 108 | 109 | __syncthreads(); 110 | 111 | x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset 112 | y = blockIdx.x * TILE_DIM + threadIdx.y; 113 | 114 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 115 | odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; 116 | } 117 | 118 | 119 | // No bank-conflict transpose 120 | // Same as transposeCoalesced except the first tile dimension is padded 121 | // to avoid shared memory bank conflicts. 122 | __global__ void transposeNoBankConflicts(float *odata, const float *idata) 123 | { 124 | __shared__ float tile[TILE_DIM][TILE_DIM+1]; 125 | 126 | int x = blockIdx.x * TILE_DIM + threadIdx.x; 127 | int y = blockIdx.y * TILE_DIM + threadIdx.y; 128 | int width = gridDim.x * TILE_DIM; 129 | 130 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 131 | tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; 132 | 133 | __syncthreads(); 134 | 135 | x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset 136 | y = blockIdx.x * TILE_DIM + threadIdx.y; 137 | 138 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 139 | odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; 140 | } 141 | 142 | int main(int argc, char **argv) 143 | { 144 | const int nx = 1024; 145 | const int ny = 1024; 146 | const int mem_size = nx*ny*sizeof(float); 147 | 148 | dim3 dimGrid(nx/TILE_DIM, ny/TILE_DIM, 1); 149 | dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); 150 | 151 | int devId = 0; 152 | if (argc > 1) devId = atoi(argv[1]); 153 | 154 | cudaDeviceProp prop; 155 | checkCuda( cudaGetDeviceProperties(&prop, devId)); 156 | printf("\nDevice : %s\n", prop.name); 157 | printf("Matrix size: %d %d, Block size: %d %d, Tile size: %d %d\n", 158 | nx, ny, TILE_DIM, BLOCK_ROWS, TILE_DIM, TILE_DIM); 159 | printf("dimGrid: %d %d %d. 
dimBlock: %d %d %d\n",
160 |          dimGrid.x, dimGrid.y, dimGrid.z, dimBlock.x, dimBlock.y, dimBlock.z);
161 |
162 |   checkCuda( cudaSetDevice(devId) );
163 |
164 |   float *h_idata = (float*)malloc(mem_size);
165 |   float *h_cdata = (float*)malloc(mem_size);
166 |   float *h_tdata = (float*)malloc(mem_size);
167 |   float *gold = (float*)malloc(mem_size);
168 |
169 |   float *d_idata, *d_cdata, *d_tdata;
170 |   checkCuda( cudaMalloc(&d_idata, mem_size) );
171 |   checkCuda( cudaMalloc(&d_cdata, mem_size) );
172 |   checkCuda( cudaMalloc(&d_tdata, mem_size) );
173 |
174 |   // check parameters and calculate execution configuration
175 |   if (nx % TILE_DIM || ny % TILE_DIM) {
176 |     printf("nx and ny must be a multiple of TILE_DIM\n");
177 |     goto error_exit;
178 |   }
179 |
180 |   /*if (TILE_DIM % BLOCK_ROWS) {
181 |     printf("TILE_DIM must be a multiple of BLOCK_ROWS\n");
182 |     goto error_exit;
183 |   }*/
184 |
185 |   // host
186 |   for (int j = 0; j < ny; j++)
187 |     for (int i = 0; i < nx; i++)
188 |       h_idata[j*nx + i] = j*nx + i;
189 |
190 |   // correct result for error checking
191 |   for (int j = 0; j < ny; j++)
192 |     for (int i = 0; i < nx; i++)
193 |       gold[j*nx + i] = h_idata[i*nx + j];
194 |
195 |   // device
196 |   checkCuda( cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice) );
197 |
198 |   // events for timing
199 |   cudaEvent_t startEvent, stopEvent;
200 |   checkCuda( cudaEventCreate(&startEvent) );
201 |   checkCuda( cudaEventCreate(&stopEvent) );
202 |   float ms;
203 |
204 |   // ------------
205 |   // time kernels
206 |   // ------------
207 |   printf("%25s%25s\n", "Routine", "Bandwidth (GB/s)");
208 |
209 |   /* // ----
210 |   // copy
211 |   // ----
212 |   printf("%25s", "copy");
213 |   checkCuda( cudaMemset(d_cdata, 0, mem_size) );
214 |   // warm up
215 |   copy<<<dimGrid, dimBlock>>>(d_cdata, d_idata);
216 |   checkCuda( cudaEventRecord(startEvent, 0) );
217 |   for (int i = 0; i < NUM_REPS; i++)
218 |     copy<<<dimGrid, dimBlock>>>(d_cdata, d_idata);
219 |   checkCuda( cudaEventRecord(stopEvent, 0) );
220 |   checkCuda( cudaEventSynchronize(stopEvent) );
221 |   checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
222 |   checkCuda( cudaMemcpy(h_cdata, d_cdata, mem_size, cudaMemcpyDeviceToHost) );
223 |   postprocess(h_idata, h_cdata, nx*ny, ms);
224 |
225 |   // -------------
226 |   // copySharedMem
227 |   // -------------
228 |   printf("%25s", "shared memory copy");
229 |   checkCuda( cudaMemset(d_cdata, 0, mem_size) );
230 |   // warm up
231 |   copySharedMem<<<dimGrid, dimBlock>>>(d_cdata, d_idata);
232 |   checkCuda( cudaEventRecord(startEvent, 0) );
233 |   for (int i = 0; i < NUM_REPS; i++)
234 |     copySharedMem<<<dimGrid, dimBlock>>>(d_cdata, d_idata);
235 |   checkCuda( cudaEventRecord(stopEvent, 0) );
236 |   checkCuda( cudaEventSynchronize(stopEvent) );
237 |   checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
238 |   checkCuda( cudaMemcpy(h_cdata, d_cdata, mem_size, cudaMemcpyDeviceToHost) );
239 |   postprocess(h_idata, h_cdata, nx * ny, ms);
240 |   */
241 |   // --------------
242 |   // transposeNaive
243 |   // --------------
244 |   printf("%25s", "naive transpose");
245 |   checkCuda( cudaMemset(d_tdata, 0, mem_size) );
246 |   // warmup
247 |   transposeNaive<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
248 |   checkCuda( cudaEventRecord(startEvent, 0) );
249 |   for (int i = 0; i < NUM_REPS; i++)
250 |     transposeNaive<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
251 |   checkCuda( cudaEventRecord(stopEvent, 0) );
252 |   checkCuda( cudaEventSynchronize(stopEvent) );
253 |   checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
254 |   checkCuda( cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost) );
255 |   postprocess(gold, h_tdata, nx * ny, ms);
256 |
257 |   /* // ------------------
258 |   // transposeCoalesced
259 |   // ------------------
260 |   printf("%25s", "coalesced transpose");
261 |   checkCuda( cudaMemset(d_tdata, 0, mem_size) );
262 |   // warmup
263 |   transposeCoalesced<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
264 |   checkCuda( cudaEventRecord(startEvent, 0) );
265 |   for (int i = 0; i < NUM_REPS; i++)
266 |     transposeCoalesced<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
267 |   checkCuda( cudaEventRecord(stopEvent, 0) );
268 |   checkCuda( cudaEventSynchronize(stopEvent) );
269 |   checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
270 |   checkCuda( cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost) );
271 |   postprocess(gold, h_tdata, nx * ny, ms);
272 |
273 |   // ------------------------
274 |   // transposeNoBankConflicts
275 |   // ------------------------
276 |   printf("%25s", "conflict-free transpose");
277 |   checkCuda( cudaMemset(d_tdata, 0, mem_size) );
278 |   // warmup
279 |   transposeNoBankConflicts<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
280 |   checkCuda( cudaEventRecord(startEvent, 0) );
281 |   for (int i = 0; i < NUM_REPS; i++)
282 |     transposeNoBankConflicts<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
283 |   checkCuda( cudaEventRecord(stopEvent, 0) );
284 |   checkCuda( cudaEventSynchronize(stopEvent) );
285 |   checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
286 |   checkCuda( cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost) );
287 |   postprocess(gold, h_tdata, nx * ny, ms);*/
288 |
289 | error_exit:
290 |   // cleanup
291 |   checkCuda( cudaEventDestroy(startEvent) );
292 |   checkCuda( cudaEventDestroy(stopEvent) );
293 |   checkCuda( cudaFree(d_tdata) );
294 |   checkCuda( cudaFree(d_cdata) );
295 |   checkCuda( cudaFree(d_idata) );
296 |   free(h_idata);
297 |   free(h_tdata);
298 |   free(h_cdata);
299 |   free(gold);
300 | }
301 |
302 |
--------------------------------------------------------------------------------
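The number postprocess() prints for each routine in transpose.cu is effective bandwidth: every launch reads and writes nx*ny floats, so 2 * n * sizeof(float) bytes move per launch, averaged over NUM_REPS timed launches. A minimal host-side sketch of that same calculation follows; the helper name effectiveBandwidthGBs is illustrative and not part of the repository.

    // Sketch only: effective bandwidth in GB/s for a kernel that moves
    // 2 * n floats (one read plus one write per element) per launch.
    // ms is the cudaEventElapsedTime measured across `reps` launches.
    static float effectiveBandwidthGBs(int n, int reps, float ms)
    {
        double bytesPerLaunch = 2.0 * n * sizeof(float);     // read + write
        // bytes * 1e-6 gives MB; MB per millisecond equals GB per second
        return (float)(bytesPerLaunch * 1e-6 * reps / ms);
    }

With nx = ny = 1024 and NUM_REPS = 100 as set above, this expression matches the one used in postprocess().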
/ep2-first-cuda-c-program/kernel.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #define SIZE 1024
4 |
5 | __global__ void VectorAdd(int *a, int *b, int *c, int n)
6 | {
7 |     int i = threadIdx.x;
8 |
9 |     if (i < n)
10 |         c[i] = a[i] + b[i];
11 | }
12 |
13 | int main()
14 | {
15 |     int *a, *b, *c;
16 |     int *d_a, *d_b, *d_c;
17 |
18 |     a = (int *)malloc(SIZE*sizeof(int));
19 |     b = (int *)malloc(SIZE*sizeof(int));
20 |     c = (int *)malloc(SIZE*sizeof(int));
21 |
22 |     cudaMalloc( &d_a, SIZE*sizeof(int));
23 |     cudaMalloc( &d_b, SIZE*sizeof(int));
24 |     cudaMalloc( &d_c, SIZE*sizeof(int));
25 |
26 |     for( int i = 0; i < SIZE; ++i )
27 |     {
28 |         a[i] = i;
29 |         b[i] = i;
30 |         c[i] = 0;
31 |     }
32 |
33 |     cudaMemcpy( d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice );
34 |     cudaMemcpy( d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice );
35 |     cudaMemcpy( d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice );
36 |
37 |     VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);
38 |
39 |     cudaMemcpy( c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost );
40 |
41 |     for( int i = 0; i < 10; ++i)
42 |         printf("c[%d] = %d\n", i, c[i]);
43 |
44 |     free(a);
45 |     free(b);
46 |     free(c);
47 |
48 |     cudaFree(d_a);
49 |     cudaFree(d_b);
50 |     cudaFree(d_c);
51 |
52 |     return 0;
53 | }
--------------------------------------------------------------------------------
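kernel.cu launches a single block of SIZE threads (VectorAdd<<< 1, SIZE >>>), which only works while SIZE stays within the per-block thread limit. A hedged sketch of how the same vector add is usually generalized to a multi-block launch; the names VectorAddGrid, threadsPerBlock, and blocksPerGrid are illustrative, not part of the repository.

    // Sketch: grid-wide version of the same element-wise add.
    __global__ void VectorAddGrid(int *a, int *b, int *c, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;  // global element index
        if (i < n)                                      // guard the final partial block
            c[i] = a[i] + b[i];
    }

    // Launch with enough blocks to cover n elements:
    //   int threadsPerBlock = 256;
    //   int blocksPerGrid   = (n + threadsPerBlock - 1) / threadsPerBlock;
    //   VectorAddGrid<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);

The rounded-up division simply ensures the grid covers n even when n is not a multiple of the block size.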
/ep3-first-openacc-program/laplace2d.c:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright 2012 NVIDIA Corporation
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | #include <math.h>
18 | #include <string.h>
19 | #include "timer.h"
20 |
21 | #define NN 4096
22 | #define NM 4096
23 |
24 | double A[NN][NM];
25 | double Anew[NN][NM];
26 |
27 | int main(int argc, char** argv)
28 | {
29 |     const int n = NN;
30 |     const int m = NM;
31 |     const int iter_max = 1000;
32 |
33 |     const double tol = 1.0e-6;
34 |     double error = 1.0;
35 |
36 |     memset(A, 0, n * m * sizeof(double));
37 |     memset(Anew, 0, n * m * sizeof(double));
38 |
39 |     for (int j = 0; j < n; j++)
40 |     {
41 |         A[j][0] = 1.0;
42 |         Anew[j][0] = 1.0;
43 |     }
44 |
45 |     printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m);
46 |
47 |     StartTimer();
48 |     int iter = 0;
49 |
50 | #pragma acc data copy(A), create(Anew)
51 |     while ( error > tol && iter < iter_max )
52 |     {
53 |         error = 0.0;
54 |
55 | #pragma omp parallel for shared(m, n, Anew, A)
56 | #pragma acc kernels
57 |         for( int j = 1; j < n-1; j++)
58 |         {
59 |             for( int i = 1; i < m-1; i++ )
60 |             {
61 |                 Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
62 |                                     + A[j-1][i] + A[j+1][i]);
63 |                 error = fmax( error, fabs(Anew[j][i] - A[j][i]));
64 |             }
65 |         }
66 |
67 | #pragma omp parallel for shared(m, n, Anew, A)
68 | #pragma acc kernels
69 |         for( int j = 1; j < n-1; j++)
70 |         {
71 |             for( int i = 1; i < m-1; i++ )
72 |             {
73 |                 A[j][i] = Anew[j][i];
74 |             }
75 |         }
76 |
77 |         if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error);
78 |
79 |         iter++;
80 |     }
81 |
82 |     double runtime = GetTimer();
83 |
84 |     printf(" total: %f s\n", runtime / 1000);
85 | }
86 |
--------------------------------------------------------------------------------
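In laplace2d.c, the "#pragma acc data copy(A), create(Anew)" region is what keeps both arrays resident on the device for the whole while loop; without it, each "#pragma acc kernels" region would transfer the arrays on every iteration. A small stand-alone sketch of the same structured data-plus-kernels pattern on a simpler computation; the function scale_and_add and its variables are illustrative, not part of the repository.

    // Sketch, assuming the same structured-data-region idiom as laplace2d.c:
    // x and y are copied to the device once and reused by both kernels regions.
    void scale_and_add(int n, float a, const float *restrict x, float *restrict y)
    {
        #pragma acc data copyin(x[0:n]) copy(y[0:n])
        {
            #pragma acc kernels
            for (int i = 0; i < n; i++)
                y[i] = a * x[i] + y[i];   // first pass: data moved in before this

            #pragma acc kernels
            for (int i = 0; i < n; i++)
                y[i] = a * x[i] + y[i];   // second pass: reuses the device copies
        }   // y copied back to the host only here, at the end of the data region
    }

The point of the pattern is the same as in laplace2d.c: the compute regions inside the data region see device-resident data, and host/device traffic happens only at the region boundaries.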
/ep3-first-openacc-program/timer.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright 2012 NVIDIA Corporation
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | #ifndef TIMER_H
18 | #define TIMER_H
19 |
20 | #include <stdio.h>
21 |
22 | #ifdef WIN32
23 | #define WIN32_LEAN_AND_MEAN
24 | #include <windows.h>
25 | #else
26 | #include <sys/time.h>
27 | #endif
28 |
29 | #ifdef WIN32
30 | double PCFreq = 0.0;
31 | __int64 timerStart = 0;
32 | #else
33 | struct timeval timerStart;
34 | #endif
35 |
36 | void StartTimer()
37 | {
38 | #ifdef WIN32
39 |     LARGE_INTEGER li;
40 |     if(!QueryPerformanceFrequency(&li))
41 |         printf("QueryPerformanceFrequency failed!\n");
42 |
43 |     PCFreq = (double)li.QuadPart/1000.0;
44 |
45 |     QueryPerformanceCounter(&li);
46 |     timerStart = li.QuadPart;
47 | #else
48 |     gettimeofday(&timerStart, NULL);
49 | #endif
50 | }
51 |
52 | // time elapsed in ms
53 | double GetTimer()
54 | {
55 | #ifdef WIN32
56 |     LARGE_INTEGER li;
57 |     QueryPerformanceCounter(&li);
58 |     return (double)(li.QuadPart-timerStart)/PCFreq;
59 | #else
60 |     struct timeval timerStop, timerElapsed;
61 |     gettimeofday(&timerStop, NULL);
62 |     timersub(&timerStop, &timerStart, &timerElapsed);
63 |     return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0;
64 | #endif
65 | }
66 |
67 | #endif // TIMER_H
68 |
--------------------------------------------------------------------------------
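timer.h is the small portable wall-clock helper the OpenACC example relies on: StartTimer() records a reference point (QueryPerformanceCounter on Windows, gettimeofday elsewhere) and GetTimer() returns the milliseconds elapsed since that point. A minimal usage sketch, mirroring how laplace2d.c brackets its solver loop; the work() function here is a placeholder, not part of the repository.

    // Sketch: timing an arbitrary region with StartTimer()/GetTimer().
    #include <stdio.h>
    #include "timer.h"

    static void work(void)                  // placeholder workload
    {
        volatile double s = 0.0;
        for (int i = 0; i < 10000000; i++)
            s += i * 1e-9;
    }

    int main(void)
    {
        StartTimer();                       // take the reference timestamp
        work();                             // region being measured
        double ms = GetTimer();             // elapsed time in milliseconds
        printf(" total: %f s\n", ms / 1000.0);
        return 0;
    }

Because timer.h defines its functions and globals directly in the header, it is meant to be included from a single translation unit, as laplace2d.c does.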