├── .gitignore
├── README.md
├── ep11-racecheck-analysis
│   ├── Makefile
│   └── conway.cu
├── ep17-openacc2-data
│   ├── LICENSE
│   ├── Makefile
│   ├── hotspot.c
│   └── orig_hotspot.c
├── ep19-nvvp-analysis
│   └── transpose.cu
├── ep2-first-cuda-c-program
│   └── kernel.cu
└── ep3-first-openacc-program
    ├── laplace2d.c
    └── timer.h

/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled Object files
2 | *.slo
3 | *.lo
4 | *.o
5 |
6 | # Compiled Dynamic libraries
7 | *.so
8 | *.dylib
9 |
10 | # Compiled Static libraries
11 | *.lai
12 | *.la
13 | *.a
14 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | cudacasts
2 | =========
3 |
4 | Source code from NVIDIA CUDACasts
5 |
--------------------------------------------------------------------------------
/ep11-racecheck-analysis/Makefile:
--------------------------------------------------------------------------------
1 |
2 | .PHONY: build clean clobber run
3 |
4 | all: build
5 |
6 | build: conway
7 |
8 | conway: conway.cu
9 | 	nvcc -I. -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=compute_30 -gencode arch=compute_35,code=sm_35 -lineinfo -O3 -o conway conway.cu
10 |
11 | clean:
12 | 	-rm conway
13 |
14 | clobber: clean
15 |
16 |
17 |
--------------------------------------------------------------------------------
/ep11-racecheck-analysis/conway.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #define CHECK(x) { \
4 |     cudaError_t result = x; \
5 |     if (result != cudaSuccess) { \
6 |         printf("%d:unexpected error:%s, expecting:%s\n", \
7 |                __LINE__, \
8 |                cudaGetErrorString(result), \
9 |                cudaGetErrorString(cudaSuccess)); \
10 |         exit(0); \
11 |     } \
12 | }
13 |
14 |
15 | #define ARRXY(arr,x,y) arr[(x) + ( (y) * (max_x) )]
16 | #define P_X(x) ((x + max_x - 1) % max_x)
17 | #define N_X(x) ((x + 1) % max_x)
18 | #define P_Y(y) ((y + max_y - 1) % max_y)
19 | #define N_Y(y) ((y + 1) % max_y)
20 |
21 | __host__ __device__ void
22 | printArray(char *arr, int max_y, int max_x)
23 | {
24 |     int x, y;
25 |
26 |     if (!arr)
27 |         return;
28 |
29 |     printf("\n");
30 |     for (y = max_y - 1; y >= 0; --y) {
31 |         for (x = 0; x < max_x; ++x) {
32 |             printf("%s", ARRXY(arr,x,y) ? "X":".");
33 |         }
34 |         printf("\n");
35 |     }
36 | }
37 |
38 | __device__ __forceinline__ int
39 | getNeighborCount(int base_offset, const int max_y, const int max_x, const int x, const int y)
40 | {
41 |     int nborcount = 0;
42 |     extern __shared__ char buf[];
43 |     char *cur;
44 |
45 |     cur = buf + base_offset;
46 |
47 |     nborcount += ARRXY(cur, P_X(x), y);
48 |     nborcount += ARRXY(cur, P_X(x), P_Y(y));
49 |     nborcount += ARRXY(cur, P_X(x), N_Y(y));
50 |
51 |     nborcount += ARRXY(cur, x, P_Y(y));
52 |     nborcount += ARRXY(cur, x, N_Y(y));
53 |
54 |     nborcount += ARRXY(cur, N_X(x), y);
55 |     nborcount += ARRXY(cur, N_X(x), P_Y(y));
56 |     nborcount += ARRXY(cur, N_X(x), N_Y(y));
57 |
58 |     return nborcount;
59 | }
60 |
61 | __device__ __forceinline__ void
62 | updateCell(int cur_offset, int next_offset, const int max_y, const int max_x, const int x, const int y, const int singlethread)
63 | {
64 |     int nborcount = 0;
65 |     extern __shared__ char buf[];
66 |     char *cur, *next;
67 |
68 |     cur = buf + cur_offset;
69 |     next = buf + next_offset;
70 |
71 |     nborcount = getNeighborCount(cur_offset, max_y, max_x, x, y);
72 |
73 |     // Compute the next in the next buffer
74 |     // 1.
Any live cell with <2 neighbors dies 75 | // 2. Any live cell with 2 || 3 neighbors lives 76 | // 3. Any live cell with >3 neigbors dies 77 | // 4. Any dead cell with =3 neighbors becomes alive 78 | 79 | if (ARRXY(cur,x, y) && 80 | (nborcount < 2 || nborcount > 3)) 81 | ARRXY(next, x,y) = 0; 82 | else if (!ARRXY(cur, x, y) && 83 | nborcount == 3) 84 | ARRXY(next, x, y) = 1; 85 | else 86 | ARRXY(next, x, y) = ARRXY(cur, x, y); 87 | } 88 | 89 | __global__ void 90 | gameLoop(char *raw_in, char *raw_out, const int max_y, const int max_x, int num_iter, int print_interval, int singlethread) 91 | { 92 | extern __shared__ char buf[]; 93 | char *cur, *next, *tmp; 94 | size_t arraysize = 0; 95 | int iter, x, y, i, j; 96 | 97 | // Sanity checks 98 | arraysize = (max_x) * (max_y); 99 | 100 | // Skip threads we dont care about 101 | if (singlethread) { 102 | if (threadIdx.x > 0 || threadIdx.y > 0) 103 | return; 104 | } 105 | 106 | if (threadIdx.x >= max_x) 107 | return; 108 | 109 | if (threadIdx.y >= max_y) 110 | return; 111 | 112 | cur = buf; 113 | next = (cur + arraysize); 114 | 115 | x = threadIdx.x + 0; 116 | y = threadIdx.y + 0; 117 | 118 | // Reset Shmem 119 | if (threadIdx.x == 0 && threadIdx.y == 0) { 120 | for (i = 0; i < max_x; ++i) { 121 | for (j = 0; j < max_y; ++j) { 122 | ARRXY(cur, i, j) = 0; 123 | ARRXY(next, i, j) = 0; 124 | } 125 | } 126 | } 127 | __syncthreads(); 128 | 129 | // Populate the shmem buffer 130 | if (singlethread) { 131 | for (x = 0; x < max_x; ++x) 132 | for (y = 0; y < max_y; ++y) 133 | ARRXY(cur, x, y) = ARRXY(raw_in, x, y); 134 | } 135 | else 136 | ARRXY(cur, x, y) = ARRXY(raw_in, threadIdx.x, threadIdx.y); 137 | 138 | __syncthreads(); 139 | 140 | // Start the iteration loop 141 | for (iter = 0; iter < num_iter; ++iter) { 142 | // Compute the neighbor count in the current state 143 | 144 | if (singlethread) { 145 | for (x = 0; x < max_x; ++x) 146 | for (y = 0; y < max_y; ++y) 147 | updateCell(cur - buf, next - buf, max_y, max_x, x, y, singlethread); 148 | 149 | if ((threadIdx.x == 0 && threadIdx.y == 0) && 150 | print_interval && 151 | !(iter % print_interval)) { 152 | printArray(cur, max_y, max_x); 153 | } 154 | } 155 | else { 156 | updateCell(cur - buf, next - buf, max_y, max_x, x, y, singlethread); 157 | 158 | if ((threadIdx.x == 0 && threadIdx.y == 0) && 159 | print_interval && 160 | !(iter % print_interval)) { 161 | printArray(cur, max_y, max_x); 162 | } 163 | } 164 | 165 | // Swap the next and current states : 166 | tmp = cur; 167 | cur = next; 168 | next = tmp; 169 | 170 | } 171 | 172 | // Copy data out 173 | if (singlethread) { 174 | for (x = 0; x < max_x; ++x) 175 | for (y = 0; y < max_y; ++y) 176 | ARRXY(raw_out,x, y) = ARRXY(cur, x, y); 177 | } 178 | else 179 | ARRXY(raw_out,threadIdx.x, threadIdx.y) = ARRXY(cur, x, y); 180 | } 181 | 182 | void 183 | initArray(char *arr, int max_y, int max_x, unsigned int seed, float bias) 184 | { 185 | int x, y; 186 | 187 | if (!arr) 188 | return; 189 | 190 | if (bias >= 1 || bias <= 0) 191 | return; 192 | 193 | for (y = 0; y < max_y; ++y) { 194 | for (x = 0; x < max_x; ++x) { 195 | ARRXY(arr,x,y) = (rand() >= (RAND_MAX * (bias ))) ? 
0 : 1; 196 | } 197 | } 198 | } 199 | 200 | bool 201 | compareArrays(char *arr1, char *arr2, int max_y, int max_x) 202 | { 203 | int x, y; 204 | for (y = 0; y < max_y; ++y) { 205 | for (x = 0; x < max_x; ++x) { 206 | if(ARRXY(arr1,x,y) != ARRXY(arr2,x,y)) { 207 | printf("Mismatch at x:%d, y:%d\n", x, y); 208 | return false; 209 | } 210 | } 211 | } 212 | return true; 213 | } 214 | 215 | float 216 | getLiveness(char *arr, int max_y, int max_x) 217 | { 218 | int size = max_y * max_x; 219 | int sum = 0; 220 | int x,y; 221 | 222 | for (y = 0; y < max_y; ++y) 223 | for (x = 0; x < max_x; ++x) 224 | sum += ARRXY(arr,x,y); 225 | 226 | return (1.0*sum)/size; 227 | } 228 | 229 | int 230 | main(int argc, char **argv) 231 | { 232 | char *array = NULL, *array2 = NULL; 233 | char *d_in, *d_out, *d_out2; 234 | bool mismatch = false; 235 | 236 | int max_x = (argc > 1) ? atol(argv[1]) : 7; 237 | int max_y = (argc > 2) ? atol(argv[2]) : 7; 238 | int dev_iter = (argc > 3) ? atol(argv[3]) : 10; 239 | int print_interval = (argc > 4) ? atol(argv[4]) : 0; 240 | float bias = (argc > 5) ? atof(argv[5]) : 1.0/3; 241 | unsigned int seed = (argc > 6) ? atol(argv[6]) : 129; 242 | 243 | float initial_liveness = 1.0; 244 | size_t bufsize = max_y * max_x * sizeof(char); 245 | size_t shmemsize = ((max_y)*(max_x)) * 2; 246 | 247 | array = (char*)calloc(1, bufsize); 248 | array2 = (char*)calloc(1, bufsize); 249 | if (!array || !array2) { 250 | printf("Failed to allocate memory\n"); 251 | return -1; 252 | } 253 | 254 | CHECK(cudaMalloc(&d_in, bufsize)); 255 | CHECK(cudaMalloc(&d_out, bufsize)); 256 | CHECK(cudaMalloc(&d_out2, bufsize)); 257 | 258 | printf (" Generating random array (%dx%d) Seed:%u Target Liveness:%f\n", 259 | max_y, max_x, seed, bias); 260 | initArray(array, max_y, max_x, seed, bias); 261 | initial_liveness = getLiveness(array, max_y, max_x); 262 | 263 | CHECK(cudaMemset(d_out, 0x0, bufsize)); 264 | CHECK(cudaMemset(d_out2, 0x0, bufsize)); 265 | 266 | dim3 threads(max_x, max_y, 1); 267 | 268 | CHECK(cudaMemcpy(d_in, array, bufsize, cudaMemcpyHostToDevice)); 269 | gameLoop<<<1, threads, shmemsize>>> (d_in, d_out, max_y, max_x, dev_iter, print_interval, 0); 270 | gameLoop<<<1, threads, shmemsize>>> (d_in, d_out2, max_y, max_x, dev_iter, print_interval, 1); 271 | CHECK(cudaMemcpy(array, d_out, bufsize, cudaMemcpyDeviceToHost)); 272 | CHECK(cudaMemcpy(array2, d_out2, bufsize, cudaMemcpyDeviceToHost)); 273 | 274 | printf(" Array %dx%d (Shmem :%u) Iterations: %u Initial Liveness:%f Final Liveness:%f\n", 275 | max_y, max_x, shmemsize, dev_iter, initial_liveness, getLiveness(array, max_y, max_x)); 276 | 277 | if (!compareArrays(array, array2, max_y, max_x)) { 278 | printf("Mismatch !!\n"); 279 | mismatch = true; 280 | } 281 | 282 | printf(" Final Array : "); 283 | printArray(array, max_y, max_x); 284 | if (mismatch) { 285 | printf(" \n Final Array : (single threaded)"); 286 | printArray(array2, max_y, max_x); 287 | } 288 | 289 | free(array); 290 | free(array2); 291 | CHECK(cudaFree(d_in)); 292 | CHECK(cudaFree(d_out)); 293 | CHECK(cudaFree(d_out2)); 294 | CHECK(cudaDeviceReset()); 295 | return 0; 296 | } 297 | -------------------------------------------------------------------------------- /ep17-openacc2-data/LICENSE: -------------------------------------------------------------------------------- 1 | LICENSE TERMS 2 | 3 | Copyright (c)2008-2011 University of Virginia 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without modification, are permitted without royalty fees or other restrictions, provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | * Neither the name of the University of Virginia, the Dept. of Computer Science, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF VIRGINIA OR THE SOFTWARE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 13 | 14 | If you use this software or a modified version of it, please cite the most relevant among the following papers: 15 | 16 | - M. A. Goodrum, M. J. Trotter, A. Aksel, S. T. Acton, and K. Skadron. Parallelization of Particle Filter Algorithms. In Proceedings 17 | of the 3rd Workshop on Emerging Applications and Many-core Architecture (EAMA), in conjunction with the IEEE/ACM International 18 | Symposium on Computer Architecture (ISCA), June 2010. 19 | 20 | - S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, Sang-Ha Lee and K. Skadron. 21 | "Rodinia: A Benchmark Suite for Heterogeneous Computing". IEEE International Symposium 22 | on Workload Characterization, Oct 2009. 23 | 24 | - J. Meng and K. Skadron. "Performance Modeling and Automatic Ghost Zone Optimization 25 | for Iterative Stencil Loops on GPUs." In Proceedings of the 23rd Annual ACM International 26 | Conference on Supercomputing (ICS), June 2009. 27 | 28 | - L.G. Szafaryn, K. Skadron and J. Saucerman. "Experiences Accelerating MATLAB Systems 29 | Biology Applications." in Workshop on Biomedicine in Computing (BiC) at the International 30 | Symposium on Computer Architecture (ISCA), June 2009. 31 | 32 | - M. Boyer, D. Tarjan, S. T. Acton, and K. Skadron. "Accelerating Leukocyte Tracking using CUDA: 33 | A Case Study in Leveraging Manycore Coprocessors." In Proceedings of the International Parallel 34 | and Distributed Processing Symposium (IPDPS), May 2009. 35 | 36 | - S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, and K. Skadron. "A Performance 37 | Study of General Purpose Applications on Graphics Processors using CUDA" Journal of 38 | Parallel and Distributed Computing, Elsevier, June 2008. 
39 | -------------------------------------------------------------------------------- /ep17-openacc2-data/Makefile: -------------------------------------------------------------------------------- 1 | # C compiler 2 | CC = pgcc 3 | OPT = -fast 4 | MP = -mp 5 | OMP_THRDS = 6 6 | ACC = -acc -Minfo=accel 7 | TIME = /usr/bin/time --verbose 8 | DIFF = diff --brief 9 | 10 | all: build run 11 | 12 | build: hotspot_omp hotspot_acc 13 | 14 | run: small medium large 15 | 16 | small: small_omp small_acc 17 | medium: medium_omp medium_acc 18 | large: large_omp large_acc 19 | 20 | hotspot_acc: hotspot.c 21 | $(CC) $(OPT) $(ACC) hotspot.c -o hotspot_acc 22 | 23 | hotspot_omp: hotspot.c 24 | $(CC) $(OPT) $(MP) hotspot.c -o hotspot_omp 25 | 26 | small_omp: hotspot_omp 27 | $(TIME) ./hotspot_omp 64 64 10000 $(OMP_THRDS) data/temp_64 data/power_64 > output_sm_omp.log 28 | $(DIFF) output_sm_omp.log data/sm_output.log 29 | 30 | medium_omp: hotspot_omp 31 | $(TIME) ./hotspot_omp 512 512 10000 $(OMP_THRDS) data/temp_512 data/power_512 > output_md_omp.log 32 | $(DIFF) output_md_omp.log data/md_output.log 33 | 34 | large_omp: hotspot_omp 35 | $(TIME) ./hotspot_omp 1024 1024 10000 $(OMP_THRDS) data/temp_1024 data/power_1024 > output_lg_omp.log 36 | $(DIFF) output_lg_omp.log data/lg_output.log 37 | 38 | small_acc: hotspot_acc 39 | $(TIME) ./hotspot_acc 64 64 10000 1 data/temp_64 data/power_64 > output_sm_acc.log 40 | $(DIFF) output_sm_acc.log data/sm_output.log 41 | 42 | medium_acc: hotspot_acc 43 | $(TIME) ./hotspot_acc 512 512 10000 1 data/temp_512 data/power_512 > output_md_acc.log 44 | $(DIFF) output_md_acc.log data/md_output.log 45 | 46 | large_acc: hotspot_acc 47 | $(TIME) ./hotspot_acc 1024 1024 10000 1 data/temp_1024 data/power_1024 > output_lg_acc.log 48 | $(DIFF) output_lg_acc.log data/lg_output.log 49 | 50 | clean: 51 | rm -f hotspot_acc hotspot_omp *.o *.log 52 | 53 | -------------------------------------------------------------------------------- /ep17-openacc2-data/hotspot.c: -------------------------------------------------------------------------------- 1 | /**************************************************************************** 2 | * This example of using OpenACC unstructured data regions was derived from 3 | * the Rodinia Hotspot benchmark. Please see the enclosed LICENSE file. 4 | * Copyright (c)2008-2011 University of Virginia 5 | * All rights reserved. 6 | ***************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #ifdef _OPENACC 13 | #include 14 | #endif 15 | #define STR_SIZE 256 16 | 17 | /* maximum power density possible (say 300W for a 10mm x 10mm chip) */ 18 | #define MAX_PD (3.0e6) 19 | /* required precision in degrees */ 20 | #define PRECISION 0.001 21 | #define SPEC_HEAT_SI 1.75e6 22 | #define K_SI 100 23 | /* capacitance fitting factor */ 24 | #define FACTOR_CHIP 0.5 25 | #define OUTPUT 26 | 27 | // global data 28 | double *temp, *power, *result; 29 | char *tfile, *pfile; 30 | 31 | /* chip parameters */ 32 | double t_chip = 0.0005; 33 | double chip_height = 0.016; 34 | double chip_width = 0.016; 35 | /* ambient temperature, assuming no package at all */ 36 | double amb_temp = 80.0; 37 | 38 | int num_omp_threads; 39 | 40 | /* Single iteration of the transient solver in the grid model. 
41 | * advances the solution of the discretized difference equations 42 | * by one time step 43 | */ 44 | void single_iteration(int row, int col, 45 | double Cap, double Rx, double Ry, double Rz, 46 | double step) 47 | { 48 | double delta; 49 | int r, c; 50 | 51 | #pragma acc declare deviceptr(result), present_or_copy(temp[0:row*col]), pcopyin(power[0:row*col]) 52 | #ifdef _OPENMP 53 | omp_set_num_threads(num_omp_threads); 54 | #pragma omp parallel for shared(power, temp,result) private(r, c, delta) firstprivate(row, col) schedule(static) 55 | #endif 56 | 57 | #pragma acc kernels loop independent 58 | for (r = 0; r < row; r++) { 59 | #pragma acc loop independent 60 | for (c = 0; c < col; c++) { 61 | /* Corner 1 */ 62 | if ( (r == 0) && (c == 0) ) { 63 | delta = (step / Cap) * (power[0] + 64 | (temp[1] - temp[0]) / Rx + 65 | (temp[col] - temp[0]) / Ry + 66 | (amb_temp - temp[0]) / Rz); 67 | } /* Corner 2 */ 68 | else if ((r == 0) && (c == col-1)) { 69 | delta = (step / Cap) * (power[c] + 70 | (temp[c-1] - temp[c]) / Rx + 71 | (temp[c+col] - temp[c]) / Ry + 72 | (amb_temp - temp[c]) / Rz); 73 | } /* Corner 3 */ 74 | else if ((r == row-1) && (c == col-1)) { 75 | delta = (step / Cap) * (power[r*col+c] + 76 | (temp[r*col+c-1] - temp[r*col+c]) / Rx + 77 | (temp[(r-1)*col+c] - temp[r*col+c]) / Ry + 78 | (amb_temp - temp[r*col+c]) / Rz); 79 | } /* Corner 4 */ 80 | else if ((r == row-1) && (c == 0)) { 81 | delta = (step / Cap) * (power[r*col] + 82 | (temp[r*col+1] - temp[r*col]) / Rx + 83 | (temp[(r-1)*col] - temp[r*col]) / Ry + 84 | (amb_temp - temp[r*col]) / Rz); 85 | } /* Edge 1 */ 86 | else if (r == 0) { 87 | delta = (step / Cap) * (power[c] + 88 | (temp[c+1] + temp[c-1] - 2.0*temp[c]) / Rx + 89 | (temp[col+c] - temp[c]) / Ry + 90 | (amb_temp - temp[c]) / Rz); 91 | } /* Edge 2 */ 92 | else if (c == col-1) { 93 | delta = (step / Cap) * (power[r*col+c] + 94 | (temp[(r+1)*col+c] + temp[(r-1)*col+c] - 2.0*temp[r*col+c]) / Ry + 95 | (temp[r*col+c-1] - temp[r*col+c]) / Rx + 96 | (amb_temp - temp[r*col+c]) / Rz); 97 | } /* Edge 3 */ 98 | else if (r == row-1) { 99 | delta = (step / Cap) * (power[r*col+c] + 100 | (temp[r*col+c+1] + temp[r*col+c-1] - 2.0*temp[r*col+c]) / Rx + 101 | (temp[(r-1)*col+c] - temp[r*col+c]) / Ry + 102 | (amb_temp - temp[r*col+c]) / Rz); 103 | } /* Edge 4 */ 104 | else if (c == 0) { 105 | delta = (step / Cap) * (power[r*col] + 106 | (temp[(r+1)*col] + temp[(r-1)*col] - 2.0*temp[r*col]) / Ry + 107 | (temp[r*col+1] - temp[r*col]) / Rx + 108 | (amb_temp - temp[r*col]) / Rz); 109 | } /* Inside the chip */ 110 | else { 111 | delta = (step / Cap) * (power[r*col+c] + 112 | (temp[(r+1)*col+c] + temp[(r-1)*col+c] - 2.0*temp[r*col+c]) / Ry + 113 | (temp[r*col+c+1] + temp[r*col+c-1] - 2.0*temp[r*col+c]) / Rx + 114 | (amb_temp - temp[r*col+c]) / Rz); 115 | } 116 | 117 | /* Update Temperatures */ 118 | result[r*col+c] =temp[r*col+c]+ delta; 119 | 120 | 121 | } 122 | } 123 | 124 | #ifdef _OPENMP 125 | omp_set_num_threads(num_omp_threads); 126 | #pragma omp parallel for shared(result, temp) private(r, c) schedule(static) 127 | #endif 128 | #pragma acc kernels loop independent 129 | for (r = 0; r < row; r++) { 130 | #pragma acc loop independent 131 | for (c = 0; c < col; c++) { 132 | temp[r*col+c]=result[r*col+c]; 133 | } 134 | } 135 | } 136 | 137 | /* Transient solver driver routine: simply converts the heat 138 | * transfer differential equations to difference equations 139 | * and solves the difference equations by iterating 140 | */ 141 | void compute_tran_temp(int num_iterations, int row, 
int col) 142 | { 143 | #ifdef VERBOSE 144 | int i = 0; 145 | #endif 146 | 147 | double grid_height = chip_height / row; 148 | double grid_width = chip_width / col; 149 | 150 | double Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height; 151 | double Rx = grid_width / (2.0 * K_SI * t_chip * grid_height); 152 | double Ry = grid_height / (2.0 * K_SI * t_chip * grid_width); 153 | double Rz = t_chip / (K_SI * grid_height * grid_width); 154 | 155 | double max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI); 156 | double step = PRECISION / max_slope; 157 | double t; 158 | 159 | #ifdef VERBOSE 160 | fprintf(stdout, "total iterations: %d s\tstep size: %g s\n", num_iterations, step); 161 | fprintf(stdout, "Rx: %g\tRy: %g\tRz: %g\tCap: %g\n", Rx, Ry, Rz, Cap); 162 | #endif 163 | 164 | for (int i = 0; i < num_iterations ; i++) 165 | { 166 | #ifdef VERBOSE 167 | fprintf(stdout, "iteration %d\n", i++); 168 | #endif 169 | single_iteration(row, col, Cap, Rx, Ry, Rz, step); 170 | } 171 | 172 | #ifdef VERBOSE 173 | fprintf(stdout, "iteration %d\n", i++); 174 | #endif 175 | } 176 | 177 | void fatal(char *s) 178 | { 179 | fprintf(stderr, "error: %s\n", s); 180 | exit(1); 181 | } 182 | 183 | void read_input(double *vect, int grid_rows, int grid_cols, char *file) 184 | { 185 | int i, index; 186 | FILE *fp; 187 | char str[STR_SIZE]; 188 | double val; 189 | 190 | fp = fopen (file, "r"); 191 | if (!fp) 192 | fatal ("file could not be opened for reading"); 193 | 194 | for (i=0; i < grid_rows * grid_cols; i++) { 195 | fgets(str, STR_SIZE, fp); 196 | if (feof(fp)) 197 | fatal("not enough lines in file"); 198 | if ((sscanf(str, "%lf", &val) != 1) ) 199 | fatal("invalid file format"); 200 | vect[i] = val; 201 | } 202 | 203 | fclose(fp); 204 | } 205 | 206 | void init_data(int grid_rows, int grid_cols) { 207 | 208 | /* allocate memory for the temperature and power arrays */ 209 | temp = (double *) calloc (grid_rows * grid_cols, sizeof(double)); 210 | power = (double *) calloc (grid_rows * grid_cols, sizeof(double)); 211 | #ifdef _OPENACC 212 | result = (double *) acc_malloc (grid_rows * grid_cols * sizeof(double)); 213 | #else 214 | result = (double *) malloc (grid_rows * grid_cols * sizeof(double)); 215 | #endif 216 | 217 | if(!temp || !power) 218 | fatal("unable to allocate memory"); 219 | 220 | /* read initial temperatures and input power */ 221 | read_input(temp, grid_rows, grid_cols, tfile); 222 | read_input(power, grid_rows, grid_cols, pfile); 223 | #pragma acc enter data copyin(temp[0:grid_rows*grid_cols],power[0:grid_rows*grid_cols]) 224 | } 225 | 226 | 227 | void usage(int argc, char **argv) 228 | { 229 | fprintf(stderr, "Usage: %s \n", argv[0]); 230 | fprintf(stderr, "\t - number of rows in the grid (positive integer)\n"); 231 | fprintf(stderr, "\t - number of columns in the grid (positive integer)\n"); 232 | fprintf(stderr, "\t - number of iterations\n"); 233 | fprintf(stderr, "\t - number of threads\n"); 234 | fprintf(stderr, "\t - name of the file containing the initial temperature values of each cell\n"); 235 | fprintf(stderr, "\t - name of the file containing the dissipated power values of each cell\n"); 236 | exit(1); 237 | } 238 | 239 | int main(int argc, char **argv) 240 | { 241 | int grid_rows, grid_cols, sim_time, i; 242 | int size; 243 | 244 | /* check validity of inputs */ 245 | if (argc != 7) 246 | usage(argc, argv); 247 | if ((grid_rows = atoi(argv[1])) <= 0 || 248 | (grid_cols = atoi(argv[2])) <= 0 || 249 | (sim_time = atoi(argv[3])) <= 0 || 250 | (num_omp_threads = 
atoi(argv[4])) <= 0 251 | ) 252 | usage(argc, argv); 253 | tfile = argv[5]; 254 | pfile = argv[6]; 255 | 256 | size = grid_rows*grid_cols; 257 | init_data(grid_rows, grid_cols); 258 | printf("Start computing the transient temperature\n"); 259 | compute_tran_temp(sim_time, grid_rows, grid_cols); 260 | printf("Ending simulation\n"); 261 | /* output results */ 262 | #ifdef VERBOSE 263 | fprintf(stdout, "Final Temperatures:\n"); 264 | #endif 265 | 266 | #ifdef OUTPUT 267 | #pragma acc update host(temp[0:size]) 268 | for(i=0; i < grid_rows * grid_cols; i++) 269 | fprintf(stdout, "%d\t%g\n", i, temp[i]); 270 | #endif 271 | 272 | /* cleanup */ 273 | #pragma acc exit data delete(temp[0:size],power[0:size]) 274 | free(temp); 275 | free(power); 276 | #ifdef _OPENACC 277 | acc_free(result); 278 | #else 279 | free(result); 280 | #endif 281 | return 0; 282 | } 283 | 284 | 285 | -------------------------------------------------------------------------------- /ep17-openacc2-data/orig_hotspot.c: -------------------------------------------------------------------------------- 1 | /**************************************************************************** 2 | * This example of using OpenACC unstructured data regions was derived from 3 | * the Rodinia Hotspot benchmark. Please see the enclosed LICENSE file. 4 | * Copyright (c)2008-2011 University of Virginia 5 | * All rights reserved. 6 | ***************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #ifdef _OPENACC 13 | #include 14 | #endif 15 | #define STR_SIZE 256 16 | 17 | /* maximum power density possible (say 300W for a 10mm x 10mm chip) */ 18 | #define MAX_PD (3.0e6) 19 | /* required precision in degrees */ 20 | #define PRECISION 0.001 21 | #define SPEC_HEAT_SI 1.75e6 22 | #define K_SI 100 23 | /* capacitance fitting factor */ 24 | #define FACTOR_CHIP 0.5 25 | #define OUTPUT 26 | 27 | // global data 28 | double *temp, *power, *result; 29 | char *tfile, *pfile; 30 | 31 | /* chip parameters */ 32 | double t_chip = 0.0005; 33 | double chip_height = 0.016; 34 | double chip_width = 0.016; 35 | /* ambient temperature, assuming no package at all */ 36 | double amb_temp = 80.0; 37 | 38 | int num_omp_threads; 39 | 40 | /* Single iteration of the transient solver in the grid model. 
41 | * advances the solution of the discretized difference equations 42 | * by one time step 43 | */ 44 | void single_iteration(int row, int col, 45 | double Cap, double Rx, double Ry, double Rz, 46 | double step) 47 | { 48 | double delta; 49 | int r, c; 50 | 51 | #ifdef _OPENMP 52 | omp_set_num_threads(num_omp_threads); 53 | #pragma omp parallel for shared(power, temp,result) private(r, c, delta) firstprivate(row, col) schedule(static) 54 | #endif 55 | 56 | #pragma acc kernels loop independent 57 | for (r = 0; r < row; r++) { 58 | #pragma acc loop independent 59 | for (c = 0; c < col; c++) { 60 | /* Corner 1 */ 61 | if ( (r == 0) && (c == 0) ) { 62 | delta = (step / Cap) * (power[0] + 63 | (temp[1] - temp[0]) / Rx + 64 | (temp[col] - temp[0]) / Ry + 65 | (amb_temp - temp[0]) / Rz); 66 | } /* Corner 2 */ 67 | else if ((r == 0) && (c == col-1)) { 68 | delta = (step / Cap) * (power[c] + 69 | (temp[c-1] - temp[c]) / Rx + 70 | (temp[c+col] - temp[c]) / Ry + 71 | (amb_temp - temp[c]) / Rz); 72 | } /* Corner 3 */ 73 | else if ((r == row-1) && (c == col-1)) { 74 | delta = (step / Cap) * (power[r*col+c] + 75 | (temp[r*col+c-1] - temp[r*col+c]) / Rx + 76 | (temp[(r-1)*col+c] - temp[r*col+c]) / Ry + 77 | (amb_temp - temp[r*col+c]) / Rz); 78 | } /* Corner 4 */ 79 | else if ((r == row-1) && (c == 0)) { 80 | delta = (step / Cap) * (power[r*col] + 81 | (temp[r*col+1] - temp[r*col]) / Rx + 82 | (temp[(r-1)*col] - temp[r*col]) / Ry + 83 | (amb_temp - temp[r*col]) / Rz); 84 | } /* Edge 1 */ 85 | else if (r == 0) { 86 | delta = (step / Cap) * (power[c] + 87 | (temp[c+1] + temp[c-1] - 2.0*temp[c]) / Rx + 88 | (temp[col+c] - temp[c]) / Ry + 89 | (amb_temp - temp[c]) / Rz); 90 | } /* Edge 2 */ 91 | else if (c == col-1) { 92 | delta = (step / Cap) * (power[r*col+c] + 93 | (temp[(r+1)*col+c] + temp[(r-1)*col+c] - 2.0*temp[r*col+c]) / Ry + 94 | (temp[r*col+c-1] - temp[r*col+c]) / Rx + 95 | (amb_temp - temp[r*col+c]) / Rz); 96 | } /* Edge 3 */ 97 | else if (r == row-1) { 98 | delta = (step / Cap) * (power[r*col+c] + 99 | (temp[r*col+c+1] + temp[r*col+c-1] - 2.0*temp[r*col+c]) / Rx + 100 | (temp[(r-1)*col+c] - temp[r*col+c]) / Ry + 101 | (amb_temp - temp[r*col+c]) / Rz); 102 | } /* Edge 4 */ 103 | else if (c == 0) { 104 | delta = (step / Cap) * (power[r*col] + 105 | (temp[(r+1)*col] + temp[(r-1)*col] - 2.0*temp[r*col]) / Ry + 106 | (temp[r*col+1] - temp[r*col]) / Rx + 107 | (amb_temp - temp[r*col]) / Rz); 108 | } /* Inside the chip */ 109 | else { 110 | delta = (step / Cap) * (power[r*col+c] + 111 | (temp[(r+1)*col+c] + temp[(r-1)*col+c] - 2.0*temp[r*col+c]) / Ry + 112 | (temp[r*col+c+1] + temp[r*col+c-1] - 2.0*temp[r*col+c]) / Rx + 113 | (amb_temp - temp[r*col+c]) / Rz); 114 | } 115 | 116 | /* Update Temperatures */ 117 | result[r*col+c] =temp[r*col+c]+ delta; 118 | 119 | 120 | } 121 | } 122 | 123 | #ifdef _OPENMP 124 | omp_set_num_threads(num_omp_threads); 125 | #pragma omp parallel for shared(result, temp) private(r, c) schedule(static) 126 | #endif 127 | #pragma acc kernels loop independent 128 | for (r = 0; r < row; r++) { 129 | #pragma acc loop independent 130 | for (c = 0; c < col; c++) { 131 | temp[r*col+c]=result[r*col+c]; 132 | } 133 | } 134 | } 135 | 136 | /* Transient solver driver routine: simply converts the heat 137 | * transfer differential equations to difference equations 138 | * and solves the difference equations by iterating 139 | */ 140 | void compute_tran_temp(int num_iterations, int row, int col) 141 | { 142 | #ifdef VERBOSE 143 | int i = 0; 144 | #endif 145 | 146 | double grid_height = 
chip_height / row; 147 | double grid_width = chip_width / col; 148 | 149 | double Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height; 150 | double Rx = grid_width / (2.0 * K_SI * t_chip * grid_height); 151 | double Ry = grid_height / (2.0 * K_SI * t_chip * grid_width); 152 | double Rz = t_chip / (K_SI * grid_height * grid_width); 153 | 154 | double max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI); 155 | double step = PRECISION / max_slope; 156 | double t; 157 | 158 | #ifdef VERBOSE 159 | fprintf(stdout, "total iterations: %d s\tstep size: %g s\n", num_iterations, step); 160 | fprintf(stdout, "Rx: %g\tRy: %g\tRz: %g\tCap: %g\n", Rx, Ry, Rz, Cap); 161 | #endif 162 | 163 | for (int i = 0; i < num_iterations ; i++) 164 | { 165 | #ifdef VERBOSE 166 | fprintf(stdout, "iteration %d\n", i++); 167 | #endif 168 | single_iteration(row, col, Cap, Rx, Ry, Rz, step); 169 | } 170 | 171 | #ifdef VERBOSE 172 | fprintf(stdout, "iteration %d\n", i++); 173 | #endif 174 | } 175 | 176 | void fatal(char *s) 177 | { 178 | fprintf(stderr, "error: %s\n", s); 179 | exit(1); 180 | } 181 | 182 | void read_input(double *vect, int grid_rows, int grid_cols, char *file) 183 | { 184 | int i, index; 185 | FILE *fp; 186 | char str[STR_SIZE]; 187 | double val; 188 | 189 | fp = fopen (file, "r"); 190 | if (!fp) 191 | fatal ("file could not be opened for reading"); 192 | 193 | for (i=0; i < grid_rows * grid_cols; i++) { 194 | fgets(str, STR_SIZE, fp); 195 | if (feof(fp)) 196 | fatal("not enough lines in file"); 197 | if ((sscanf(str, "%lf", &val) != 1) ) 198 | fatal("invalid file format"); 199 | vect[i] = val; 200 | } 201 | 202 | fclose(fp); 203 | } 204 | 205 | void init_data(int grid_rows, int grid_cols) { 206 | 207 | /* allocate memory for the temperature and power arrays */ 208 | temp = (double *) calloc (grid_rows * grid_cols, sizeof(double)); 209 | power = (double *) calloc (grid_rows * grid_cols, sizeof(double)); 210 | result = (double *) malloc (grid_rows * grid_cols * sizeof(double)); 211 | 212 | if(!temp || !power) 213 | fatal("unable to allocate memory"); 214 | 215 | /* read initial temperatures and input power */ 216 | read_input(temp, grid_rows, grid_cols, tfile); 217 | read_input(power, grid_rows, grid_cols, pfile); 218 | 219 | } 220 | 221 | 222 | void usage(int argc, char **argv) 223 | { 224 | fprintf(stderr, "Usage: %s \n", argv[0]); 225 | fprintf(stderr, "\t - number of rows in the grid (positive integer)\n"); 226 | fprintf(stderr, "\t - number of columns in the grid (positive integer)\n"); 227 | fprintf(stderr, "\t - number of iterations\n"); 228 | fprintf(stderr, "\t - number of threads\n"); 229 | fprintf(stderr, "\t - name of the file containing the initial temperature values of each cell\n"); 230 | fprintf(stderr, "\t - name of the file containing the dissipated power values of each cell\n"); 231 | exit(1); 232 | } 233 | 234 | int main(int argc, char **argv) 235 | { 236 | int grid_rows, grid_cols, sim_time, i; 237 | int size; 238 | 239 | /* check validity of inputs */ 240 | if (argc != 7) 241 | usage(argc, argv); 242 | if ((grid_rows = atoi(argv[1])) <= 0 || 243 | (grid_cols = atoi(argv[2])) <= 0 || 244 | (sim_time = atoi(argv[3])) <= 0 || 245 | (num_omp_threads = atoi(argv[4])) <= 0 246 | ) 247 | usage(argc, argv); 248 | tfile = argv[5]; 249 | pfile = argv[6]; 250 | 251 | size = grid_rows*grid_cols; 252 | init_data(grid_rows, grid_cols); 253 | printf("Start computing the transient temperature\n"); 254 | compute_tran_temp(sim_time, grid_rows, grid_cols); 255 | printf("Ending 
simulation\n"); 256 | /* output results */ 257 | #ifdef VERBOSE 258 | fprintf(stdout, "Final Temperatures:\n"); 259 | #endif 260 | 261 | #ifdef OUTPUT 262 | #pragma acc update host(temp[0:size]) 263 | for(i=0; i < grid_rows * grid_cols; i++) 264 | fprintf(stdout, "%d\t%g\n", i, temp[i]); 265 | #endif 266 | 267 | /* cleanup */ 268 | free(temp); 269 | free(power); 270 | free(result); 271 | return 0; 272 | } 273 | 274 | 275 | -------------------------------------------------------------------------------- /ep19-nvvp-analysis/transpose.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2012 NVIDIA Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | 18 | // Convenience function for checking CUDA runtime API results 19 | // can be wrapped around any runtime API call. No-op in release builds. 20 | inline 21 | cudaError_t checkCuda(cudaError_t result) 22 | { 23 | #if defined(DEBUG) || defined(_DEBUG) 24 | if (result != cudaSuccess) { 25 | fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); 26 | assert(result == cudaSuccess); 27 | } 28 | #endif 29 | return result; 30 | } 31 | 32 | const int TILE_DIM = 8; 33 | const int BLOCK_ROWS = 4; 34 | const int NUM_REPS = 100; 35 | 36 | // Check errors and print GB/s 37 | void postprocess(const float *ref, const float *res, int n, float ms) 38 | { 39 | bool passed = true; 40 | for (int i = 0; i < n; i++) 41 | if (res[i] != ref[i]) { 42 | printf("%d %f %f\n", i, res[i], ref[i]); 43 | printf("%25s\n", "*** FAILED ***"); 44 | passed = false; 45 | break; 46 | } 47 | if (passed) 48 | printf("%20.2f\n", 2 * n * sizeof(float) * 1e-6 * NUM_REPS / ms ); 49 | } 50 | 51 | // simple copy kernel 52 | // Used as reference case representing best effective bandwidth. 53 | __global__ void copy(float *odata, const float *idata) 54 | { 55 | int x = blockIdx.x * TILE_DIM + threadIdx.x; 56 | int y = blockIdx.y * TILE_DIM + threadIdx.y; 57 | int width = gridDim.x * TILE_DIM; 58 | 59 | for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS) 60 | odata[(y+j)*width + x] = idata[(y+j)*width + x]; 61 | } 62 | 63 | // copy kernel using shared memory 64 | // Also used as reference case, demonstrating effect of using shared memory. 65 | __global__ void copySharedMem(float *odata, const float *idata) 66 | { 67 | __shared__ float tile[TILE_DIM * TILE_DIM]; 68 | 69 | int x = blockIdx.x * TILE_DIM + threadIdx.x; 70 | int y = blockIdx.y * TILE_DIM + threadIdx.y; 71 | int width = gridDim.x * TILE_DIM; 72 | 73 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 74 | tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x] = idata[(y+j)*width + x]; 75 | 76 | __syncthreads(); 77 | 78 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 79 | odata[(y+j)*width + x] = tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x]; 80 | } 81 | 82 | // naive transpose 83 | // Simplest transpose; doesn't use shared memory. 84 | // Global memory reads are coalesced but writes are not. 
85 | __global__ void transposeNaive(float *odata, const float *idata) 86 | { 87 | int x = blockIdx.x * TILE_DIM + threadIdx.x; 88 | int y = blockIdx.y * TILE_DIM + threadIdx.y; 89 | int width = gridDim.x * TILE_DIM; 90 | 91 | for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS) 92 | odata[x*width + (y+j)] = idata[(y+j)*width + x]; 93 | } 94 | 95 | // coalesced transpose 96 | // Uses shared memory to achieve coalesing in both reads and writes 97 | // Tile width == #banks causes shared memory bank conflicts. 98 | __global__ void transposeCoalesced(float *odata, const float *idata) 99 | { 100 | __shared__ float tile[TILE_DIM][TILE_DIM]; 101 | 102 | int x = blockIdx.x * TILE_DIM + threadIdx.x; 103 | int y = blockIdx.y * TILE_DIM + threadIdx.y; 104 | int width = gridDim.x * TILE_DIM; 105 | 106 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 107 | tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; 108 | 109 | __syncthreads(); 110 | 111 | x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset 112 | y = blockIdx.x * TILE_DIM + threadIdx.y; 113 | 114 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 115 | odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; 116 | } 117 | 118 | 119 | // No bank-conflict transpose 120 | // Same as transposeCoalesced except the first tile dimension is padded 121 | // to avoid shared memory bank conflicts. 122 | __global__ void transposeNoBankConflicts(float *odata, const float *idata) 123 | { 124 | __shared__ float tile[TILE_DIM][TILE_DIM+1]; 125 | 126 | int x = blockIdx.x * TILE_DIM + threadIdx.x; 127 | int y = blockIdx.y * TILE_DIM + threadIdx.y; 128 | int width = gridDim.x * TILE_DIM; 129 | 130 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 131 | tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x]; 132 | 133 | __syncthreads(); 134 | 135 | x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset 136 | y = blockIdx.x * TILE_DIM + threadIdx.y; 137 | 138 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 139 | odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; 140 | } 141 | 142 | int main(int argc, char **argv) 143 | { 144 | const int nx = 1024; 145 | const int ny = 1024; 146 | const int mem_size = nx*ny*sizeof(float); 147 | 148 | dim3 dimGrid(nx/TILE_DIM, ny/TILE_DIM, 1); 149 | dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); 150 | 151 | int devId = 0; 152 | if (argc > 1) devId = atoi(argv[1]); 153 | 154 | cudaDeviceProp prop; 155 | checkCuda( cudaGetDeviceProperties(&prop, devId)); 156 | printf("\nDevice : %s\n", prop.name); 157 | printf("Matrix size: %d %d, Block size: %d %d, Tile size: %d %d\n", 158 | nx, ny, TILE_DIM, BLOCK_ROWS, TILE_DIM, TILE_DIM); 159 | printf("dimGrid: %d %d %d. 
dimBlock: %d %d %d\n",
160 |          dimGrid.x, dimGrid.y, dimGrid.z, dimBlock.x, dimBlock.y, dimBlock.z);
161 |
162 |   checkCuda( cudaSetDevice(devId) );
163 |
164 |   float *h_idata = (float*)malloc(mem_size);
165 |   float *h_cdata = (float*)malloc(mem_size);
166 |   float *h_tdata = (float*)malloc(mem_size);
167 |   float *gold = (float*)malloc(mem_size);
168 |
169 |   float *d_idata, *d_cdata, *d_tdata;
170 |   checkCuda( cudaMalloc(&d_idata, mem_size) );
171 |   checkCuda( cudaMalloc(&d_cdata, mem_size) );
172 |   checkCuda( cudaMalloc(&d_tdata, mem_size) );
173 |
174 |   // check parameters and calculate execution configuration
175 |   if (nx % TILE_DIM || ny % TILE_DIM) {
176 |     printf("nx and ny must be a multiple of TILE_DIM\n");
177 |     goto error_exit;
178 |   }
179 |
180 |   /*if (TILE_DIM % BLOCK_ROWS) {
181 |     printf("TILE_DIM must be a multiple of BLOCK_ROWS\n");
182 |     goto error_exit;
183 |   }*/
184 |
185 |   // host
186 |   for (int j = 0; j < ny; j++)
187 |     for (int i = 0; i < nx; i++)
188 |       h_idata[j*nx + i] = j*nx + i;
189 |
190 |   // correct result for error checking
191 |   for (int j = 0; j < ny; j++)
192 |     for (int i = 0; i < nx; i++)
193 |       gold[j*nx + i] = h_idata[i*nx + j];
194 |
195 |   // device
196 |   checkCuda( cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice) );
197 |
198 |   // events for timing
199 |   cudaEvent_t startEvent, stopEvent;
200 |   checkCuda( cudaEventCreate(&startEvent) );
201 |   checkCuda( cudaEventCreate(&stopEvent) );
202 |   float ms;
203 |
204 |   // ------------
205 |   // time kernels
206 |   // ------------
207 |   printf("%25s%25s\n", "Routine", "Bandwidth (GB/s)");
208 |
209 |   /* // ----
210 |   // copy
211 |   // ----
212 |   printf("%25s", "copy");
213 |   checkCuda( cudaMemset(d_cdata, 0, mem_size) );
214 |   // warm up
215 |   copy<<<dimGrid, dimBlock>>>(d_cdata, d_idata);
216 |   checkCuda( cudaEventRecord(startEvent, 0) );
217 |   for (int i = 0; i < NUM_REPS; i++)
218 |     copy<<<dimGrid, dimBlock>>>(d_cdata, d_idata);
219 |   checkCuda( cudaEventRecord(stopEvent, 0) );
220 |   checkCuda( cudaEventSynchronize(stopEvent) );
221 |   checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
222 |   checkCuda( cudaMemcpy(h_cdata, d_cdata, mem_size, cudaMemcpyDeviceToHost) );
223 |   postprocess(h_idata, h_cdata, nx*ny, ms);
224 |
225 |   // -------------
226 |   // copySharedMem
227 |   // -------------
228 |   printf("%25s", "shared memory copy");
229 |   checkCuda( cudaMemset(d_cdata, 0, mem_size) );
230 |   // warm up
231 |   copySharedMem<<<dimGrid, dimBlock>>>(d_cdata, d_idata);
232 |   checkCuda( cudaEventRecord(startEvent, 0) );
233 |   for (int i = 0; i < NUM_REPS; i++)
234 |     copySharedMem<<<dimGrid, dimBlock>>>(d_cdata, d_idata);
235 |   checkCuda( cudaEventRecord(stopEvent, 0) );
236 |   checkCuda( cudaEventSynchronize(stopEvent) );
237 |   checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
238 |   checkCuda( cudaMemcpy(h_cdata, d_cdata, mem_size, cudaMemcpyDeviceToHost) );
239 |   postprocess(h_idata, h_cdata, nx * ny, ms);
240 |   */
241 |   // --------------
242 |   // transposeNaive
243 |   // --------------
244 |   printf("%25s", "naive transpose");
245 |   checkCuda( cudaMemset(d_tdata, 0, mem_size) );
246 |   // warmup
247 |   transposeNaive<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
248 |   checkCuda( cudaEventRecord(startEvent, 0) );
249 |   for (int i = 0; i < NUM_REPS; i++)
250 |     transposeNaive<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
251 |   checkCuda( cudaEventRecord(stopEvent, 0) );
252 |   checkCuda( cudaEventSynchronize(stopEvent) );
253 |   checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
254 |   checkCuda( cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost) );
255 |   postprocess(gold, h_tdata, nx * ny, ms);
256 |
257 |   /* // ------------------
258 |   // transposeCoalesced
259 |   // ------------------
260 |   printf("%25s", "coalesced transpose");
261 |   checkCuda( cudaMemset(d_tdata, 0, mem_size) );
262 |   // warmup
263 |   transposeCoalesced<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
264 |   checkCuda( cudaEventRecord(startEvent, 0) );
265 |   for (int i = 0; i < NUM_REPS; i++)
266 |     transposeCoalesced<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
267 |   checkCuda( cudaEventRecord(stopEvent, 0) );
268 |   checkCuda( cudaEventSynchronize(stopEvent) );
269 |   checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
270 |   checkCuda( cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost) );
271 |   postprocess(gold, h_tdata, nx * ny, ms);
272 |
273 |   // ------------------------
274 |   // transposeNoBankConflicts
275 |   // ------------------------
276 |   printf("%25s", "conflict-free transpose");
277 |   checkCuda( cudaMemset(d_tdata, 0, mem_size) );
278 |   // warmup
279 |   transposeNoBankConflicts<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
280 |   checkCuda( cudaEventRecord(startEvent, 0) );
281 |   for (int i = 0; i < NUM_REPS; i++)
282 |     transposeNoBankConflicts<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
283 |   checkCuda( cudaEventRecord(stopEvent, 0) );
284 |   checkCuda( cudaEventSynchronize(stopEvent) );
285 |   checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
286 |   checkCuda( cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost) );
287 |   postprocess(gold, h_tdata, nx * ny, ms);*/
288 |
289 | error_exit:
290 |   // cleanup
291 |   checkCuda( cudaEventDestroy(startEvent) );
292 |   checkCuda( cudaEventDestroy(stopEvent) );
293 |   checkCuda( cudaFree(d_tdata) );
294 |   checkCuda( cudaFree(d_cdata) );
295 |   checkCuda( cudaFree(d_idata) );
296 |   free(h_idata);
297 |   free(h_tdata);
298 |   free(h_cdata);
299 |   free(gold);
300 | }
301 |
302 |
--------------------------------------------------------------------------------
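The number postprocess() prints for each routine in transpose.cu is effective bandwidth: every launch reads and writes nx*ny floats, so 2 * n * sizeof(float) bytes move per launch, averaged over NUM_REPS timed launches. A minimal host-side sketch of that same calculation follows; the helper name effectiveBandwidthGBs is illustrative and not part of the repository.

    // Sketch only: effective bandwidth in GB/s for a kernel that moves
    // 2 * n floats (one read plus one write per element) per launch.
    // ms is the cudaEventElapsedTime measured across `reps` launches.
    static float effectiveBandwidthGBs(int n, int reps, float ms)
    {
        double bytesPerLaunch = 2.0 * n * sizeof(float);     // read + write
        // bytes * 1e-6 gives MB; MB per millisecond equals GB per second
        return (float)(bytesPerLaunch * 1e-6 * reps / ms);
    }

With nx = ny = 1024 and NUM_REPS = 100 as set above, this expression matches the one used in postprocess().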
/ep2-first-cuda-c-program/kernel.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #define SIZE 1024
4 |
5 | __global__ void VectorAdd(int *a, int *b, int *c, int n)
6 | {
7 |     int i = threadIdx.x;
8 |
9 |     if (i < n)
10 |         c[i] = a[i] + b[i];
11 | }
12 |
13 | int main()
14 | {
15 |     int *a, *b, *c;
16 |     int *d_a, *d_b, *d_c;
17 |
18 |     a = (int *)malloc(SIZE*sizeof(int));
19 |     b = (int *)malloc(SIZE*sizeof(int));
20 |     c = (int *)malloc(SIZE*sizeof(int));
21 |
22 |     cudaMalloc( &d_a, SIZE*sizeof(int));
23 |     cudaMalloc( &d_b, SIZE*sizeof(int));
24 |     cudaMalloc( &d_c, SIZE*sizeof(int));
25 |
26 |     for( int i = 0; i < SIZE; ++i )
27 |     {
28 |         a[i] = i;
29 |         b[i] = i;
30 |         c[i] = 0;
31 |     }
32 |
33 |     cudaMemcpy( d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice );
34 |     cudaMemcpy( d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice );
35 |     cudaMemcpy( d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice );
36 |
37 |     VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);
38 |
39 |     cudaMemcpy( c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost );
40 |
41 |     for( int i = 0; i < 10; ++i)
42 |         printf("c[%d] = %d\n", i, c[i]);
43 |
44 |     free(a);
45 |     free(b);
46 |     free(c);
47 |
48 |     cudaFree(d_a);
49 |     cudaFree(d_b);
50 |     cudaFree(d_c);
51 |
52 |     return 0;
53 | }
--------------------------------------------------------------------------------
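kernel.cu launches a single block of SIZE threads (VectorAdd<<< 1, SIZE >>>), which only works while SIZE stays within the per-block thread limit. A hedged sketch of how the same vector add is usually generalized to a multi-block launch; the names VectorAddGrid, threadsPerBlock, and blocksPerGrid are illustrative, not part of the repository.

    // Sketch: grid-wide version of the same element-wise add.
    __global__ void VectorAddGrid(int *a, int *b, int *c, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;  // global element index
        if (i < n)                                      // guard the final partial block
            c[i] = a[i] + b[i];
    }

    // Launch with enough blocks to cover n elements:
    //   int threadsPerBlock = 256;
    //   int blocksPerGrid   = (n + threadsPerBlock - 1) / threadsPerBlock;
    //   VectorAddGrid<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);

The rounded-up division simply ensures the grid covers n even when n is not a multiple of the block size.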
/ep3-first-openacc-program/laplace2d.c:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright 2012 NVIDIA Corporation
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | #include <math.h>
18 | #include <string.h>
19 | #include "timer.h"
20 |
21 | #define NN 4096
22 | #define NM 4096
23 |
24 | double A[NN][NM];
25 | double Anew[NN][NM];
26 |
27 | int main(int argc, char** argv)
28 | {
29 |     const int n = NN;
30 |     const int m = NM;
31 |     const int iter_max = 1000;
32 |
33 |     const double tol = 1.0e-6;
34 |     double error = 1.0;
35 |
36 |     memset(A, 0, n * m * sizeof(double));
37 |     memset(Anew, 0, n * m * sizeof(double));
38 |
39 |     for (int j = 0; j < n; j++)
40 |     {
41 |         A[j][0] = 1.0;
42 |         Anew[j][0] = 1.0;
43 |     }
44 |
45 |     printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m);
46 |
47 |     StartTimer();
48 |     int iter = 0;
49 |
50 | #pragma acc data copy(A), create(Anew)
51 |     while ( error > tol && iter < iter_max )
52 |     {
53 |         error = 0.0;
54 |
55 | #pragma omp parallel for shared(m, n, Anew, A)
56 | #pragma acc kernels
57 |         for( int j = 1; j < n-1; j++)
58 |         {
59 |             for( int i = 1; i < m-1; i++ )
60 |             {
61 |                 Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
62 |                                     + A[j-1][i] + A[j+1][i]);
63 |                 error = fmax( error, fabs(Anew[j][i] - A[j][i]));
64 |             }
65 |         }
66 |
67 | #pragma omp parallel for shared(m, n, Anew, A)
68 | #pragma acc kernels
69 |         for( int j = 1; j < n-1; j++)
70 |         {
71 |             for( int i = 1; i < m-1; i++ )
72 |             {
73 |                 A[j][i] = Anew[j][i];
74 |             }
75 |         }
76 |
77 |         if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error);
78 |
79 |         iter++;
80 |     }
81 |
82 |     double runtime = GetTimer();
83 |
84 |     printf(" total: %f s\n", runtime / 1000);
85 | }
86 |
--------------------------------------------------------------------------------
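In laplace2d.c, the "#pragma acc data copy(A), create(Anew)" region is what keeps both arrays resident on the device for the whole while loop; without it, each "#pragma acc kernels" region would transfer the arrays on every iteration. A small stand-alone sketch of the same structured data-plus-kernels pattern on a simpler computation; the function scale_and_add and its variables are illustrative, not part of the repository.

    // Sketch, assuming the same structured-data-region idiom as laplace2d.c:
    // x and y are copied to the device once and reused by both kernels regions.
    void scale_and_add(int n, float a, const float *restrict x, float *restrict y)
    {
        #pragma acc data copyin(x[0:n]) copy(y[0:n])
        {
            #pragma acc kernels
            for (int i = 0; i < n; i++)
                y[i] = a * x[i] + y[i];   // first pass: data moved in before this

            #pragma acc kernels
            for (int i = 0; i < n; i++)
                y[i] = a * x[i] + y[i];   // second pass: reuses the device copies
        }   // y copied back to the host only here, at the end of the data region
    }

The point of the pattern is the same as in laplace2d.c: the compute regions inside the data region see device-resident data, and host/device traffic happens only at the region boundaries.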
/ep3-first-openacc-program/timer.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright 2012 NVIDIA Corporation
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | #ifndef TIMER_H
18 | #define TIMER_H
19 |
20 | #include <stdio.h>
21 |
22 | #ifdef WIN32
23 | #define WIN32_LEAN_AND_MEAN
24 | #include <windows.h>
25 | #else
26 | #include <sys/time.h>
27 | #endif
28 |
29 | #ifdef WIN32
30 | double PCFreq = 0.0;
31 | __int64 timerStart = 0;
32 | #else
33 | struct timeval timerStart;
34 | #endif
35 |
36 | void StartTimer()
37 | {
38 | #ifdef WIN32
39 |     LARGE_INTEGER li;
40 |     if(!QueryPerformanceFrequency(&li))
41 |         printf("QueryPerformanceFrequency failed!\n");
42 |
43 |     PCFreq = (double)li.QuadPart/1000.0;
44 |
45 |     QueryPerformanceCounter(&li);
46 |     timerStart = li.QuadPart;
47 | #else
48 |     gettimeofday(&timerStart, NULL);
49 | #endif
50 | }
51 |
52 | // time elapsed in ms
53 | double GetTimer()
54 | {
55 | #ifdef WIN32
56 |     LARGE_INTEGER li;
57 |     QueryPerformanceCounter(&li);
58 |     return (double)(li.QuadPart-timerStart)/PCFreq;
59 | #else
60 |     struct timeval timerStop, timerElapsed;
61 |     gettimeofday(&timerStop, NULL);
62 |     timersub(&timerStop, &timerStart, &timerElapsed);
63 |     return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0;
64 | #endif
65 | }
66 |
67 | #endif // TIMER_H
68 |
--------------------------------------------------------------------------------
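timer.h is the small portable wall-clock helper the OpenACC example relies on: StartTimer() records a reference point (QueryPerformanceCounter on Windows, gettimeofday elsewhere) and GetTimer() returns the milliseconds elapsed since that point. A minimal usage sketch, mirroring how laplace2d.c brackets its solver loop; the work() function here is a placeholder, not part of the repository.

    // Sketch: timing an arbitrary region with StartTimer()/GetTimer().
    #include <stdio.h>
    #include "timer.h"

    static void work(void)                  // placeholder workload
    {
        volatile double s = 0.0;
        for (int i = 0; i < 10000000; i++)
            s += i * 1e-9;
    }

    int main(void)
    {
        StartTimer();                       // take the reference timestamp
        work();                             // region being measured
        double ms = GetTimer();             // elapsed time in milliseconds
        printf(" total: %f s\n", ms / 1000.0);
        return 0;
    }

Because timer.h defines its functions and globals directly in the header, it is meant to be included from a single translation unit, as laplace2d.c does.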