├── CUDA
    ├── asyncAPI.cu
    ├── cdpSimplePrint.cu
    ├── cdpSimpleQuicksort.cu
    ├── clock.cu
    ├── cppIntegration.cu
    ├── cppOverload.cu
    ├── matrixMul.cu
    ├── simpleAssert.cu
    ├── simpleAssert_kernel.cu
    ├── template_runtime.cu
    └── vectorAdd.cu
├── MPI
    ├── SimpleSendRcv.c
    ├── array_prod.c
    ├── average_reduce.c
    ├── average_scatter.c
    ├── factorial.c
    ├── lognSum.c
    ├── matrixMult.c
    ├── mpi_hello_world.c
    ├── mpibcast.c
    └── pieCalculation.c
├── OMP
    ├── Critical.c
    ├── Fibonacci.c
    ├── HelloWorld.c
    ├── MatrixMul.c
    ├── ParallelTreeSearch.c
    ├── PiCalculation.c
    ├── ReductionPI.c
    ├── Single.c
    ├── Sorting.c
    └── SumOfArrays.c
└── README.md


/CUDA/asyncAPI.cu:
--------------------------------------------------------------------------------
  1 | ////////////////////////////////////////////////////////////////////////////
  2 | //
  3 | // Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
  4 | //
  5 | // Please refer to the NVIDIA end user license agreement (EULA) associated
  6 | // with this source code for terms and conditions that govern your use of
  7 | // this software. Any use, reproduction, disclosure, or distribution of
  8 | // this software and related documentation outside the terms of the EULA
  9 | // is strictly prohibited.
 10 | //
 11 | ////////////////////////////////////////////////////////////////////////////
 12 | 
 13 | //
 14 | // This sample illustrates the usage of CUDA events for both GPU timing and
 15 | // overlapping CPU and GPU execution.  Events are inserted into a stream
 16 | // of CUDA calls.  Since CUDA stream calls are asynchronous, the CPU can
 17 | // perform computations while GPU is executing (including DMA memcopies
 18 | // between the host and device).  CPU can query CUDA events to determine
 19 | // whether GPU has completed tasks.
 20 | //
 21 | 
 22 | // includes, system
 23 | #include <stdio.h>
 24 | 
 25 | // includes CUDA Runtime
 26 | #include <cuda_runtime.h>
 27 | 
 28 | // includes, project
 29 | #include <helper_cuda.h>
 30 | #include <helper_functions.h> // helper utility functions 
 31 | 
 32 | __global__ void increment_kernel(int *g_data, int inc_value)
 33 | {
 34 |     int idx = blockIdx.x * blockDim.x + threadIdx.x;
 35 |     g_data[idx] = g_data[idx] + inc_value;
 36 | }
 37 | 
 38 | bool correct_output(int *data, const int n, const int x)
 39 | {
 40 |     for (int i = 0; i < n; i++)
 41 |         if (data[i] != x)
 42 |         {
 43 |             printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
 44 |             return false;
 45 |         }
 46 | 
 47 |     return true;
 48 | }
 49 | 
 50 | int main(int argc, char *argv[])
 51 | {
 52 |     int devID;
 53 |     cudaDeviceProp deviceProps;
 54 | 
 55 |     printf("[%s] - Starting...\n", argv[0]);
 56 | 
 57 |     // This will pick the best possible CUDA capable device
 58 |     devID = findCudaDevice(argc, (const char **)argv);
 59 | 
 60 |     // get device name
 61 |     checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
 62 |     printf("CUDA device [%s]\n", deviceProps.name);
 63 | 
 64 |     int n = 16 * 1024 * 1024;
 65 |     int nbytes = n * sizeof(int);
 66 |     int value = 26;
 67 | 
 68 |     // allocate host memory
 69 |     int *a = 0;
 70 |     checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
 71 |     memset(a, 0, nbytes);
 72 | 
 73 |     // allocate device memory
 74 |     int *d_a=0;
 75 |     checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
 76 |     checkCudaErrors(cudaMemset(d_a, 255, nbytes));
 77 | 
 78 |     // set kernel launch configuration
 79 |     dim3 threads = dim3(512, 1);
 80 |     dim3 blocks  = dim3(n / threads.x, 1);
 81 | 
 82 |     // create cuda event handles
 83 |     cudaEvent_t start, stop;
 84 |     checkCudaErrors(cudaEventCreate(&start));
 85 |     checkCudaErrors(cudaEventCreate(&stop));
 86 | 
 87 |     StopWatchInterface *timer = NULL;
 88 |     sdkCreateTimer(&timer);
 89 |     sdkResetTimer(&timer);
 90 | 
 91 |     checkCudaErrors(cudaDeviceSynchronize());
 92 |     float gpu_time = 0.0f;
 93 | 
 94 |     // asynchronously issue work to the GPU (all to stream 0)
 95 |     sdkStartTimer(&timer);
 96 |     cudaEventRecord(start, 0);
 97 |     cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
 98 |     increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
 99 |     cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
100 |     cudaEventRecord(stop, 0);
101 |     sdkStopTimer(&timer);
102 | 
103 |     // have CPU do some work while waiting for stage 1 to finish
104 |     unsigned long int counter=0;
105 | 
106 |     while (cudaEventQuery(stop) == cudaErrorNotReady)
107 |     {
108 |         counter++;
109 |     }
110 | 
111 |     checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
112 | 
113 |     // print the cpu and gpu times
114 |     printf("time spent executing by the GPU: %.2f\n", gpu_time);
115 |     printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
116 |     printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
117 | 
118 |     // check the output for correctness
119 |     bool bFinalResults = correct_output(a, n, value);
120 | 
121 |     // release resources
122 |     checkCudaErrors(cudaEventDestroy(start));
123 |     checkCudaErrors(cudaEventDestroy(stop));
124 |     checkCudaErrors(cudaFreeHost(a));
125 |     checkCudaErrors(cudaFree(d_a));
126 | 
127 |     // cudaDeviceReset causes the driver to clean up all state. While
128 |     // not mandatory in normal operation, it is good practice.  It is also
129 |     // needed to ensure correct operation when the application is being
130 |     // profiled. Calling cudaDeviceReset causes all profile data to be
131 |     // flushed before the application exits
132 |     cudaDeviceReset();
133 | 
134 |     exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
135 | }
136 | 


--------------------------------------------------------------------------------
/CUDA/cdpSimplePrint.cu:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | 
 12 | #include <iostream>
 13 | #include <cstdio>
 14 | #include <cstdlib>
 15 | #include <helper_cuda.h>
 16 | #include <helper_string.h>
 17 | 
 18 | ////////////////////////////////////////////////////////////////////////////////
 19 | // Variable on the GPU used to generate unique identifiers of blocks.
 20 | ////////////////////////////////////////////////////////////////////////////////
 21 | __device__ int g_uids = 0;
 22 | 
 23 | ////////////////////////////////////////////////////////////////////////////////
 24 | // Print a simple message to signal the block which is currently executing.
 25 | ////////////////////////////////////////////////////////////////////////////////
 26 | __device__ void print_info(int depth, int thread, int uid, int parent_uid)
 27 | {
 28 |     if (threadIdx.x == 0)
 29 |     {
 30 |         if (depth == 0)
 31 |             printf("BLOCK %d launched by the host\n", uid);
 32 |         else
 33 |         {
 34 |             char buffer[32];
 35 | 
 36 |             for (int i = 0 ; i < depth ; ++i)
 37 |             {
 38 |                 buffer[3*i+0] = '|';
 39 |                 buffer[3*i+1] = ' ';
 40 |                 buffer[3*i+2] = ' ';
 41 |             }
 42 | 
 43 |             buffer[3*depth] = '\0';
 44 |             printf("%sBLOCK %d launched by thread %d of block %d\n", buffer, uid, thread, parent_uid);
 45 |         }
 46 |     }
 47 | 
 48 |     __syncthreads();
 49 | }
 50 | 
 51 | ////////////////////////////////////////////////////////////////////////////////
 52 | // The kernel using CUDA dynamic parallelism.
 53 | //
 54 | // It generates a unique identifier for each block. Prints the information
 55 | // about that block. Finally, if the 'max_depth' has not been reached, the
 56 | // block launches new blocks directly from the GPU.
 57 | ////////////////////////////////////////////////////////////////////////////////
 58 | __global__ void cdp_kernel(int max_depth, int depth, int thread, int parent_uid)
 59 | {
 60 |     // We create a unique ID per block. Thread 0 does that and shares the value with the other threads.
 61 |     __shared__ int s_uid;
 62 | 
 63 |     if (threadIdx.x == 0)
 64 |     {
 65 |         s_uid = atomicAdd(&g_uids, 1);
 66 |     }
 67 | 
 68 |     __syncthreads();
 69 | 
 70 |     // We print the ID of the block and information about its parent.
 71 |     print_info(depth, thread, s_uid, parent_uid);
 72 | 
 73 |     // We launch new blocks if we haven't reached the max_depth yet.
 74 |     if (++depth >= max_depth)
 75 |     {
 76 |         return;
 77 |     }
 78 | 
 79 |     cdp_kernel<<<gridDim.x, blockDim.x>>>(max_depth, depth, threadIdx.x, s_uid);
 80 | }
 81 | 
 82 | ////////////////////////////////////////////////////////////////////////////////
 83 | // Main entry point.
 84 | ////////////////////////////////////////////////////////////////////////////////
 85 | int main(int argc, char **argv)
 86 | {
 87 |     printf("starting Simple Print (CUDA Dynamic Parallelism)\n");
 88 | 
 89 |     // Parse a few command-line arguments.
 90 |     int max_depth = 2;
 91 | 
 92 |     if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
 93 |         checkCmdLineFlag(argc, (const char **)argv, "h"))
 94 |     {
 95 |         printf("Usage: %s depth=<max_depth>\t(where max_depth is a value between 1 and 8).\n", argv[0]);
 96 |         exit(EXIT_SUCCESS);
 97 |     }
 98 | 
 99 |     if (checkCmdLineFlag(argc, (const char **)argv, "depth"))
100 |     {
101 |         max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth");
102 | 
103 |         if (max_depth < 1 || max_depth > 8)
104 |         {
105 |             printf("depth parameter has to be between 1 and 8\n");
106 |             exit(EXIT_FAILURE);
107 |         }
108 |     }
109 | 
110 |     // Find/set the device.
111 |     int device_count = 0, device = -1;
112 |     
113 |     if(checkCmdLineFlag(argc, (const char **)argv, "device"))
114 |     {
115 |         device = getCmdLineArgumentInt(argc, (const char **)argv, "device");
116 | 
117 |         cudaDeviceProp properties;
118 |         checkCudaErrors(cudaGetDeviceProperties(&properties, device));
119 |         
120 |         if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
121 |         {
122 |             std::cout << "Running on GPU " << device << " (" << properties.name << ")" << std::endl;
123 |         }
124 |         else
125 |         {
126 |             std::cout << "ERROR: cdpsimplePrint requires GPU devices with compute SM 3.5 or higher."<< std::endl;
127 |             std::cout << "Current GPU device has compute SM" << properties.major <<"."<< properties.minor <<". Exiting..." << std::endl;
128 |             exit(EXIT_FAILURE);
129 |         }
130 | 
131 |     }
132 |     else
133 |     {
134 |         checkCudaErrors(cudaGetDeviceCount(&device_count));
135 |         for (int i = 0 ; i < device_count ; ++i)
136 |         {
137 |             cudaDeviceProp properties;
138 |             checkCudaErrors(cudaGetDeviceProperties(&properties, i));
139 |             if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
140 |             {
141 |                 device = i;
142 |                 std::cout << "Running on GPU " << i << " (" << properties.name << ")" << std::endl;
143 |                 break;
144 |             }
145 |             std::cout << "GPU " << i << " (" << properties.name << ") does not support CUDA Dynamic Parallelism" << std::endl;
146 |         }
147 |     }
148 |     if (device == -1)
149 |     {
150 |               std::cerr << "cdpSimplePrint requires GPU devices with compute SM 3.5 or higher.  Exiting..." << std::endl;
151 |               exit(EXIT_WAIVED);
152 |      }
153 |     cudaSetDevice(device);
154 | 
155 |     // Print a message describing what the sample does.
156 |     printf("***************************************************************************\n");
157 |     printf("The CPU launches 2 blocks of 2 threads each. On the device each thread will\n");
158 |     printf("launch 2 blocks of 2 threads each. The GPU we will do that recursively\n");
159 |     printf("until it reaches max_depth=%d\n\n", max_depth);
160 |     printf("In total 2");
161 |     int num_blocks = 2, sum = 2;
162 | 
163 |     for (int i = 1 ; i < max_depth ; ++i)
164 |     {
165 |         num_blocks *= 4;
166 |         printf("+%d", num_blocks);
167 |         sum += num_blocks;
168 |     }
169 | 
170 |     printf("=%d blocks are launched!!! (%d from the GPU)\n", sum, sum-2);
171 |     printf("***************************************************************************\n\n");
172 | 
173 |     // We set the recursion limit for CDP to max_depth.
174 |     cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth);
175 | 
176 |     // Launch the kernel from the CPU.
177 |     printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n\n");
178 |     cdp_kernel<<<2, 2>>>(max_depth, 0, 0, -1);
179 |     checkCudaErrors(cudaGetLastError());
180 | 
181 |     // Finalize.
182 |     checkCudaErrors(cudaDeviceSynchronize());
183 |     
184 |     // cudaDeviceReset causes the driver to clean up all state. While
185 |     // not mandatory in normal operation, it is good practice.  It is also
186 |     // needed to ensure correct operation when the application is being
187 |     // profiled. Calling cudaDeviceReset causes all profile data to be
188 |     // flushed before the application exits
189 |     checkCudaErrors(cudaDeviceReset());
190 | 
191 |     exit(EXIT_SUCCESS);
192 | }
193 | 


--------------------------------------------------------------------------------
/CUDA/cdpSimpleQuicksort.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
  3 | *
  4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 | * with this source code for terms and conditions that govern your use of
  6 | * this software. Any use, reproduction, disclosure, or distribution of
  7 | * this software and related documentation outside the terms of the EULA
  8 | * is strictly prohibited.
  9 | *
 10 | */
 11 | #include <iostream>
 12 | #include <cstdio>
 13 | #include <helper_cuda.h>
 14 | #include <helper_string.h>
 15 | 
 16 | #define MAX_DEPTH       16
 17 | #define INSERTION_SORT  32
 18 | 
 19 | ////////////////////////////////////////////////////////////////////////////////
 20 | // Selection sort used when depth gets too big or the number of elements drops
 21 | // below a threshold.
 22 | ////////////////////////////////////////////////////////////////////////////////
 23 | __device__ void selection_sort(unsigned int *data, int left, int right)
 24 | {
 25 |     for (int i = left ; i <= right ; ++i)
 26 |     {
 27 |         unsigned min_val = data[i];
 28 |         int min_idx = i;
 29 | 
 30 |         // Find the smallest value in the range [left, right].
 31 |         for (int j = i+1 ; j <= right ; ++j)
 32 |         {
 33 |             unsigned val_j = data[j];
 34 | 
 35 |             if (val_j < min_val)
 36 |             {
 37 |                 min_idx = j;
 38 |                 min_val = val_j;
 39 |             }
 40 |         }
 41 | 
 42 |         // Swap the values.
 43 |         if (i != min_idx)
 44 |         {
 45 |             data[min_idx] = data[i];
 46 |             data[i] = min_val;
 47 |         }
 48 |     }
 49 | }
 50 | 
 51 | ////////////////////////////////////////////////////////////////////////////////
 52 | // Very basic quicksort algorithm, recursively launching the next level.
 53 | ////////////////////////////////////////////////////////////////////////////////
 54 | __global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)
 55 | {
 56 |     // If we're too deep or there are few elements left, we use an insertion sort...
 57 |     if (depth >= MAX_DEPTH || right-left <= INSERTION_SORT)
 58 |     {
 59 |         selection_sort(data, left, right);
 60 |         return;
 61 |     }
 62 | 
 63 |     unsigned int *lptr = data+left;
 64 |     unsigned int *rptr = data+right;
 65 |     unsigned int  pivot = data[(left+right)/2];
 66 | 
 67 |     // Do the partitioning.
 68 |     while (lptr <= rptr)
 69 |     {
 70 |         // Find the next left- and right-hand values to swap
 71 |         unsigned int lval = *lptr;
 72 |         unsigned int rval = *rptr;
 73 | 
 74 |         // Move the left pointer as long as the pointed element is smaller than the pivot.
 75 |         while (lval < pivot)
 76 |         {
 77 |             lptr++;
 78 |             lval = *lptr;
 79 |         }
 80 | 
 81 |         // Move the right pointer as long as the pointed element is larger than the pivot.
 82 |         while (rval > pivot)
 83 |         {
 84 |             rptr--;
 85 |             rval = *rptr;
 86 |         }
 87 | 
 88 |         // If the swap points are valid, do the swap!
 89 |         if (lptr <= rptr)
 90 |         {
 91 |             *lptr++ = rval;
 92 |             *rptr-- = lval;
 93 |         }
 94 |     }
 95 | 
 96 |     // Now the recursive part
 97 |     int nright = rptr - data;
 98 |     int nleft  = lptr - data;
 99 | 
100 |     // Launch a new block to sort the left part.
101 |     if (left < (rptr-data))
102 |     {
103 |         cudaStream_t s;
104 |         cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
105 |         cdp_simple_quicksort<<< 1, 1, 0, s >>>(data, left, nright, depth+1);
106 |         cudaStreamDestroy(s);
107 |     }
108 | 
109 |     // Launch a new block to sort the right part.
110 |     if ((lptr-data) < right)
111 |     {
112 |         cudaStream_t s1;
113 |         cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
114 |         cdp_simple_quicksort<<< 1, 1, 0, s1 >>>(data, nleft, right, depth+1);
115 |         cudaStreamDestroy(s1);
116 |     }
117 | }
118 | 
119 | ////////////////////////////////////////////////////////////////////////////////
120 | // Call the quicksort kernel from the host.
121 | ////////////////////////////////////////////////////////////////////////////////
122 | void run_qsort(unsigned int *data, unsigned int nitems)
123 | {
124 |     // Prepare CDP for the max depth 'MAX_DEPTH'.
125 |     checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH));
126 | 
127 |     // Launch on device
128 |     int left = 0;
129 |     int right = nitems-1;
130 |     std::cout << "Launching kernel on the GPU" << std::endl;
131 |     cdp_simple_quicksort<<< 1, 1 >>>(data, left, right, 0);
132 |     checkCudaErrors(cudaDeviceSynchronize());
133 | }
134 | 
135 | ////////////////////////////////////////////////////////////////////////////////
136 | // Initialize data on the host.
137 | ////////////////////////////////////////////////////////////////////////////////
138 | void initialize_data(unsigned int *dst, unsigned int nitems)
139 | {
140 |     // Fixed seed for illustration
141 |     srand(2047);
142 | 
143 |     // Fill dst with random values
144 |     for (unsigned i = 0 ; i < nitems ; i++)
145 |         dst[i] = rand() % nitems ;
146 | }
147 | 
148 | ////////////////////////////////////////////////////////////////////////////////
149 | // Verify the results.
150 | ////////////////////////////////////////////////////////////////////////////////
151 | void check_results(int n, unsigned int *results_d)
152 | {
153 |     unsigned int *results_h = new unsigned[n];
154 |     checkCudaErrors(cudaMemcpy(results_h, results_d, n*sizeof(unsigned), cudaMemcpyDeviceToHost));
155 | 
156 |     for (int i = 1 ; i < n ; ++i)
157 |         if (results_h[i-1] > results_h[i])
158 |         {
159 |             std::cout << "Invalid item[" << i-1 << "]: " << results_h[i-1] << " greater than " << results_h[i] << std::endl;
160 |             exit(EXIT_FAILURE);
161 |         }
162 | 
163 |     std::cout << "OK" << std::endl;
164 |     delete[] results_h;
165 | }
166 | 
167 | ////////////////////////////////////////////////////////////////////////////////
168 | // Main entry point.
169 | ////////////////////////////////////////////////////////////////////////////////
170 | int main(int argc, char **argv)
171 | {
172 |     int num_items = 128;
173 |     bool verbose = false;
174 | 
175 |     if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
176 |         checkCmdLineFlag(argc, (const char **)argv, "h"))
177 |     {
178 |         std::cerr << "Usage: " << argv[0] << " num_items=<num_items>\twhere num_items is the number of items to sort" << std::endl;
179 |         exit(EXIT_SUCCESS);
180 |     }
181 | 
182 |     if (checkCmdLineFlag(argc, (const char **)argv, "v"))
183 |     {
184 |         verbose = true;
185 |     }
186 | 
187 |     if (checkCmdLineFlag(argc, (const char **)argv, "num_items"))
188 |     {
189 |         num_items = getCmdLineArgumentInt(argc, (const char **)argv, "num_items");
190 | 
191 |         if (num_items < 1)
192 |         {
193 |             std::cerr << "ERROR: num_items has to be greater than 1" << std::endl;
194 |             exit(EXIT_FAILURE);
195 |         }
196 |     }
197 | 
198 |     // Get device properties
199 |     int device_count = 0, device = -1;
200 |     
201 |     if(checkCmdLineFlag(argc, (const char **)argv, "device"))
202 |     {
203 |         device = getCmdLineArgumentInt(argc, (const char **)argv, "device");
204 |         
205 |         cudaDeviceProp properties;
206 |         checkCudaErrors(cudaGetDeviceProperties(&properties, device));
207 |         
208 |         if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
209 |         {
210 |             std::cout << "Running on GPU " << device << " (" << properties.name << ")" << std::endl;
211 |         }
212 |         else
213 |         {
214 |             std::cout << "ERROR: cdpsimpleQuicksort requires GPU devices with compute SM 3.5 or higher."<< std::endl;
215 |             std::cout << "Current GPU device has compute SM" << properties.major <<"."<< properties.minor <<". Exiting..." << std::endl;
216 |             exit(EXIT_FAILURE);
217 |         }
218 | 
219 |     }
220 |     else
221 |     {
222 |         checkCudaErrors(cudaGetDeviceCount(&device_count));
223 |     
224 |         for (int i = 0 ; i < device_count ; ++i)
225 |         {
226 |             cudaDeviceProp properties;
227 |             checkCudaErrors(cudaGetDeviceProperties(&properties, i));
228 | 
229 |             if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
230 |             {
231 |                 device = i;
232 |                 std::cout << "Running on GPU " << i << " (" << properties.name << ")" << std::endl;
233 |                 break;
234 |             }
235 | 
236 |             std::cout << "GPU " << i << " (" << properties.name << ") does not support CUDA Dynamic Parallelism" << std::endl;
237 |          }
238 |      }
239 | 
240 |     if (device == -1)
241 |     {
242 |         std::cerr << "cdpSimpleQuicksort requires GPU devices with compute SM 3.5 or higher.  Exiting..." << std::endl;
243 |         exit(EXIT_WAIVED);
244 |     }
245 | 
246 |     cudaSetDevice(device);
247 | 
248 |     // Create input data
249 |     unsigned int *h_data = 0;
250 |     unsigned int *d_data = 0;
251 | 
252 |     // Allocate CPU memory and initialize data.
253 |     std::cout << "Initializing data:" << std::endl;
254 |     h_data =(unsigned int *)malloc(num_items*sizeof(unsigned int));
255 |     initialize_data(h_data, num_items);
256 | 
257 |     if (verbose)
258 |     {
259 |         for (int i=0 ; i<num_items ; i++)
260 |             std::cout << "Data [" << i << "]: " << h_data[i] << std::endl;
261 |     }
262 | 
263 |     // Allocate GPU memory.
264 |     checkCudaErrors(cudaMalloc((void **)&d_data, num_items * sizeof(unsigned int)));
265 |     checkCudaErrors(cudaMemcpy(d_data, h_data, num_items * sizeof(unsigned int), cudaMemcpyHostToDevice));
266 | 
267 |     // Execute
268 |     std::cout << "Running quicksort on " << num_items << " elements" << std::endl;
269 |     run_qsort(d_data, num_items);
270 | 
271 |     // Check result
272 |     std::cout << "Validating results: ";
273 |     check_results(num_items, d_data);
274 | 
275 |     free(h_data);
276 |     checkCudaErrors(cudaFree(d_data));
277 | 
278 |     // cudaDeviceReset causes the driver to clean up all state. While
279 |     // not mandatory in normal operation, it is good practice.  It is also
280 |     // needed to ensure correct operation when the application is being
281 |     // profiled. Calling cudaDeviceReset causes all profile data to be
282 |     // flushed before the application exits
283 |     cudaDeviceReset();
284 |     exit(EXIT_SUCCESS);
285 | }
286 | 
287 | 


--------------------------------------------------------------------------------
/CUDA/clock.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | 
 12 | // This example shows how to use the clock function to measure the performance of
 13 | // a kernel accurately.
 14 | //
 15 | // Blocks are executed in parallel and out of order. Since there's no synchronization
 16 | // mechanism between blocks, we measure the clock once for each block. The clock
 17 | // samples are written to device memory.
 18 | 
 19 | // System includes
 20 | #include <stdio.h>
 21 | #include <assert.h>
 22 | 
 23 | // CUDA runtime
 24 | #include <cuda_runtime.h>
 25 | 
 26 | // helper functions and utilities to work with CUDA
 27 | #include <helper_functions.h>
 28 | #include <helper_cuda.h>
 29 | 
 30 | // This kernel computes a standard parallel reduction and evaluates the
 31 | // time it takes to do that for each block. The timing results are stored
 32 | // in device memory.
 33 | __global__ static void timedReduction(const float *input, float *output, clock_t *timer)
 34 | {
 35 |     // __shared__ float shared[2 * blockDim.x];
 36 |     extern __shared__ float shared[];
 37 | 
 38 |     const int tid = threadIdx.x;
 39 |     const int bid = blockIdx.x;
 40 | 
 41 |     if (tid == 0) timer[bid] = clock();
 42 | 
 43 |     // Copy input.
 44 |     shared[tid] = input[tid];
 45 |     shared[tid + blockDim.x] = input[tid + blockDim.x];
 46 | 
 47 |     // Perform reduction to find minimum.
 48 |     for (int d = blockDim.x; d > 0; d /= 2)
 49 |     {
 50 |         __syncthreads();
 51 | 
 52 |         if (tid < d)
 53 |         {
 54 |             float f0 = shared[tid];
 55 |             float f1 = shared[tid + d];
 56 | 
 57 |             if (f1 < f0)
 58 |             {
 59 |                 shared[tid] = f1;
 60 |             }
 61 |         }
 62 |     }
 63 | 
 64 |     // Write result.
 65 |     if (tid == 0) output[bid] = shared[0];
 66 | 
 67 |     __syncthreads();
 68 | 
 69 |     if (tid == 0) timer[bid+gridDim.x] = clock();
 70 | }
 71 | 
 72 | 
 73 | // This example shows how to use the clock function to measure the performance of
 74 | // a kernel accurately.
 75 | //
 76 | // Blocks are executed in parallel and out of order. Since there's no synchronization
 77 | // mechanism between blocks, we measure the clock once for each block. The clock
 78 | // samples are written to device memory.
 79 | 
 80 | #define NUM_BLOCKS    64
 81 | #define NUM_THREADS   256
 82 | 
 83 | // It's interesting to change the number of blocks and the number of threads to
 84 | // understand how to keep the hardware busy.
 85 | //
 86 | // Here are some numbers I get on my G80:
 87 | //    blocks - clocks
 88 | //    1 - 3096
 89 | //    8 - 3232
 90 | //    16 - 3364
 91 | //    32 - 4615
 92 | //    64 - 9981
 93 | //
 94 | // With less than 16 blocks some of the multiprocessors of the device are idle. With
 95 | // more than 16 you are using all the multiprocessors, but there's only one block per
 96 | // multiprocessor and that doesn't allow you to hide the latency of the memory. With
 97 | // more than 32 the speed scales linearly.
 98 | 
 99 | // Start the main CUDA Sample here
100 | int main(int argc, char **argv)
101 | {
102 |     printf("CUDA Clock sample\n");
103 | 
104 |     // This will pick the best possible CUDA capable device
105 |     int dev = findCudaDevice(argc, (const char **)argv);
106 | 
107 |     float *dinput = NULL;
108 |     float *doutput = NULL;
109 |     clock_t *dtimer = NULL;
110 | 
111 |     clock_t timer[NUM_BLOCKS * 2];
112 |     float input[NUM_THREADS * 2];
113 | 
114 |     for (int i = 0; i < NUM_THREADS * 2; i++)
115 |     {
116 |         input[i] = (float)i;
117 |     }
118 | 
119 |     checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
120 |     checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
121 |     checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
122 | 
123 |     checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
124 | 
125 |     timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 *NUM_THREADS>>>(dinput, doutput, dtimer);
126 | 
127 |     checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
128 | 
129 |     checkCudaErrors(cudaFree(dinput));
130 |     checkCudaErrors(cudaFree(doutput));
131 |     checkCudaErrors(cudaFree(dtimer));
132 | 
133 | 
134 |     // Compute the difference between the last block end and the first block start.
135 |     clock_t minStart = timer[0];
136 |     clock_t maxEnd = timer[NUM_BLOCKS];
137 | 
138 |     for (int i = 1; i < NUM_BLOCKS; i++)
139 |     {
140 |         minStart = timer[i] < minStart ? timer[i] : minStart;
141 |         maxEnd = timer[NUM_BLOCKS+i] > maxEnd ? timer[NUM_BLOCKS+i] : maxEnd;
142 |     }
143 | 
144 |     printf("Total clocks = %d\n", (int)(maxEnd - minStart));
145 | 
146 | 
147 |     // cudaDeviceReset causes the driver to clean up all state. While
148 |     // not mandatory in normal operation, it is good practice.  It is also
149 |     // needed to ensure correct operation when the application is being
150 |     // profiled. Calling cudaDeviceReset causes all profile data to be
151 |     // flushed before the application exits
152 |     cudaDeviceReset();
153 | 
154 |     return EXIT_SUCCESS;
155 | }
156 | 


--------------------------------------------------------------------------------
/CUDA/cppIntegration.cu:
--------------------------------------------------------------------------------
  1 | ////////////////////////////////////////////////////////////////////////////
  2 | //
  3 | // Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
  4 | //
  5 | // Please refer to the NVIDIA end user license agreement (EULA) associated
  6 | // with this source code for terms and conditions that govern your use of
  7 | // this software. Any use, reproduction, disclosure, or distribution of
  8 | // this software and related documentation outside the terms of the EULA
  9 | // is strictly prohibited.
 10 | //
 11 | ////////////////////////////////////////////////////////////////////////////
 12 | 
 13 | /* Example of integrating CUDA functions into an existing
 14 |  * application / framework.
 15 |  * Host part of the device code.
 16 |  * Compiled with Cuda compiler.
 17 |  */
 18 | 
 19 | // System includes
 20 | #include <stdlib.h>
 21 | #include <stdio.h>
 22 | #include <string.h>
 23 | #include <math.h>
 24 | #include <assert.h>
 25 | 
 26 | // CUDA runtime
 27 | #include <cuda_runtime.h>
 28 | 
 29 | // helper functions and utilities to work with CUDA
 30 | #include <helper_cuda.h>
 31 | #include <helper_functions.h>
 32 | 
 33 | #ifndef MAX
 34 | #define MAX(a,b) (a > b ? a : b)
 35 | #endif
 36 | 
 37 | ////////////////////////////////////////////////////////////////////////////////
 38 | // declaration, forward
 39 | 
 40 | extern "C" void computeGold(char *reference, char *idata, const unsigned int len);
 41 | extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len);
 42 | 
 43 | ///////////////////////////////////////////////////////////////////////////////
 44 | //! Simple test kernel for device functionality
 45 | //! @param g_odata  memory to process (in and out)
 46 | ///////////////////////////////////////////////////////////////////////////////
 47 | __global__ void kernel(int *g_data)
 48 | {
 49 |     // write data to global memory
 50 |     const unsigned int tid = threadIdx.x;
 51 |     int data = g_data[tid];
 52 | 
 53 |     // use integer arithmetic to process all four bytes with one thread
 54 |     // this serializes the execution, but is the simplest solutions to avoid
 55 |     // bank conflicts for this very low number of threads
 56 |     // in general it is more efficient to process each byte by a separate thread,
 57 |     // to avoid bank conflicts the access pattern should be
 58 |     // g_data[4 * wtid + wid], where wtid is the thread id within the half warp
 59 |     // and wid is the warp id
 60 |     // see also the programming guide for a more in depth discussion.
 61 |     g_data[tid] = ((((data <<  0) >> 24) - 10) << 24)
 62 |                   | ((((data <<  8) >> 24) - 10) << 16)
 63 |                   | ((((data << 16) >> 24) - 10) <<  8)
 64 |                   | ((((data << 24) >> 24) - 10) <<  0);
 65 | }
 66 | 
 67 | ///////////////////////////////////////////////////////////////////////////////
 68 | //! Demonstration that int2 data can be used in the cpp code
 69 | //! @param g_odata  memory to process (in and out)
 70 | ///////////////////////////////////////////////////////////////////////////////
 71 | __global__ void
 72 | kernel2(int2 *g_data)
 73 | {
 74 |     // write data to global memory
 75 |     const unsigned int tid = threadIdx.x;
 76 |     int2 data = g_data[tid];
 77 | 
 78 |     // use integer arithmetic to process all four bytes with one thread
 79 |     // this serializes the execution, but is the simplest solutions to avoid
 80 |     // bank conflicts for this very low number of threads
 81 |     // in general it is more efficient to process each byte by a separate thread,
 82 |     // to avoid bank conflicts the access pattern should be
 83 |     // g_data[4 * wtid + wid], where wtid is the thread id within the half warp
 84 |     // and wid is the warp id
 85 |     // see also the programming guide for a more in depth discussion.
 86 |     g_data[tid].x = data.x - data.y;
 87 | }
 88 | 
 89 | ////////////////////////////////////////////////////////////////////////////////
 90 | //! Entry point for Cuda functionality on host side
 91 | //! @param argc  command line argument count
 92 | //! @param argv  command line arguments
 93 | //! @param data  data to process on the device
 94 | //! @param len   len of \a data
 95 | ////////////////////////////////////////////////////////////////////////////////
 96 | extern "C" bool
 97 | runTest(const int argc, const char **argv, char *data, int2 *data_int2, unsigned int len)
 98 | {
 99 |     // use command-line specified CUDA device, otherwise use device with highest Gflops/s
100 |     findCudaDevice(argc, (const char **)argv);
101 | 
102 |     const unsigned int num_threads = len / 4;
103 |     assert(0 == (len % 4));
104 |     const unsigned int mem_size = sizeof(char) * len;
105 |     const unsigned int mem_size_int2 = sizeof(int2) * len;
106 | 
107 |     // allocate device memory
108 |     char *d_data;
109 |     checkCudaErrors(cudaMalloc((void **) &d_data, mem_size));
110 |     // copy host memory to device
111 |     checkCudaErrors(cudaMemcpy(d_data, data, mem_size,
112 |                                cudaMemcpyHostToDevice));
113 |     // allocate device memory for int2 version
114 |     int2 *d_data_int2;
115 |     checkCudaErrors(cudaMalloc((void **) &d_data_int2, mem_size_int2));
116 |     // copy host memory to device
117 |     checkCudaErrors(cudaMemcpy(d_data_int2, data_int2, mem_size_int2,
118 |                                cudaMemcpyHostToDevice));
119 | 
120 |     // setup execution parameters
121 |     dim3 grid(1, 1, 1);
122 |     dim3 threads(num_threads, 1, 1);
123 |     dim3 threads2(len, 1, 1); // more threads needed fir separate int2 version
124 |     // execute the kernel
125 |     kernel<<< grid, threads >>>((int *) d_data);
126 |     kernel2<<< grid, threads2 >>>(d_data_int2);
127 | 
128 |     // check if kernel execution generated and error
129 |     getLastCudaError("Kernel execution failed");
130 | 
131 |     // compute reference solutions
132 |     char *reference = (char *) malloc(mem_size);
133 |     computeGold(reference, data, len);
134 |     int2 *reference2 = (int2 *) malloc(mem_size_int2);
135 |     computeGold2(reference2, data_int2, len);
136 | 
137 |     // copy results from device to host
138 |     checkCudaErrors(cudaMemcpy(data, d_data, mem_size,
139 |                                cudaMemcpyDeviceToHost));
140 |     checkCudaErrors(cudaMemcpy(data_int2, d_data_int2, mem_size_int2,
141 |                                cudaMemcpyDeviceToHost));
142 | 
143 |     // check result
144 |     bool success = true;
145 | 
146 |     for (unsigned int i = 0; i < len; i++)
147 |     {
148 |         if (reference[i] != data[i] ||
149 |             reference2[i].x != data_int2[i].x ||
150 |             reference2[i].y != data_int2[i].y)
151 |         {
152 |             success = false;
153 |         }
154 |     }
155 | 
156 |     // cleanup memory
157 |     checkCudaErrors(cudaFree(d_data));
158 |     checkCudaErrors(cudaFree(d_data_int2));
159 |     free(reference);
160 |     free(reference2);
161 | 
162 |     return success;
163 | }
164 | 


--------------------------------------------------------------------------------
/CUDA/cppOverload.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | #define THREAD_N 256
 12 | #define N 1024
 13 | #define DIV_UP(a, b) (((a) + (b) - 1) / (b))
 14 | 
 15 | // Includes, system
 16 | #include <stdio.h>
 17 | #include <helper_cuda.h>
 18 | #include <helper_string.h>
 19 | #include <helper_math.h>
 20 | #include "cppOverload_kernel.cuh"
 21 | 
 22 | const char *sampleName = "C++ Function Overloading";
 23 | 
 24 | #define OUTPUT_ATTR(attr)  \
 25 |     printf("Shared Size:   %d\n", (int)attr.sharedSizeBytes);   \
 26 |     printf("Constant Size: %d\n", (int)attr.constSizeBytes);                 \
 27 |     printf("Local Size:    %d\n", (int)attr.localSizeBytes);                 \
 28 |     printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock);          \
 29 |     printf("Number of Registers: %d\n", attr.numRegs);                       \
 30 |     printf("PTX Version: %d\n", attr.ptxVersion);                            \
 31 |     printf("Binary Version: %d\n", attr.binaryVersion);                      \
 32 |      
 33 | 
 34 | bool check_func1(int *hInput, int *hOutput, int a)
 35 | {
 36 |     for (int i = 0; i < N; ++i)
 37 |     {
 38 |         int cpuRes = hInput[i]*a + i;
 39 | 
 40 |         if (hOutput[i] != cpuRes)
 41 |         {
 42 |             return false;
 43 |         }
 44 |     }
 45 | 
 46 |     return true;
 47 | }
 48 | 
 49 | bool check_func2(int2 *hInput, int *hOutput, int a)
 50 | {
 51 |     for (int i = 0; i < N; i++)
 52 |     {
 53 |         int cpuRes = (hInput[i].x + hInput[i].y)*a + i;
 54 | 
 55 |         if (hOutput[i] != cpuRes)
 56 |         {
 57 |             return false;
 58 |         }
 59 |     }
 60 | 
 61 |     return true;
 62 | }
 63 | 
 64 | bool check_func3(int *hInput1, int *hInput2, int *hOutput, int a)
 65 | {
 66 |     for (int i = 0; i < N; i++)
 67 |     {
 68 |         if (hOutput[i] != (hInput1[i] + hInput2[i])*a + i)
 69 |         {
 70 |             return false;
 71 |         }
 72 |     }
 73 | 
 74 |     return true;
 75 | }
 76 | 
 77 | int main(int argc, const char *argv[])
 78 | {
 79 |     int *hInput  = NULL;
 80 |     int *hOutput = NULL;
 81 |     int *dInput  = NULL;
 82 |     int *dOutput = NULL;
 83 | 
 84 |     printf("%s starting...\n", sampleName);
 85 | 
 86 |     int deviceCount;
 87 |     checkCudaErrors(cudaGetDeviceCount(&deviceCount));
 88 |     printf("DevicecheckCudaErrors Count: %d\n", deviceCount);
 89 | 
 90 |     int deviceID = findCudaDevice(argc, argv);
 91 | 	cudaDeviceProp prop;
 92 | 	checkCudaErrors(cudaGetDeviceProperties(&prop, deviceID));
 93 | 	if (prop.major < 2)    
 94 |     {
 95 |         printf("ERROR: cppOverload requires GPU devices with compute SM 2.0 or higher.\n");
 96 |         printf("Current GPU device has compute SM%d.%d, Exiting...", prop.major, prop.minor);
 97 |         exit(EXIT_WAIVED);
 98 |     }
 99 | 	
100 |     checkCudaErrors(cudaSetDevice(deviceID));
101 | 
102 |     // Allocate device memory
103 |     checkCudaErrors(cudaMalloc(&dInput , sizeof(int)*N*2));
104 |     checkCudaErrors(cudaMalloc(&dOutput, sizeof(int)*N));
105 | 
106 |     // Allocate host memory
107 |     checkCudaErrors(cudaMallocHost(&hInput , sizeof(int)*N*2));
108 |     checkCudaErrors(cudaMallocHost(&hOutput, sizeof(int)*N));
109 | 
110 |     for (int i = 0; i < N*2; i++)
111 |     {
112 |         hInput[i] = i;
113 |     }
114 | 
115 |     // Copy data from host to device
116 |     checkCudaErrors(cudaMemcpy(dInput, hInput, sizeof(int)*N*2, cudaMemcpyHostToDevice));
117 | 
118 |     // Test C++ overloading
119 |     bool testResult = true;
120 |     bool funcResult = true;
121 |     int a = 1;
122 | 
123 |     void (*func1)(const int *, int *, int);
124 |     void (*func2)(const int2 *, int *, int);
125 |     void (*func3)(const int *, const int *, int *, int);
126 |     struct cudaFuncAttributes attr;
127 | 
128 |     // overload function 1
129 |     func1 = simple_kernel;
130 |     memset(&attr, 0, sizeof(attr));
131 |     checkCudaErrors(cudaFuncSetCacheConfig(*func1, cudaFuncCachePreferShared));
132 |     checkCudaErrors(cudaFuncGetAttributes(&attr, *func1));
133 |     OUTPUT_ATTR(attr);
134 |     (*func1)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dOutput, a);
135 |     checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int)*N, cudaMemcpyDeviceToHost));
136 |     funcResult = check_func1(hInput, hOutput, a);
137 |     printf("simple_kernel(const int *pIn, int *pOut, int a) %s\n\n", funcResult ? "PASSED" : "FAILED");
138 |     testResult &= funcResult;
139 | 
140 |     // overload function 2
141 |     func2 = simple_kernel;
142 |     memset(&attr, 0, sizeof(attr));
143 |     checkCudaErrors(cudaFuncSetCacheConfig(*func2, cudaFuncCachePreferShared));
144 |     checkCudaErrors(cudaFuncGetAttributes(&attr, *func2));
145 |     OUTPUT_ATTR(attr);
146 |     (*func2)<<<DIV_UP(N, THREAD_N), THREAD_N>>>((int2 *)dInput, dOutput, a);
147 |     checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int)*N, cudaMemcpyDeviceToHost));
148 |     funcResult = check_func2(reinterpret_cast<int2 *>(hInput), hOutput, a);
149 |     printf("simple_kernel(const int2 *pIn, int *pOut, int a) %s\n\n", funcResult ? "PASSED" : "FAILED");
150 |     testResult &= funcResult;
151 | 
152 |     // overload function 3
153 |     func3 = simple_kernel;
154 |     memset(&attr, 0, sizeof(attr));
155 |     checkCudaErrors(cudaFuncSetCacheConfig(*func3, cudaFuncCachePreferShared));
156 |     checkCudaErrors(cudaFuncGetAttributes(&attr, *func3));
157 |     OUTPUT_ATTR(attr);
158 |     (*func3)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dInput+N, dOutput, a);
159 |     checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int)*N, cudaMemcpyDeviceToHost));
160 |     funcResult = check_func3(&hInput[0], &hInput[N], hOutput, a);
161 |     printf("simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) %s\n\n", funcResult ? "PASSED" : "FAILED");
162 |     testResult &= funcResult;
163 | 
164 |     checkCudaErrors(cudaFree(dInput));
165 |     checkCudaErrors(cudaFree(dOutput));
166 |     checkCudaErrors(cudaFreeHost(hOutput));
167 |     checkCudaErrors(cudaFreeHost(hInput));
168 | 
169 |     checkCudaErrors(cudaDeviceSynchronize());
170 | 
171 |     // cudaDeviceReset causes the driver to clean up all state. While
172 |     // not mandatory in normal operation, it is good practice.  It is also
173 |     // needed to ensure correct operation when the application is being
174 |     // profiled. Calling cudaDeviceReset causes all profile data to be
175 |     // flushed before the application exits
176 |     checkCudaErrors(cudaDeviceReset());
177 | 
178 |     exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
179 | }
180 | 


--------------------------------------------------------------------------------
/CUDA/matrixMul.cu:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | 
 12 | /**
 13 |  * Matrix multiplication: C = A * B.
 14 |  * Host code.
 15 |  *
 16 |  * This sample implements matrix multiplication as described in Chapter 3
 17 |  * of the programming guide.
 18 |  * It has been written for clarity of exposition to illustrate various CUDA
 19 |  * programming principles, not with the goal of providing the most
 20 |  * performant generic kernel for matrix multiplication.
 21 |  *
 22 |  * See also:
 23 |  * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
 24 |  * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
 25 |  * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
 26 |  */
 27 | 
 28 | // System includes
 29 | #include <stdio.h>
 30 | #include <assert.h>
 31 | 
 32 | // CUDA runtime
 33 | #include <cuda_runtime.h>
 34 | 
 35 | // Helper functions and utilities to work with CUDA
 36 | #include <helper_functions.h>
 37 | 
 38 | /**
 39 |  * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 40 |  * wA is A's width and wB is B's width
 41 |  */
 42 | template <int BLOCK_SIZE> __global__ void
 43 | matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
 44 | {
 45 |     // Block index
 46 |     int bx = blockIdx.x;
 47 |     int by = blockIdx.y;
 48 | 
 49 |     // Thread index
 50 |     int tx = threadIdx.x;
 51 |     int ty = threadIdx.y;
 52 | 
 53 |     // Index of the first sub-matrix of A processed by the block
 54 |     int aBegin = wA * BLOCK_SIZE * by;
 55 | 
 56 |     // Index of the last sub-matrix of A processed by the block
 57 |     int aEnd   = aBegin + wA - 1;
 58 | 
 59 |     // Step size used to iterate through the sub-matrices of A
 60 |     int aStep  = BLOCK_SIZE;
 61 | 
 62 |     // Index of the first sub-matrix of B processed by the block
 63 |     int bBegin = BLOCK_SIZE * bx;
 64 | 
 65 |     // Step size used to iterate through the sub-matrices of B
 66 |     int bStep  = BLOCK_SIZE * wB;
 67 | 
 68 |     // Csub is used to store the element of the block sub-matrix
 69 |     // that is computed by the thread
 70 |     float Csub = 0;
 71 | 
 72 |     // Loop over all the sub-matrices of A and B
 73 |     // required to compute the block sub-matrix
 74 |     for (int a = aBegin, b = bBegin;
 75 |          a <= aEnd;
 76 |          a += aStep, b += bStep)
 77 |     {
 78 | 
 79 |         // Declaration of the shared memory array As used to
 80 |         // store the sub-matrix of A
 81 |         __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
 82 | 
 83 |         // Declaration of the shared memory array Bs used to
 84 |         // store the sub-matrix of B
 85 |         __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
 86 | 
 87 |         // Load the matrices from device memory
 88 |         // to shared memory; each thread loads
 89 |         // one element of each matrix
 90 |         As[ty][tx] = A[a + wA * ty + tx];
 91 |         Bs[ty][tx] = B[b + wB * ty + tx];
 92 | 
 93 |         // Synchronize to make sure the matrices are loaded
 94 |         __syncthreads();
 95 | 
 96 |         // Multiply the two matrices together;
 97 |         // each thread computes one element
 98 |         // of the block sub-matrix
 99 | #pragma unroll
100 | 
101 |         for (int k = 0; k < BLOCK_SIZE; ++k)
102 |         {
103 |             Csub += As[ty][k] * Bs[k][tx];
104 |         }
105 | 
106 |         // Synchronize to make sure that the preceding
107 |         // computation is done before loading two new
108 |         // sub-matrices of A and B in the next iteration
109 |         __syncthreads();
110 |     }
111 | 
112 |     // Write the block sub-matrix to device memory;
113 |     // each thread writes one element
114 |     int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
115 |     C[c + wB * ty + tx] = Csub;
116 | }
117 | 
118 | void constantInit(float *data, int size, float val)
119 | {
120 |     for (int i = 0; i < size; ++i)
121 |     {
122 |         data[i] = val;
123 |     }
124 | }
125 | 
126 | /**
127 |  * Run a simple test of matrix multiplication using CUDA
128 |  */
129 | int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
130 | {
131 |     // Allocate host memory for matrices A and B
132 |     unsigned int size_A = dimsA.x * dimsA.y;
133 |     unsigned int mem_size_A = sizeof(float) * size_A;
134 |     float *h_A = (float *)malloc(mem_size_A);
135 |     unsigned int size_B = dimsB.x * dimsB.y;
136 |     unsigned int mem_size_B = sizeof(float) * size_B;
137 |     float *h_B = (float *)malloc(mem_size_B);
138 | 
139 |     // Initialize host memory
140 |     const float valB = 0.01f;
141 |     constantInit(h_A, size_A, 1.0f);
142 |     constantInit(h_B, size_B, valB);
143 | 
144 |     // Allocate device memory
145 |     float *d_A, *d_B, *d_C;
146 | 
147 |     // Allocate host matrix C
148 |     dim3 dimsC(dimsB.x, dimsA.y, 1);
149 |     unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
150 |     float *h_C = (float *) malloc(mem_size_C);
151 | 
152 |     if (h_C == NULL)
153 |     {
154 |         fprintf(stderr, "Failed to allocate host matrix C!\n");
155 |         exit(EXIT_FAILURE);
156 |     }
157 | 
158 |     cudaError_t error;
159 | 
160 |     error = cudaMalloc((void **) &d_A, mem_size_A);
161 | 
162 |     if (error != cudaSuccess)
163 |     {
164 |         printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
165 |         exit(EXIT_FAILURE);
166 |     }
167 | 
168 |     error = cudaMalloc((void **) &d_B, mem_size_B);
169 | 
170 |     if (error != cudaSuccess)
171 |     {
172 |         printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
173 |         exit(EXIT_FAILURE);
174 |     }
175 | 
176 |     error = cudaMalloc((void **) &d_C, mem_size_C);
177 | 
178 |     if (error != cudaSuccess)
179 |     {
180 |         printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
181 |         exit(EXIT_FAILURE);
182 |     }
183 | 
184 |     // copy host memory to device
185 |     error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
186 | 
187 |     if (error != cudaSuccess)
188 |     {
189 |         printf("cudaMemcpy (d_A,h_A) returned error code %d, line(%d)\n", error, __LINE__);
190 |         exit(EXIT_FAILURE);
191 |     }
192 | 
193 |     error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
194 | 
195 |     if (error != cudaSuccess)
196 |     {
197 |         printf("cudaMemcpy (d_B,h_B) returned error code %d, line(%d)\n", error, __LINE__);
198 |         exit(EXIT_FAILURE);
199 |     }
200 | 
201 |     // Setup execution parameters
202 |     dim3 threads(block_size, block_size);
203 |     dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
204 | 
205 |     // Create and start timer
206 |     printf("Computing result using CUDA Kernel...\n");
207 | 
208 |     // Performs warmup operation using matrixMul CUDA kernel
209 |     if (block_size == 16)
210 |     {
211 |         matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
212 |     }
213 |     else
214 |     {
215 |         matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
216 |     }
217 | 
218 |     printf("done\n");
219 | 
220 |     cudaDeviceSynchronize();
221 | 
222 |     // Allocate CUDA events that we'll use for timing
223 |     cudaEvent_t start;
224 |     error = cudaEventCreate(&start);
225 | 
226 |     if (error != cudaSuccess)
227 |     {
228 |         fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
229 |         exit(EXIT_FAILURE);
230 |     }
231 | 
232 |     cudaEvent_t stop;
233 |     error = cudaEventCreate(&stop);
234 | 
235 |     if (error != cudaSuccess)
236 |     {
237 |         fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
238 |         exit(EXIT_FAILURE);
239 |     }
240 | 
241 |     // Record the start event
242 |     error = cudaEventRecord(start, NULL);
243 | 
244 |     if (error != cudaSuccess)
245 |     {
246 |         fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
247 |         exit(EXIT_FAILURE);
248 |     }
249 | 
250 |     // Execute the kernel
251 |     int nIter = 300;
252 | 
253 |     for (int j = 0; j < nIter; j++)
254 |     {
255 |         if (block_size == 16)
256 |         {
257 |             matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
258 |         }
259 |         else
260 |         {
261 |             matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
262 |         }
263 |     }
264 | 
265 |     // Record the stop event
266 |     error = cudaEventRecord(stop, NULL);
267 | 
268 |     if (error != cudaSuccess)
269 |     {
270 |         fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
271 |         exit(EXIT_FAILURE);
272 |     }
273 | 
274 |     // Wait for the stop event to complete
275 |     error = cudaEventSynchronize(stop);
276 | 
277 |     if (error != cudaSuccess)
278 |     {
279 |         fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
280 |         exit(EXIT_FAILURE);
281 |     }
282 | 
283 |     float msecTotal = 0.0f;
284 |     error = cudaEventElapsedTime(&msecTotal, start, stop);
285 | 
286 |     if (error != cudaSuccess)
287 |     {
288 |         fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
289 |         exit(EXIT_FAILURE);
290 |     }
291 | 
292 |     // Compute and print the performance
293 |     float msecPerMatrixMul = msecTotal / nIter;
294 |     double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
295 |     double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
296 |     printf(
297 |         "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
298 |         gigaFlops,
299 |         msecPerMatrixMul,
300 |         flopsPerMatrixMul,
301 |         threads.x * threads.y);
302 | 
303 |     // Copy result from device to host
304 |     error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
305 | 
306 |     if (error != cudaSuccess)
307 |     {
308 |         printf("cudaMemcpy (h_C,d_C) returned error code %d, line(%d)\n", error, __LINE__);
309 |         exit(EXIT_FAILURE);
310 |     }
311 | 
312 |     printf("Checking computed result for correctness: ");
313 |     bool correct = true;
314 | 
315 |     // test relative error by the formula
316 |     //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
317 |     double eps = 1.e-6 ; // machine zero
318 | 
319 |     for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
320 |     {
321 |         double abs_err = fabs(h_C[i] - (dimsA.x * valB));
322 |         double dot_length = dimsA.x;
323 |         double abs_val = fabs(h_C[i]);
324 |         double rel_err = abs_err/abs_val/dot_length ;
325 | 
326 |         if (rel_err > eps)
327 |         {
328 |             printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps);
329 |             correct = false;
330 |         }
331 |     }
332 | 
333 |     printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
334 | 
335 |     // Clean up memory
336 |     free(h_A);
337 |     free(h_B);
338 |     free(h_C);
339 |     cudaFree(d_A);
340 |     cudaFree(d_B);
341 |     cudaFree(d_C);
342 | 
343 |     printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
344 | 
345 |     // cudaDeviceReset causes the driver to clean up all state. While
346 |     // not mandatory in normal operation, it is good practice.  It is also
347 |     // needed to ensure correct operation when the application is being
348 |     // profiled. Calling cudaDeviceReset causes all profile data to be
349 |     // flushed before the application exits
350 |     cudaDeviceReset();
351 | 
352 |     if (correct)
353 |     {
354 |         return EXIT_SUCCESS;
355 |     }
356 |     else
357 |     {
358 |         return EXIT_FAILURE;
359 |     }
360 | }
361 | 
362 | 
363 | /**
364 |  * Program main
365 |  */
366 | int main(int argc, char **argv)
367 | {
368 |     printf("[Matrix Multiply Using CUDA] - Starting...\n");
369 | 
370 |     if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
371 |         checkCmdLineFlag(argc, (const char **)argv, "?"))
372 |     {
373 |         printf("Usage -device=n (n >= 0 for deviceID)\n");
374 |         printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
375 |         printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
376 |         printf("  Note: Outer matrix dimensions of A & B matrices must be equal.\n");
377 | 
378 |         exit(EXIT_SUCCESS);
379 |     }
380 | 
381 |     // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
382 |     int devID = 0;
383 | 
384 |     if (checkCmdLineFlag(argc, (const char **)argv, "device"))
385 |     {
386 |         devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
387 |         cudaSetDevice(devID);
388 |     }
389 | 
390 |     cudaError_t error;
391 |     cudaDeviceProp deviceProp;
392 |     error = cudaGetDevice(&devID);
393 | 
394 |     if (error != cudaSuccess)
395 |     {
396 |         printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
397 |     }
398 | 
399 |     error = cudaGetDeviceProperties(&deviceProp, devID);
400 | 
401 |     if (deviceProp.computeMode == cudaComputeModeProhibited)
402 |     {
403 |         fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
404 |         exit(EXIT_SUCCESS);
405 |     }
406 | 
407 |     if (error != cudaSuccess)
408 |     {
409 |         printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
410 |     }
411 |     else
412 |     {
413 |         printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
414 |     }
415 | 
416 |     // Use a larger block size for Fermi and above
417 |     int block_size = (deviceProp.major < 2) ? 16 : 32;
418 | 
419 |     dim3 dimsA(5*2*block_size, 5*2*block_size, 1);
420 |     dim3 dimsB(5*4*block_size, 5*2*block_size, 1);
421 | 
422 |     // width of Matrix A
423 |     if (checkCmdLineFlag(argc, (const char **)argv, "wA"))
424 |     {
425 |         dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
426 |     }
427 | 
428 |     // height of Matrix A
429 |     if (checkCmdLineFlag(argc, (const char **)argv, "hA"))
430 |     {
431 |         dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
432 |     }
433 | 
434 |     // width of Matrix B
435 |     if (checkCmdLineFlag(argc, (const char **)argv, "wB"))
436 |     {
437 |         dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
438 |     }
439 | 
440 |     // height of Matrix B
441 |     if (checkCmdLineFlag(argc, (const char **)argv, "hB"))
442 |     {
443 |         dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
444 |     }
445 | 
446 |     if (dimsA.x != dimsB.y)
447 |     {
448 |         printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
449 |                dimsA.x, dimsB.y);
450 |         exit(EXIT_FAILURE);
451 |     }
452 | 
453 |     printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
454 | 
455 |     int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
456 | 
457 |     exit(matrix_result);
458 | }
459 | 


--------------------------------------------------------------------------------
/CUDA/simpleAssert.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | #ifdef _WIN32
 12 | #  define WINDOWS_LEAN_AND_MEAN
 13 | #  define NOMINMAX
 14 | #  include <windows.h>
 15 | #else
 16 | #  include <sys/utsname.h>
 17 | #endif
 18 | 
 19 | // Includes, system
 20 | #include <stdio.h>
 21 | #include <cassert>
 22 | 
 23 | // Includes CUDA
 24 | #include <cuda_runtime.h>
 25 | 
 26 | // Utilities and timing functions
 27 | #include <helper_functions.h>    // includes cuda.h and cuda_runtime_api.h
 28 | 
 29 | // CUDA helper functions
 30 | #include <helper_cuda.h>         // helper functions for CUDA error check
 31 | 
 32 | const char *sampleName = "simpleAssert";
 33 | 
 34 | ////////////////////////////////////////////////////////////////////////////////
 35 | // Auto-Verification Code
 36 | bool testResult = true;
 37 | 
 38 | ////////////////////////////////////////////////////////////////////////////////
 39 | // Kernels
 40 | ////////////////////////////////////////////////////////////////////////////////
 41 | //! Tests assert function.
 42 | //! Thread whose id > N will print assertion failed error message.
 43 | ////////////////////////////////////////////////////////////////////////////////
 44 | __global__ void testKernel(int N)
 45 | {
 46 |     int gtid = blockIdx.x*blockDim.x + threadIdx.x ;
 47 |     assert(gtid < N) ;
 48 | }
 49 | 
 50 | ////////////////////////////////////////////////////////////////////////////////
 51 | // Declaration, forward
 52 | void runTest(int argc, char **argv);
 53 | 
 54 | ////////////////////////////////////////////////////////////////////////////////
 55 | // Program main
 56 | ////////////////////////////////////////////////////////////////////////////////
 57 | int main(int argc, char **argv)
 58 | {
 59 |     printf("%s starting...\n", sampleName);
 60 | 
 61 |     runTest(argc, argv);
 62 | 
 63 |     // cudaDeviceReset causes the driver to clean up all state. While
 64 |     // not mandatory in normal operation, it is good practice.  It is also
 65 |     // needed to ensure correct operation when the application is being
 66 |     // profiled. Calling cudaDeviceReset causes all profile data to be
 67 |     // flushed before the application exits
 68 |     cudaDeviceReset();
 69 |     printf("%s completed, returned %s\n",
 70 |            sampleName,
 71 |            testResult ? "OK" : "ERROR!");
 72 |     exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 73 | }
 74 | 
 75 | void runTest(int argc, char **argv)
 76 | {
 77 |     int devID;
 78 |     int Nblocks = 2;
 79 |     int Nthreads = 32;
 80 |     cudaError_t error ;
 81 |     cudaDeviceProp deviceProp;
 82 | 
 83 | #ifndef _WIN32
 84 |     utsname OS_System_Type;
 85 |     uname(&OS_System_Type);
 86 | 
 87 |     printf("OS_System_Type.release = %s\n", OS_System_Type.release);
 88 | 
 89 |     if (!strcasecmp(OS_System_Type.sysname, "Darwin"))
 90 |     {
 91 |         printf("simpleAssert is not current supported on Mac OSX\n\n");
 92 |         exit(EXIT_SUCCESS);
 93 |     }
 94 |     else
 95 |     {
 96 |         printf("OS Info: <%s>\n\n", OS_System_Type.version);
 97 |     }
 98 | 
 99 | #endif
100 | 
101 |     // This will pick the best possible CUDA capable device
102 |     devID = findCudaDevice(argc, (const char **)argv);
103 | 
104 |     checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
105 | 
106 |     if (deviceProp.major < 2)
107 |     {
108 |         printf("simpleAssert requires a GPU with compute capability "
109 |                "2.0 or later, exiting...\n");
110 | 
111 |         // cudaDeviceReset causes the driver to clean up all state. While
112 |         // not mandatory in normal operation, it is good practice.  It is also
113 |         // needed to ensure correct operation when the application is being
114 |         // profiled. Calling cudaDeviceReset causes all profile data to be
115 |         // flushed before the application exits
116 |         cudaDeviceReset();
117 |         exit(EXIT_SUCCESS);
118 |     }
119 | 
120 |     // Kernel configuration, where a one-dimensional
121 |     // grid and one-dimensional blocks are configured.
122 |     dim3 dimGrid(Nblocks);
123 |     dim3 dimBlock(Nthreads);
124 | 
125 |     printf("Launch kernel to generate assertion failures\n");
126 |     testKernel<<<dimGrid, dimBlock>>>(60);
127 | 
128 |     //Synchronize (flushes assert output).
129 |     printf("\n-- Begin assert output\n\n");
130 |     error = cudaDeviceSynchronize();
131 |     printf("\n-- End assert output\n\n");
132 | 
133 |     //Check for errors and failed asserts in asynchronous kernel launch.
134 |     if (error == cudaErrorAssert)
135 |     {
136 |         printf("Device assert failed as expected, "
137 |                "CUDA error message is: %s\n\n",
138 |                cudaGetErrorString(error));
139 |     }
140 | 
141 | 
142 |     testResult = error == cudaErrorAssert;
143 | }
144 | 


--------------------------------------------------------------------------------
/CUDA/simpleAssert_kernel.cu:
--------------------------------------------------------------------------------
 1 | 
 2 | ////////////////////////////////////////////////////////////////////////////////
 3 | // Kernels
 4 | ////////////////////////////////////////////////////////////////////////////////
 5 | //! Tests assert function.
 6 | //! Thread whose id > N will print assertion failed error message.
 7 | ////////////////////////////////////////////////////////////////////////////////
 8 | 
 9 | 
10 | extern "C" __global__ void testKernel(int N)
11 | {
12 |     int gtid = blockIdx.x*blockDim.x + threadIdx.x ;
13 |     assert(gtid < N) ;
14 | }
15 | 
16 | 


--------------------------------------------------------------------------------
/CUDA/template_runtime.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | 
 12 | /* Template project which demonstrates the basics on how to setup a project
 13 | * example application, doesn't use cutil library.
 14 | */
 15 | 
 16 | #include <stdio.h>
 17 | #include <string.h>
 18 | #include <iostream>
 19 | 
 20 | using namespace std;
 21 | 
 22 | #ifdef _WIN32
 23 | #define STRCASECMP  _stricmp
 24 | #define STRNCASECMP _strnicmp
 25 | #else
 26 | #define STRCASECMP  strcasecmp
 27 | #define STRNCASECMP strncasecmp
 28 | #endif
 29 | 
 30 | #define ASSERT(x, msg, retcode) \
 31 |     if (!(x)) \
 32 |     { \
 33 |         cout << msg << " " << __FILE__ << ":" << __LINE__ << endl; \
 34 |         return retcode; \
 35 |     }
 36 | 
 37 | __global__ void sequence_gpu(int *d_ptr, int length)
 38 | {
 39 |     int elemID = blockIdx.x * blockDim.x + threadIdx.x;
 40 | 
 41 |     if (elemID < length)
 42 |     {
 43 |         d_ptr[elemID] = elemID;
 44 |     }
 45 | }
 46 | 
 47 | 
 48 | void sequence_cpu(int *h_ptr, int length)
 49 | {
 50 |     for (int elemID=0; elemID<length; elemID++)
 51 |     {
 52 |         h_ptr[elemID] = elemID;
 53 |     }
 54 | }
 55 | 
 56 | int main(int argc, char **argv)
 57 | {
 58 |     printf("%s Starting...\n\n", argv[0]);
 59 | 
 60 |     cout << "CUDA Runtime API template" << endl;
 61 |     cout << "=========================" << endl;
 62 |     cout << "Self-test started" << endl;
 63 | 
 64 |     const int N = 100;
 65 | 
 66 |     int *d_ptr;
 67 |     ASSERT(cudaSuccess == cudaMalloc(&d_ptr, N * sizeof(int)), "Device allocation of " << N << " ints failed", -1);
 68 | 
 69 |     int *h_ptr;
 70 |     ASSERT(cudaSuccess == cudaMallocHost(&h_ptr, N * sizeof(int)), "Host allocation of "   << N << " ints failed", -1);
 71 | 
 72 |     cout << "Memory allocated successfully" << endl;
 73 | 
 74 |     dim3 cudaBlockSize(32,1,1);
 75 |     dim3 cudaGridSize((N + cudaBlockSize.x - 1) / cudaBlockSize.x, 1, 1);
 76 |     sequence_gpu<<<cudaGridSize, cudaBlockSize>>>(d_ptr, N);
 77 |     ASSERT(cudaSuccess == cudaGetLastError(), "Kernel launch failed", -1);
 78 |     ASSERT(cudaSuccess == cudaDeviceSynchronize(), "Kernel synchronization failed", -1);
 79 | 
 80 |     sequence_cpu(h_ptr, N);
 81 | 
 82 |     cout << "CUDA and CPU algorithm implementations finished" << endl;
 83 | 
 84 |     int *h_d_ptr;
 85 |     ASSERT(cudaSuccess == cudaMallocHost(&h_d_ptr, N *sizeof(int)), "Host allocation of " << N << " ints failed", -1);
 86 |     ASSERT(cudaSuccess == cudaMemcpy(h_d_ptr, d_ptr, N *sizeof(int), cudaMemcpyDeviceToHost), "Copy of " << N << " ints from device to host failed", -1);
 87 |     bool bValid = true;
 88 | 
 89 |     for (int i=0; i<N && bValid; i++)
 90 |     {
 91 |         if (h_ptr[i] != h_d_ptr[i])
 92 |         {
 93 |             bValid = false;
 94 |         }
 95 |     }
 96 | 
 97 |     ASSERT(cudaSuccess == cudaFree(d_ptr),       "Device deallocation failed", -1);
 98 |     ASSERT(cudaSuccess == cudaFreeHost(h_ptr),   "Host deallocation failed",   -1);
 99 |     ASSERT(cudaSuccess == cudaFreeHost(h_d_ptr), "Host deallocation failed",   -1);
100 | 
101 |     cout << "Memory deallocated successfully" << endl;
102 |     cout << "TEST Results " << endl;
103 | 
104 |     exit(bValid ? EXIT_SUCCESS : EXIT_FAILURE);
105 | }
106 | 


--------------------------------------------------------------------------------
/CUDA/vectorAdd.cu:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | 
 12 | /**
 13 |  * Vector addition: C = A + B.
 14 |  *
 15 |  * This sample is a very basic sample that implements element by element
 16 |  * vector addition. It is the same as the sample illustrating Chapter 2
 17 |  * of the programming guide with some additions like error checking.
 18 |  */
 19 | 
 20 | #include <stdio.h>
 21 | 
 22 | // For the CUDA runtime routines (prefixed with "cuda_")
 23 | #include <cuda_runtime.h>
 24 | 
 25 | /**
 26 |  * CUDA Kernel Device code
 27 |  *
 28 |  * Computes the vector addition of A and B into C. The 3 vectors have the same
 29 |  * number of elements numElements.
 30 |  */
 31 | __global__ void
 32 | vectorAdd(const float *A, const float *B, float *C, int numElements)
 33 | {
 34 |     int i = blockDim.x * blockIdx.x + threadIdx.x;
 35 | 
 36 |     if (i < numElements)
 37 |     {
 38 |         C[i] = A[i] + B[i];
 39 |     }
 40 | }
 41 | 
 42 | /**
 43 |  * Host main routine
 44 |  */
 45 | int
 46 | main(void)
 47 | {
 48 |     // Error code to check return values for CUDA calls
 49 |     cudaError_t err = cudaSuccess;
 50 | 
 51 |     // Print the vector length to be used, and compute its size
 52 |     int numElements = 50000;
 53 |     size_t size = numElements * sizeof(float);
 54 |     printf("[Vector addition of %d elements]\n", numElements);
 55 | 
 56 |     // Allocate the host input vector A
 57 |     float *h_A = (float *)malloc(size);
 58 | 
 59 |     // Allocate the host input vector B
 60 |     float *h_B = (float *)malloc(size);
 61 | 
 62 |     // Allocate the host output vector C
 63 |     float *h_C = (float *)malloc(size);
 64 | 
 65 |     // Verify that allocations succeeded
 66 |     if (h_A == NULL || h_B == NULL || h_C == NULL)
 67 |     {
 68 |         fprintf(stderr, "Failed to allocate host vectors!\n");
 69 |         exit(EXIT_FAILURE);
 70 |     }
 71 | 
 72 |     // Initialize the host input vectors
 73 |     for (int i = 0; i < numElements; ++i)
 74 |     {
 75 |         h_A[i] = rand()/(float)RAND_MAX;
 76 |         h_B[i] = rand()/(float)RAND_MAX;
 77 |     }
 78 | 
 79 |     // Allocate the device input vector A
 80 |     float *d_A = NULL;
 81 |     err = cudaMalloc((void **)&d_A, size);
 82 | 
 83 |     if (err != cudaSuccess)
 84 |     {
 85 |         fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
 86 |         exit(EXIT_FAILURE);
 87 |     }
 88 | 
 89 |     // Allocate the device input vector B
 90 |     float *d_B = NULL;
 91 |     err = cudaMalloc((void **)&d_B, size);
 92 | 
 93 |     if (err != cudaSuccess)
 94 |     {
 95 |         fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
 96 |         exit(EXIT_FAILURE);
 97 |     }
 98 | 
 99 |     // Allocate the device output vector C
100 |     float *d_C = NULL;
101 |     err = cudaMalloc((void **)&d_C, size);
102 | 
103 |     if (err != cudaSuccess)
104 |     {
105 |         fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
106 |         exit(EXIT_FAILURE);
107 |     }
108 | 
109 |     // Copy the host input vectors A and B in host memory to the device input vectors in
110 |     // device memory
111 |     printf("Copy input data from the host memory to the CUDA device\n");
112 |     err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
113 | 
114 |     if (err != cudaSuccess)
115 |     {
116 |         fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
117 |         exit(EXIT_FAILURE);
118 |     }
119 | 
120 |     err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
121 | 
122 |     if (err != cudaSuccess)
123 |     {
124 |         fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
125 |         exit(EXIT_FAILURE);
126 |     }
127 | 
128 |     // Launch the Vector Add CUDA Kernel
129 |     int threadsPerBlock = 256;
130 |     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
131 |     printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
132 |     vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
133 |     err = cudaGetLastError();
134 | 
135 |     if (err != cudaSuccess)
136 |     {
137 |         fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
138 |         exit(EXIT_FAILURE);
139 |     }
140 | 
141 |     // Copy the device result vector in device memory to the host result vector
142 |     // in host memory.
143 |     printf("Copy output data from the CUDA device to the host memory\n");
144 |     err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
145 | 
146 |     if (err != cudaSuccess)
147 |     {
148 |         fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
149 |         exit(EXIT_FAILURE);
150 |     }
151 | 
152 |     // Verify that the result vector is correct
153 |     for (int i = 0; i < numElements; ++i)
154 |     {
155 |         if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
156 |         {
157 |             fprintf(stderr, "Result verification failed at element %d!\n", i);
158 |             exit(EXIT_FAILURE);
159 |         }
160 |     }
161 | 
162 |     printf("Test PASSED\n");
163 | 
164 |     // Free device global memory
165 |     err = cudaFree(d_A);
166 | 
167 |     if (err != cudaSuccess)
168 |     {
169 |         fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
170 |         exit(EXIT_FAILURE);
171 |     }
172 | 
173 |     err = cudaFree(d_B);
174 | 
175 |     if (err != cudaSuccess)
176 |     {
177 |         fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
178 |         exit(EXIT_FAILURE);
179 |     }
180 | 
181 |     err = cudaFree(d_C);
182 | 
183 |     if (err != cudaSuccess)
184 |     {
185 |         fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
186 |         exit(EXIT_FAILURE);
187 |     }
188 | 
189 |     // Free host memory
190 |     free(h_A);
191 |     free(h_B);
192 |     free(h_C);
193 | 
194 |     // Reset the device and exit
195 |     // cudaDeviceReset causes the driver to clean up all state. While
196 |     // not mandatory in normal operation, it is good practice.  It is also
197 |     // needed to ensure correct operation when the application is being
198 |     // profiled. Calling cudaDeviceReset causes all profile data to be
199 |     // flushed before the application exits
200 |     err = cudaDeviceReset();
201 | 
202 |     if (err != cudaSuccess)
203 |     {
204 |         fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
205 |         exit(EXIT_FAILURE);
206 |     }
207 | 
208 |     printf("Done\n");
209 |     return 0;
210 | }
211 | 
212 | 


--------------------------------------------------------------------------------
/MPI/SimpleSendRcv.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <mpi.h>
 4 | #include <math.h>
 5 | 
 6 | int main(argc,argv)
 7 | int argc;
 8 | char *argv[];
 9 | {
10 |     int myid, numprocs;
11 |     int tag,source,destination,count;
12 |     int buffer;
13 |     MPI_Status status;
14 |  
15 |     MPI_Init(&argc,&argv);
16 |     MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
17 |     MPI_Comm_rank(MPI_COMM_WORLD,&myid);
18 |     tag=1234;
19 |     source=0;
20 |     destination=1;
21 |     count=1;
22 |     if(myid == source){
23 |       buffer=5678;
24 |       MPI_Send(&buffer,count,MPI_INT,destination,tag,MPI_COMM_WORLD);
25 |       printf("processor %d  sent %d\n",myid,buffer);
26 |     }
27 |     if(myid == destination){
28 |         MPI_Recv(&buffer,count,MPI_INT,source,tag,MPI_COMM_WORLD,&status);
29 |         printf("processor %d  got %d\n",myid,buffer);
30 |     }
31 |     MPI_Finalize();
32 | }
33 | 


--------------------------------------------------------------------------------
/MPI/array_prod.c:
--------------------------------------------------------------------------------
 1 | #include<mpi.h>
 2 | #include<stdlib.h>
 3 | #include<stdio.h>
 4 | 
 5 | int main(int argc,char** argv){
 6 | 	int rank,size,epp;
 7 | 	int* A1=NULL;
 8 | 	int* A2=NULL;
 9 | 	int* Rec1=NULL;
10 | 	int* Rec2=NULL;
11 | 	int i,n;
12 | 	MPI_Init(NULL,NULL);
13 | 	MPI_Comm_rank(MPI_COMM_WORLD,&rank);
14 | 	MPI_Comm_size(MPI_COMM_WORLD,&size);
15 | 	if(rank==0){
16 | 			printf("Enter size of arrays...\n");
17 | 			scanf("%d",&n);
18 | 			epp=n/size+(n%size==0?0:1);
19 | 			A1=(int*)malloc(sizeof(int)*size*epp);
20 | 			A2=(int*)malloc(sizeof(int)*size*epp);
21 | 			for(i=0;i<n;i++){
22 | 				printf("Enter A1[%d]:",i);
23 | 				scanf("%d",&A1[i]);
24 | 				printf("Enter A2[%d]:",i);
25 | 				scanf("%d",&A2[i]);
26 | 			}
27 | 			for(i=n;i<size*epp;i++){
28 | 				A1[i]=0;
29 | 				A2[i]=0;
30 | 			}
31 | 	}
32 | 	MPI_Bcast(&epp,1,MPI_INT,0,MPI_COMM_WORLD);
33 | 	Rec1=(int*)malloc(sizeof(int)*epp);
34 | 	Rec2=(int*)malloc(sizeof(int)*epp);
35 | 	MPI_Scatter(A1,epp,MPI_INT,Rec1,epp,MPI_INT,0,MPI_COMM_WORLD);
36 | 	MPI_Scatter(A2,epp,MPI_INT,Rec2,epp,MPI_INT,0,MPI_COMM_WORLD);
37 | 	for(i=0;i<epp;i++)
38 | 		Rec1[i]*=Rec2[i];
39 | 	MPI_Gather(Rec1,epp,MPI_INT,A1,epp,MPI_INT,0,MPI_COMM_WORLD);
40 | 	if(rank==0){
41 | 		printf("Data gathered : ");
42 | 		for(i=0;i<n;i++)
43 | 			printf("%d ",A1[i]);
44 | 		printf("\n");
45 | 	}
46 | 	MPI_Finalize();
47 | }
48 | 
49 | 


--------------------------------------------------------------------------------
/MPI/average_reduce.c:
--------------------------------------------------------------------------------
 1 | #include<mpi.h>
 2 | #include<stdlib.h>
 3 | #include<stdio.h>
 4 | 
 5 | int main(int argc,char** argv){
 6 | 	int rank,size,epp;
 7 | 	int* dataSend=NULL;
 8 | 	int i;
 9 | 	MPI_Init(NULL,NULL);
10 | 	MPI_Comm_rank(MPI_COMM_WORLD,&rank);
11 | 	MPI_Comm_size(MPI_COMM_WORLD,&size);
12 | 	epp=2;
13 | 	if(rank==0){
14 | 			printf("Master creating data...\n");
15 | 			dataSend=(int*)malloc(sizeof(int)*size*epp);
16 | 			for(i=0;i<epp*size;i++)
17 | 				dataSend[i]=i;
18 | 	}
19 | 	int* dataRecv1=(int*)malloc(sizeof(int)*epp);
20 | 	MPI_Scatter(dataSend,epp,MPI_INT,dataRecv1,epp,MPI_INT,0,MPI_COMM_WORLD);
21 | 	float subavg=0.0f;
22 | 	for(i=0;i<epp;i++)
23 | 		subavg+=dataRecv1[i];
24 | 	subavg/=epp;
25 | 	printf("%d calculates subavg as %f\n",rank,subavg);
26 | 	float finalAvg;
27 | 	MPI_Reduce(&subavg,&finalAvg,1,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD);
28 | 	if(rank==0){
29 | 		printf("The average is : %f",finalAvg/size);
30 | 	}
31 | 	MPI_Finalize();
32 | }
33 | 
34 | 


--------------------------------------------------------------------------------
/MPI/average_scatter.c:
--------------------------------------------------------------------------------
 1 | #include<mpi.h>
 2 | #include<stdlib.h>
 3 | #include<stdio.h>
 4 | 
 5 | int main(int argc,char** argv){
 6 | 	int rank,size,epp;
 7 | 	int* dataSend=NULL;
 8 | 	int i;
 9 | 	MPI_Init(NULL,NULL);
10 | 	MPI_Comm_rank(MPI_COMM_WORLD,&rank);
11 | 	MPI_Comm_size(MPI_COMM_WORLD,&size);
12 | 	epp=2;
13 | 	if(rank==0){
14 | 			printf("Master creating data...\n");
15 | 			dataSend=(int*)malloc(sizeof(int)*size*epp);
16 | 			for(i=0;i<epp*size;i++)
17 | 				dataSend[i]=i;
18 | 	}
19 | 	int* dataRecv1=(int*)malloc(sizeof(int)*epp);
20 | 	MPI_Scatter(dataSend,epp,MPI_INT,dataRecv1,epp,MPI_INT,0,MPI_COMM_WORLD);
21 | 	float subavg=0.0f;
22 | 	for(i=0;i<epp;i++)
23 | 		subavg+=dataRecv1[i];
24 | 	subavg/=epp;
25 | 	printf("%d calculates subavg as %f\n",rank,subavg);
26 | 	float* dataRecv2=NULL;
27 | 	if(rank==0){
28 | 		dataRecv2=(float*)malloc(sizeof(float)*size);
29 | 	}
30 | 	MPI_Gather(&subavg,1,MPI_FLOAT,dataRecv2,1,MPI_FLOAT,0,MPI_COMM_WORLD);
31 | 	if(rank==0){
32 | 			float finalavg=0.0;
33 | 			for(i=0;i<size;i++){
34 | 				finalavg+=dataRecv2[i];
35 | 			}
36 | 			finalavg/=size;
37 | 			printf("Final avg at Master : %f\n",finalavg);
38 | 	}
39 | 	MPI_Finalize();
40 | }
41 | 
42 | 


--------------------------------------------------------------------------------
/MPI/factorial.c:
--------------------------------------------------------------------------------
 1 | /* simple parallel factorial calculator. Only useful
 2 |  * to illustrate collective communication :)
 3 |  *
 4 |  * Hector Urtubia
 5 |  */
 6 | 
 7 | #include <stdio.h>
 8 | #include "mpi.h"
 9 | 
10 | int main(int argc, char *argv[]){
11 |     int myRank;
12 |     int size;
13 |     int fact;
14 |     int lower,upper;
15 |     int i;
16 |     double local_result = 1.0;
17 |     double total;
18 | 
19 |     /* initialize MPI */
20 |     MPI_Init(&argc,&argv);
21 |     /* get my rank and the size of the communicator */
22 |     MPI_Comm_rank(MPI_COMM_WORLD, &myRank);
23 |     MPI_Comm_size(MPI_COMM_WORLD, &size);
24 | 
25 |     /* get the input. (only if i have rank 0) */
26 |     if(myRank==0){
27 |         printf("Enter a number:");
28 |         scanf("%d",&fact);
29 |     }
30 |     /* since only the process with rank 0 has the input,
31 |      * we must pass it to all the other processes. */
32 | 
33 |     MPI_Bcast(&fact, /* in/out parameter */
34 |               1,     /* count */
35 |               MPI_INT, /* datatype */
36 |               0,       /* root */
37 |               MPI_COMM_WORLD); /* communicator */
38 | 
39 |     /* calculate the upper and lower boundaries
40 |      * for each process
41 |      */
42 |     if(myRank==0){
43 |         lower = 1;
44 |     }else
45 |         lower = myRank * (fact / size) + 1;
46 |     if(myRank==(size-1))
47 |         upper = fact;
48 |     else
49 |         upper = (myRank + 1) * (fact / size);
50 | 
51 |     /* now that we know upper and lower, do the
52 |      * multiplication in our local area
53 |      */
54 |     for(i=lower;i<=upper;i++){
55 |         local_result = local_result * (double)i;
56 |     }
57 | 
58 |     /* combine all the local results by multiplying them 
59 |      * together
60 |      */
61 |     MPI_Reduce(&local_result, /* operand */
62 |                &total,        /* result */
63 |                1,             /* count */
64 |                MPI_DOUBLE,    /* datatype */
65 |                MPI_PROD,      /* operator */
66 |                0,             /* root rank */
67 |                MPI_COMM_WORLD); /* communicator */
68 | 
69 |     /* give the output to the user */
70 |     if(myRank==0){
71 |         printf("The factorial of %d is %lf, and was calculated using %d processes\n",fact,total,size);
72 |     }
73 | 
74 |     /* shut down MPI */
75 |     MPI_Finalize();
76 | 
77 |     return 0;
78 | }
79 | 


--------------------------------------------------------------------------------
/MPI/lognSum.c:
--------------------------------------------------------------------------------
 1 | #include<mpi.h>
 2 | #include<stdlib.h>
 3 | #include<stdio.h>
 4 | 
 5 | int main(int argc,char** argv){
 6 | 	int rank,size;
 7 | 	int i,n;
 8 | 	int* A=NULL;
 9 | 	
10 | 	int D[2],sum;
11 | 	MPI_Init(NULL,NULL);
12 | 	MPI_Comm_rank(MPI_COMM_WORLD,&rank);
13 | 	MPI_Comm_size(MPI_COMM_WORLD,&size);
14 | 	int ctr=size*2;
15 | 	A=(int*)malloc(sizeof(int)*size*2);
16 | 	if(rank==0){
17 | 			printf("Enter %d Elements :\n",size*2);
18 | 			for(i=0;i<size*2;i++)
19 | 				scanf("%d",&A[i]);
20 | 	}
21 | 	while(ctr!=1){		
22 | 		MPI_Scatter(A,2,MPI_INT,D,2,MPI_INT,0,MPI_COMM_WORLD);
23 | 		sum=D[0]+D[1];
24 | 		MPI_Gather(&sum,1,MPI_INT,A,1,MPI_INT,0,MPI_COMM_WORLD);
25 | 		ctr/=2;
26 | 	}
27 | 	if(rank==0)
28 | 		printf("Total Sum : %d\n",A[0]);
29 | 	MPI_Finalize();
30 | }
31 | 
32 | 


--------------------------------------------------------------------------------
/MPI/matrixMult.c:
--------------------------------------------------------------------------------
 1 | // O(n^2) Matrix Multiplication (Using N processors)
 2 | 
 3 | #include <mpi.h>
 4 | #include <stdio.h>
 5 | 
 6 | #define SIZE 12			/* Size of matrices */
 7 | 
 8 | int A[SIZE][SIZE], B[SIZE][SIZE], C[SIZE][SIZE];
 9 | 
10 | void print_matrix(int m[SIZE][SIZE], int n)
11 | {
12 |   int i, j = 0;
13 |   for (i=0; i<n; i++) {
14 |     printf("\n\t| ");
15 |     for (j=0; j<n; j++)
16 |       printf("%2d ", m[i][j]);
17 |     printf("|");
18 |   }
19 | }
20 | 
21 | 
22 | int main(int argc, char *argv[])
23 | {
24 |   int myrank, P, from, to, i, j, k, n, from1, to1;
25 |   int tag = 666;		/* any value will do */
26 |   MPI_Status status;
27 |   
28 |   MPI_Init (&argc, &argv);
29 |   MPI_Comm_rank(MPI_COMM_WORLD, &myrank);	/* who am i */
30 |   MPI_Comm_size(MPI_COMM_WORLD, &P); /* number of processors */
31 | 
32 |   if (myrank==0){
33 | 	  printf("Enter N : ");
34 | 	  scanf("%d",&n);
35 |   }
36 |   MPI_Bcast (&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
37 |   /* Just to use the simple variants of MPI_Gather and MPI_Scatter we */
38 |   /* impose that SIZE is divisible by P. By using the vector versions, */
39 |   /* (MPI_Gatherv and MPI_Scatterv) it is easy to drop this restriction. */
40 | 
41 |   if (P != n) {
42 |     if (myrank==0) printf("Number of processors not divisible by matrix size.\n");
43 |     MPI_Finalize();
44 |     exit(-1);
45 |   }
46 | 
47 |   from = myrank * (n*n)/P;
48 |   to = (myrank+1) * (n*n)/P;
49 | 
50 |   /* Process 0 fills the input matrices and broadcasts them to the rest */
51 |   /* (actually, only the relevant stripe of A is sent to each process) */
52 | 
53 |   if (myrank==0) {
54 |   	printf("Fill Matrix A : \n");
55 |     for (i=0; i<n; i++)
56 |     	for (j=0; j<n; j++)
57 |        		scanf("%d",&A[i][j]);
58 |     printf("Fill Matrix B : \n");
59 |     for (i=0; i<n; i++)
60 |     	for (j=0; j<n; j++)
61 |        		scanf("%d",&B[i][j]);	
62 |   }
63 | 
64 |   MPI_Bcast (B, n*n, MPI_INT, 0, MPI_COMM_WORLD);
65 |   MPI_Scatter (A, n*n/P, MPI_INT, A[from], n*n/P, MPI_INT, 0, MPI_COMM_WORLD);
66 | 
67 |   printf("computing slice %d (from element %d to %d)\n", myrank, from+1, to);
68 |   for (i=from; i<to; i++) 
69 |     for (j=n; j<n; j++) {
70 |       C[i][j]=0;
71 |       for (k=0; k<n; k++)
72 | 		C[i][j] += A[i][k]*B[k][j];
73 |     }
74 | 
75 |   MPI_Gather (C[from], n*n/P, MPI_INT, C, n*n/P, MPI_INT, 0, MPI_COMM_WORLD);
76 | 
77 |   if (myrank==0) {
78 |     printf("\n\n");
79 |     print_matrix(A,n);
80 |     printf("\n\n\t       * \n");
81 |     print_matrix(B,n);
82 |     printf("\n\n\t       = \n");
83 |     print_matrix(C,n);
84 |     printf("\n\n");
85 |   }
86 | 
87 |   MPI_Finalize();
88 |   return 0;
89 | }


--------------------------------------------------------------------------------
/MPI/mpi_hello_world.c:
--------------------------------------------------------------------------------
 1 | #include <mpi.h>
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | 
 5 | int main(int argc, char** argv) {
 6 |   MPI_Init(NULL, NULL);
 7 |   int world_size;
 8 |   MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 9 |   int world_rank;
10 |   MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
11 |   char processor_name[MPI_MAX_PROCESSOR_NAME];
12 |   int name_len;
13 |   MPI_Get_processor_name(processor_name, &name_len);
14 |   printf("Hello world from processor %s, rank %d out of %d processors\n",
15 |          processor_name, world_rank, world_size);
16 |   MPI_Finalize();
17 | }
18 | 


--------------------------------------------------------------------------------
/MPI/mpibcast.c:
--------------------------------------------------------------------------------
 1 | #include<mpi.h>
 2 | #include<stdio.h>
 3 | 
 4 | int main(int argc,char** argv){
 5 | 	int rank,size;
 6 | 	int data;
 7 | 	MPI_Init(NULL,NULL);
 8 | 	MPI_Comm_rank(MPI_COMM_WORLD,&rank);
 9 | 	MPI_Comm_size(MPI_COMM_WORLD,&size);
10 | 	if(rank==0)
11 | 		data=9000;
12 | 	MPI_Bcast(&data,1,MPI_INT,0,MPI_COMM_WORLD);
13 | 		if(rank==0){
14 | 			printf("Master sends %d\n",data);
15 | 		}
16 | 		else{
17 | 			printf("Slave %d recieves %d\n",rank,data);
18 | 		}
19 | 	MPI_Finalize();
20 | }
21 | 


--------------------------------------------------------------------------------
/MPI/pieCalculation.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <math.h>
 4 | #define M_PI		3.14159265358979323846
 5 | #include <mpi.h>
 6 | 
 7 | /* main program */
 8 | 
 9 | int main(int argc, char* argv[]) {
10 | 
11 |     char* usage_fmt = "usage:  %s number_of_samples seed\n";
12 |     char* end_ptr_for_strtol;
13 | 
14 |     /* initialize for MPI */
15 |     if (MPI_Init(&argc, &argv) != MPI_SUCCESS) {
16 |         fprintf(stderr, "MPI initialization error\n");
17 |         return EXIT_FAILURE;
18 |     }
19 |     int nprocs, myID;
20 |     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
21 |     MPI_Comm_rank(MPI_COMM_WORLD, &myID);
22 | 
23 |     /* process command-line arguments */
24 |     if (argc != 3)  {
25 |         fprintf(stderr, usage_fmt, argv[0]);
26 |         MPI_Finalize(); exit(EXIT_FAILURE);
27 |     }
28 |     long num_samples = strtol(argv[1], &end_ptr_for_strtol, 10);
29 |     if (*end_ptr_for_strtol != '\0') {
30 |         fprintf(stderr, usage_fmt, argv[0]);
31 |         exit(EXIT_FAILURE);
32 |     }
33 |     long seed = strtol(argv[2], &end_ptr_for_strtol, 10);
34 |     if (*end_ptr_for_strtol != '\0') {
35 |         fprintf(stderr, usage_fmt, argv[0]);
36 |         exit(EXIT_FAILURE);
37 |     }
38 | 
39 |     /* do calculation */ 
40 |     srand((unsigned int) seed);
41 |     int count = 0;
42 |     int local_count = 0;
43 |     int i;
44 |     double x, y;
45 |     double pi = 0;
46 |     for (i = myID; i < num_samples; i += nprocs) {
47 |         x = (double) rand() / (double) (RAND_MAX);
48 |         y = (double) rand() / (double) (RAND_MAX);
49 |         if ((x*x + y*y) <= 1.0)
50 |             ++local_count;
51 |     }
52 |     MPI_Reduce(&local_count, &count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
53 |     if (myID == 0)
54 |         pi = 4.0 * (double) count / (double) num_samples;
55 | 
56 |     if (myID == 0) {
57 |         printf("MPI program results with %d processes:\n", nprocs);
58 |         printf("number of samples = %ld, seed = %ld\n", num_samples, seed);
59 |         printf("estimated pi = %12.10f\n", pi);
60 |         printf("difference between estimated pi and math.h M_PI = %12.10f\n", 
61 |                 fabs(pi - M_PI));
62 |     }
63 | 
64 |     /* clean up and return */
65 |     MPI_Finalize();
66 |     return EXIT_SUCCESS;
67 | }
68 | 


--------------------------------------------------------------------------------
/OMP/Critical.c:
--------------------------------------------------------------------------------
 1 | #include<omp.h>
 2 | #include<stdio.h>
 3 | 
 4 | int main(){
 5 | 	int i=0;
 6 | 	omp_set_num_threads(5);
 7 | 	#pragma omp parallel
 8 | 	{
 9 | 		printf("A(%d)\n",omp_get_thread_num());
10 | 		printf("B(%d)\n",omp_get_thread_num());
11 | 		printf("C(%d)\n",omp_get_thread_num());	
12 | 		#pragma omp critical
13 | 		{
14 | 			printf("This is Critcal (%d) ",omp_get_thread_num());
15 | 			printf("For (%d)",omp_get_thread_num());
16 | 			printf("the thread (%d) \n",omp_get_thread_num());
17 | 		}
18 | 		printf("D(%d)\n",omp_get_thread_num());
19 | 		printf("E(%d)\n",omp_get_thread_num());
20 | 	}
21 | 	return 0;
22 | }
23 | 


--------------------------------------------------------------------------------
/OMP/Fibonacci.c:
--------------------------------------------------------------------------------
 1 | #include<omp.h>
 2 | #include<stdio.h>
 3 | 
 4 | int fibonacci(int x){
 5 | 	int f[2],i;			
 6 | 	if(x==1){
 7 | 		return 0;	
 8 | 	}
 9 | 	else if(x==2){
10 | 		return 1;
11 | 	}
12 | 	else {
13 | 		#pragma omp parallel for
14 | 		for(i=1;i<=2;i++){
15 | 			f[i-1]=fibonacci(x-i);		
16 | 			printf("Thread %d calculates term %d\n",omp_get_thread_num(),x-i);		
17 | 		}
18 | 		return f[0]+f[1];		
19 | 	}
20 | }
21 | 
22 | int main(){
23 | 	int x;
24 | 	omp_set_num_threads(2);
25 | 	omp_set_nested(1);
26 | 	printf("Enter which term to be found out : ");
27 | 	scanf("%d",&x);
28 |   	printf("%dth term is : %d\n",x,fibonacci(x));
29 | 	return 0;
30 | }
31 | 


--------------------------------------------------------------------------------
/OMP/HelloWorld.c:
--------------------------------------------------------------------------------
 1 | #include<omp.h>
 2 | #include<stdio.h>
 3 | int main(){
 4 | 	#pragma omp parallel
 5 | 	{
 6 | 		int ID=omp_get_thread_num();
 7 | 		printf("Hello by %d\n",ID);
 8 | 		printf("World by %d\n",ID);
 9 | 	}
10 | 	return 0;
11 | }
12 | 


--------------------------------------------------------------------------------
/OMP/MatrixMul.c:
--------------------------------------------------------------------------------
 1 | #include<omp.h>
 2 | #include<stdio.h>
 3 | int main(){
 4 | 	int A[10][10],B[10][10],C[10][10];
 5 | 	int n,i,j,k;
 6 | 	printf("Enter the value of N :");
 7 | 	scanf("%d",&n);
 8 | 	omp_set_num_threads(n*n*n);
 9 | 
10 | 	printf("Enter A:\n");
11 | 	for(i=0;i<n;i++){
12 | 		for(j=0;j<n;j++){
13 | 			scanf("%d",&A[i][j]);
14 | 		}
15 | 	}
16 | 
17 | 	printf("Enter B:\n");
18 | 	for(i=0;i<n;i++){
19 | 		for(j=0;j<n;j++){
20 | 			scanf("%d",&B[i][j]);
21 | 		}
22 | 	}
23 | 
24 | 	#pragma omp parallel for
25 | 	for(i=0;i<n;i++)
26 | 	{
27 | 		#pragma omp parallel for
28 | 		for(j=0;j<n;j++)
29 | 		{
30 | 			int x=0;
31 | 			#pragma omp parallel for reduction(+:x)
32 | 			for(k=0;k<n;k++)
33 | 			{
34 | 				x+=A[i][k]*B[k][j];
35 | 			}
36 | 			C[i][j]=x;
37 | 		}
38 | 	}
39 | 
40 | 	printf("Matrix C:\n");
41 | 	for(i=0;i<n;i++){
42 | 		for(j=0;j<n;j++){
43 | 			printf("%d ",C[i][j]);
44 | 		}
45 | 		printf("\n");
46 | 	}
47 | 	return 0;
48 | }
49 | 


--------------------------------------------------------------------------------
/OMP/ParallelTreeSearch.c:
--------------------------------------------------------------------------------
 1 | #include<stdio.h>
 2 | #include<omp.h>
 3 | int tree[512];
 4 | 
 5 | void  createTree(int i){
 6 | 	int ch;
 7 | 	printf("Enter node %d Data :",i);
 8 | 	scanf("%d",&tree[i]);
 9 | 	printf("Is there a Left node of NODE %d ? [0 if not]: ",i);
10 | 	scanf("%d",&ch);
11 | 	if(ch!=0)
12 | 		createTree(2*i);
13 | 	printf("Is there a Right node of NODE %d ? [0 if not]: ",i);
14 | 	scanf("%d",&ch);
15 | 	if(ch!=0)
16 | 		createTree(2*i+1);
17 | }
18 | 
19 | void searchTree(int data,int i){
20 | 	int k;
21 | 	if(tree[i]!=0){
22 | 	if(tree[i]==data){
23 | 		printf("Node at %d \n",i);
24 | 	}
25 | 	else{
26 | 		omp_set_num_threads(2);
27 | 		#pragma omp parallel for
28 | 		for(k=0;k<2;k++){
29 | 			searchTree(data,2*i+k);
30 | 		}
31 | 	}
32 | 	}
33 | }	
34 | 	
35 | int main(){
36 | 	int d;
37 | 	printf("Creating tree \n");
38 | 	createTree(1);
39 | 	omp_set_num_threads(2);
40 | 	printf("Enter data to be searched : ");
41 | 	scanf("%d",&d);
42 | 	searchTree(d,1);
43 | }
44 | 


--------------------------------------------------------------------------------
/OMP/PiCalculation.c:
--------------------------------------------------------------------------------
 1 | #include<stdio.h>
 2 | #include<omp.h>
 3 | #include<stdlib.h>
 4 | 
 5 | int main(){
 6 | 	int n=1024;
 7 | 	int i=0;
 8 | 	float x;
 9 | 	omp_set_num_threads(10);
10 | 	float* pi=(float*)malloc(sizeof(float)*n); 
11 | 	float PI=0.0f;
12 | 	#pragma omp parallel for
13 | 	for(i=0;i<n;i++){	
14 | 		x=(i+0.5)/n;
15 | 		//printf("Thread %d is working\n",omp_get_thread_num());
16 | 		//printf("PI is %f,adding %f\n",pi,x);
17 | 		pi[i]=((4/(1+x*x))/n);
18 | 	}
19 | 	int ctr=n,k=1;
20 | 	while(ctr!=1){
21 | 		#pragma omp parallel for
22 | 		for(i=0;i<ctr/2;i++){
23 | 			pi[2*k*i]+=pi[2*k*i+k];
24 | 		}
25 | 		ctr/=2;
26 | 		k*=2;
27 | 	}
28 | 	printf("PI is %f\n",pi[0]); 
29 | 	return 0;
30 | }
31 | 


--------------------------------------------------------------------------------
/OMP/ReductionPI.c:
--------------------------------------------------------------------------------
 1 | #include<stdio.h>
 2 | #include<omp.h>
 3 | #include<stdlib.h>
 4 | 
 5 | int main(){
 6 | 	int n=1024;
 7 | 	int i=0;
 8 | 	float x;
 9 | 	omp_set_num_threads(10);
10 | 	float* pi=(float*)malloc(sizeof(float)*n); 
11 | 	float PI=0.0f;
12 | 	#pragma omp parallel for
13 | 	for(i=0;i<n;i++){	
14 | 		x=(i+0.5)/n;
15 | 		//printf("Thread %d is working\n",omp_get_thread_num());
16 | 		//printf("PI is %f,adding %f\n",pi,x);
17 | 		pi[i]=((4/(1+x*x))/n);
18 | 	}
19 | 	#pragma omp parallel for reduction(+:PI)
20 | 	for(i=0;i<n;i++){
21 | 		PI+=pi[i];
22 | 	}
23 | 	printf("PI : %f\n",PI);
24 | 	return 0;
25 | }
26 | 
27 | 


--------------------------------------------------------------------------------
/OMP/Single.c:
--------------------------------------------------------------------------------
 1 | #include<omp.h>
 2 | #include<stdio.h>
 3 | 
 4 | int main(){
 5 | 	int i=0;
 6 | 	omp_set_num_threads(5);
 7 | 	printf("SINGLE\n");
 8 | 	#pragma omp parallel
 9 | 	{
10 | 		printf("Executed by %d \n",omp_get_thread_num());
11 | 		#pragma omp single
12 | 		{
13 | 				printf("Executed ONLY by %d \n",omp_get_thread_num());
14 | 		}
15 | 	}
16 | 	printf("\nMASTER\n");
17 | 	#pragma omp parallel
18 | 	{
19 | 		printf("Executed by %d \n",omp_get_thread_num());
20 | 		#pragma omp master
21 | 		{
22 | 				printf("Executed ONLY by %d \n",omp_get_thread_num());
23 | 		}
24 | 	}
25 | 	return 0;
26 | }
27 | 


--------------------------------------------------------------------------------
/OMP/Sorting.c:
--------------------------------------------------------------------------------
 1 | #include<omp.h>
 2 | #include<stdio.h>
 3 | int main(){
 4 | 	int A[20],B[20][20],C[20];
 5 | 	int n;
 6 | 	int i,j,k;
 7 | 
 8 | 	printf("Enter the value of N :");
 9 | 	scanf("%d",&n);
10 | 	printf("Enter Array : \n");
11 | 
12 | 	for(i=0;i<n;i++)
13 | 		scanf("%d",&A[i]);
14 | 
15 | 	double starttime=omp_get_wtime();
16 | 
17 | 	omp_set_num_threads(n*n);
18 | 
19 | 	#pragma omp parallel for
20 | 	for(i=0;i<n;i++)
21 | 	{
22 | 		#pragma omp parallel for
23 | 		for(j=0;j<n;j++)
24 | 		{
25 | 			if(A[i]>A[j])
26 | 			{
27 | 				B[i][j]=1;
28 | 			}
29 | 			else
30 | 				B[i][j]=0;
31 | 		}
32 | 		int x=0;
33 | 		#pragma omp parallel for reduction(+:x)
34 |  		for(j=0;j<n;j++)
35 |  		{
36 | 			x+=B[i][j];
37 | 		}
38 | 		B[i][0]=x;
39 | 		C[B[i][0]]=A[i];
40 | 	}
41 | 	double endtime=omp_get_wtime()-starttime;
42 | 
43 | 	printf("Sorted Array: ");
44 | 	for(i=0;i<n;i++)
45 | 		printf("%d ",C[i]);
46 | 	printf("\nTook %f seconds\n",endtime);
47 | 	return 0;
48 | }
49 | 
50 | 


--------------------------------------------------------------------------------
/OMP/SumOfArrays.c:
--------------------------------------------------------------------------------
 1 | #include<stdio.h>
 2 | #include<omp.h>
 3 | int main(){
 4 | 	int i,n=5;
 5 | 	int a[5]={1,2,3,4,5},b[5]={1,1,1,2,1},c[5];
 6 | 	#pragma omp parallel for
 7 | 	for(i=0;i<n;i++){
 8 | 		c[i]=a[i]+b[i];
 9 | 		printf("c[%d]=%d calculated by the thread %d\n",i,c[i],omp_get_thread_num());
10 | 	}
11 | }
12 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # parallel
 2 | MPI and OPENMP programs shared here !
 3 | 
 4 | ## To install MPI :
 5 | 
 6 | sudo apt-get install lam4-dev
 7 | 
 8 | ## To run MPI :
 9 | 
10 | 1. lamboot
11 | 2. mpicc program.c -o program
12 | 3. mpirun -np <no. of processors> ./program
13 | 
14 | ## To run OpenMP :
15 | 
16 | 1. gcc -fopenmp program.c -o program
17 | 2. ./program
18 | 3. To change the number of processors : export OMP_NUM_THREADS=<number>
19 | 


--------------------------------------------------------------------------------