├── HW1
│   ├── 9731107_HW01.pdf
│   └── HW01.pdf
├── HW2
│   ├── 9731107_HW02.pdf
│   └── HW02.pdf
├── HW3
│   ├── HW3_9731107.pdf
│   ├── matmul3d.cpp
│   └── matmul3d.o
├── HW4
│   ├── deadlock.cpp
│   ├── deadlock.o
│   ├── det.cpp
│   ├── det.o
│   ├── eq.cpp
│   ├── eq.o
│   └── mat.h
├── HW5
│   ├── .vscode
│   │   └── settings.json
│   ├── 9731107_HW05.pdf
│   ├── print_thread_info.cu
│   └── vector_add.cu
├── HW6
│   ├── HW06_9731107 (1).pdf
│   ├── HW6.pdf
│   ├── reduction
│   ├── reduction.cu
│   └── reduction.pdf
├── LAB1
│   ├── Lab1_9731107.pdf
│   ├── Manual 1.pdf
│   └── lab1.cpp
├── LAB2
│   ├── LAB2_9731107.pdf
│   ├── Manual 2-1.pdf
│   ├── block_parallel.cpp
│   └── row_parallel.cpp
├── LAB3
│   ├── LAB3.cpp
│   ├── LAB3_9731107.pdf
│   └── Manual 3-1.pdf
├── LAB4
│   ├── .vscode
│   │   ├── launch.json
│   │   └── tasks.json
│   ├── LAB4.pdf
│   └── hist.c
├── LAB5
│   ├── deviceQuery.cu
│   └── گزارش.pdf
├── LAB6
│   ├── LAB6_9731107.pdf
│   ├── Manual 6-1.pdf
│   ├── matmul
│   ├── matmul.cu
│   ├── matmul1
│   ├── matmul1.cu
│   ├── matmul2
│   ├── matmul2.cu
│   ├── matmul3
│   ├── matmul3.cu
│   ├── matmul3v2
│   └── matmul3v2.cu
└── README.md

/HW1/9731107_HW01.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/HW1/9731107_HW01.pdf
--------------------------------------------------------------------------------
/HW1/HW01.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/HW1/HW01.pdf
--------------------------------------------------------------------------------
/HW2/9731107_HW02.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/HW2/9731107_HW02.pdf
--------------------------------------------------------------------------------
/HW2/HW02.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/HW2/HW02.pdf
--------------------------------------------------------------------------------
/HW3/HW3_9731107.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/HW3/HW3_9731107.pdf
--------------------------------------------------------------------------------
/HW3/matmul3d.cpp:
--------------------------------------------------------------------------------
#include <iostream>
using namespace std;

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

#define MATRIX_SIZE 512
#define THREADS_NUM 1
#define EXP_NUM 5

typedef struct {
    short data[MATRIX_SIZE][MATRIX_SIZE][MATRIX_SIZE];
} Tensor3D;

Tensor3D A;
Tensor3D B;
Tensor3D C;

void fill_with_rand_nums(Tensor3D* A);
void fill_with_zero(Tensor3D* A);
void fill_matrix(Tensor3D* A, Tensor3D* B, Tensor3D* C);
void print_matrix(Tensor3D* A);

void matmul3d_block_p(Tensor3D* A, Tensor3D* B, Tensor3D* C);
void matmul3d_row_p(Tensor3D* A, Tensor3D* B, Tensor3D* C);
void matmul3d_col_p(Tensor3D* A, Tensor3D* B, Tensor3D* C);

void free_matrix(Tensor3D* A);

int main()
{
    // checks if OpenMP is available
#ifndef _OPENMP
    printf("OpenMP is not supported, sorry!\n");
    getchar();
    return 0;
#endif

    omp_set_num_threads(THREADS_NUM);
    printf("number of threads : %d\n", THREADS_NUM);

    double exp_times_sum = 0;
    fill_matrix(&A, &B, &C);

    for (int i = 0; i < EXP_NUM; i++)
    {
        fill_with_zero(&C); // reset C so each experiment accumulates from zero
        double start = omp_get_wtime();
        matmul3d_col_p(&A, &B, &C);
        double end = omp_get_wtime() - start;
        printf("elapsed time : %f exp : %d\n", end, i + 1);
        exp_times_sum += end;
    }

    printf("average elapsed time for column parallelism: %f\n", exp_times_sum / EXP_NUM);

    // the tensors are statically allocated globals, so there is nothing to free
    // free_matrix(&A);
    // free_matrix(&B);
    // free_matrix(&C);

    return 0;
}

void fill_with_rand_nums(Tensor3D* A) {
    for (int i = 0; i < MATRIX_SIZE; i++) {
        for (int j = 0; j < MATRIX_SIZE; j++) {
            for (int k = 0; k < MATRIX_SIZE; k++) {
                A->data[i][j][k] = rand() % 10;
            }
        }
    }
}

void fill_with_zero(Tensor3D* A) {
    for (int i = 0; i < MATRIX_SIZE; i++) {
        for (int j = 0; j < MATRIX_SIZE; j++) {
            for (int k = 0; k < MATRIX_SIZE; k++) {
                A->data[i][j][k] = 0;
            }
        }
    }
}

void fill_matrix(Tensor3D* A, Tensor3D* B, Tensor3D* C) {
    double int_size = sizeof(short);
    double size_mb = ((double)(MATRIX_SIZE * MATRIX_SIZE * MATRIX_SIZE)) * int_size / (1024.0 * 1024.0);
    printf("size of matrices : %lf MB\n", size_mb);
    fill_with_rand_nums(A);
    fill_with_rand_nums(B);
    fill_with_zero(C);
}

void free_matrix(Tensor3D* A) {
    // data is an in-struct array, not heap memory; calling free() on it
    // would be invalid, so this is intentionally a no-op
    (void)A;
}

void print_matrix(Tensor3D* A) {
    printf("[");
    for (int i = 0; i < MATRIX_SIZE; i++) {
        printf("[\n");
        for (int j = 0; j < MATRIX_SIZE; j++) {
            for (int k = 0; k < MATRIX_SIZE; k++) {
                printf("%d ", A->data[i][j][k]);
            }
            printf("\n");
        }
        printf("]\n");
    }
    printf("]\n");
}

void matmul3d_block_p(Tensor3D* A, Tensor3D* B, Tensor3D* C) {
#pragma omp parallel for
    for (int i = 0; i < MATRIX_SIZE; i++) {
        for (int j = 0; j < MATRIX_SIZE; j++) {
            for (int k = 0; k < MATRIX_SIZE; k++) {
                for (int p = 0; p < MATRIX_SIZE; p++) {
                    C->data[i][j][k] += A->data[i][j][p] * B->data[i][p][k];
                }
            }
        }
    }
}

void matmul3d_row_p(Tensor3D* A, Tensor3D* B, Tensor3D* C) {
#pragma omp parallel for collapse(2)
    for (int i = 0; i < MATRIX_SIZE; i++) {
        for (int j = 0; j < MATRIX_SIZE; j++) {
            for (int k = 0; k < MATRIX_SIZE; k++) {
                for (int p = 0; p < MATRIX_SIZE; p++) {
                    C->data[i][j][k] += A->data[i][j][p] * B->data[i][p][k];
                }
            }
        }
    }
}

void matmul3d_col_p(Tensor3D* A, Tensor3D* B, Tensor3D* C) {
#pragma omp parallel for collapse(3)
    for (int i = 0; i < MATRIX_SIZE; i++) {
        for (int j = 0; j < MATRIX_SIZE; j++) {
            for (int k = 0; k < MATRIX_SIZE; k++) {
                for (int p = 0; p < MATRIX_SIZE; p++) {
                    C->data[i][j][k] += A->data[i][j][p] * B->data[i][p][k];
                }
            }
        }
    }
}
--------------------------------------------------------------------------------
/HW3/matmul3d.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/HW3/matmul3d.o
--------------------------------------------------------------------------------
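A note on matmul3d.cpp above: the three matmul3d_* variants differ only in their collapse clause. A minimal standalone sketch (the tiny 4x4 loop nest is illustrative only, not part of the assignment) shows what collapse changes: with collapse(2) the i and j loops are fused into one 16-iteration space that OpenMP divides among the threads, instead of only the 4 iterations of the outer loop.

#include <omp.h>
#include <stdio.h>

int main() {
    // without collapse(2) only the 4 outer iterations are shared out;
    // with it, all 16 (i, j) pairs are distributed across the team
#pragma omp parallel for collapse(2) num_threads(4)
    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            printf("thread %d got (i=%d, j=%d)\n", omp_get_thread_num(), i, j);
    return 0;
}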
/HW4/deadlock.cpp:
--------------------------------------------------------------------------------
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define NUM_RESOURCES 8
#define ATTEMPTS 4

omp_lock_t locks[NUM_RESOURCES];

void calc(int i, int j) {
    int thread_num = omp_get_thread_num();

    printf("thread %d wants %d and %d \n", thread_num, i, j);

    // locks are taken in request order, so two threads asking for the same
    // pair in opposite orders can block each other forever -- which is the
    // point of this demo
    omp_set_lock(&locks[i]);
    omp_set_lock(&locks[j]);

    int k = 0;

    for (int p = 0; p < 1000000; p++) {
        k++;
    }

    omp_unset_lock(&locks[i]);
    omp_unset_lock(&locks[j]);
}

int main() {
#ifndef _OPENMP
    printf("OpenMP is not supported, sorry!\n");
    getchar();
    return 0;
#endif

    for (size_t i = 0; i < NUM_RESOURCES; i++)
        omp_init_lock(&locks[i]);

    for (int k = 0; k < ATTEMPTS; k++)
    {
        printf("attempt %d : \n\n", k + 1);
#pragma omp parallel num_threads(NUM_RESOURCES)
        {
            int i = rand() % NUM_RESOURCES;
            int j = rand() % NUM_RESOURCES;
            if (i == j)
                j = (j + 1) % NUM_RESOURCES;

#pragma omp barrier
            calc(i, j);
        }
    }

    for (size_t i = 0; i < NUM_RESOURCES; i++)
        omp_destroy_lock(&locks[i]);

    return 0;
}
--------------------------------------------------------------------------------
/HW4/deadlock.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/HW4/deadlock.o
--------------------------------------------------------------------------------
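The deadlock demonstrated above disappears if every thread acquires the two locks in a single global order, because then no cycle of threads can each hold one lock while waiting for another. A minimal sketch of that standard fix (helper name hypothetical, not part of the submitted code):

#include <omp.h>

// always lock the lower-numbered resource first; with a total order on
// lock acquisition, circular waiting -- and hence deadlock -- is impossible
void calc_ordered(omp_lock_t *locks, int i, int j) {
    int lo = i < j ? i : j;
    int hi = i < j ? j : i;
    omp_set_lock(&locks[lo]);
    omp_set_lock(&locks[hi]);
    /* ... critical work ... */
    omp_unset_lock(&locks[hi]);
    omp_unset_lock(&locks[lo]);
}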
/HW4/det.cpp:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

#define MATRIX_SIZE 512
#define BLOCK_NUM 4
#define NUM_THREADS 2
#define NUM_EXP 10

void LUDecompose(float **a, float **l, float **u, int size);
float **invers_matrix(float **A, int size);
float **allocate_float_matrix(int size);
void fill_with_random_float_numbers(float **mat, int size);
void mat_diff_float(float **A, float **B, float **C, int size);
void mat_mat_float(float **A, float **B, float **C, int size);
void print_float_matrix(float **A, int size);
void free_float_array(float **A, int size);
float **create_augmented_matrix(float **A, int size);
float **copy(float **A, int size);
void fill_with_zero(float **mat, int size);

int main()
{
#ifndef _OPENMP
    printf("OpenMP is not supported, sorry!\n");
    getchar();
    return 0;
#endif
    double time_sum = 0;

    for (int ex = 0; ex < NUM_EXP; ex++)
    {
        float **A[BLOCK_NUM][BLOCK_NUM];
        float **L[BLOCK_NUM][BLOCK_NUM];
        float **U[BLOCK_NUM][BLOCK_NUM];

        int block_size = MATRIX_SIZE / BLOCK_NUM;

        for (int i = 0; i < BLOCK_NUM; i++)
        {
            for (int j = 0; j < BLOCK_NUM; j++)
            {
                A[i][j] = allocate_float_matrix(block_size);
                fill_with_random_float_numbers(A[i][j], block_size);
            }
        }

        for (int i = 0; i < BLOCK_NUM; i++)
        {
            for (int j = 0; j < BLOCK_NUM; j++)
            {
                L[i][j] = allocate_float_matrix(block_size);
                fill_with_zero(L[i][j], block_size);
            }
        }

        for (int i = 0; i < BLOCK_NUM; i++)
        {
            for (int j = 0; j < BLOCK_NUM; j++)
            {
                U[i][j] = allocate_float_matrix(block_size);
                fill_with_zero(U[i][j], block_size);
            }
        }

        float **U_inv[BLOCK_NUM];
        float **L_inv[BLOCK_NUM];

        for (int i = 0; i < BLOCK_NUM; i++)
        {
            U_inv[i] = allocate_float_matrix(block_size);
        }

        for (int i = 0; i < BLOCK_NUM; i++)
        {
            L_inv[i] = allocate_float_matrix(block_size);
        }

        double start = omp_get_wtime();

        for (int step = 0; step < BLOCK_NUM; step++)
        {
#pragma omp parallel num_threads(NUM_THREADS)
            {
#pragma omp single
                {
                    LUDecompose(A[step][step], L[step][step], U[step][step], block_size);

#pragma omp task
                    {
                        U_inv[step] = invers_matrix(U[step][step], block_size);
                        for (int i = step + 1; i < BLOCK_NUM; i++)
                            mat_mat_float(A[i][step], U_inv[step], L[i][step], block_size);
                    }

#pragma omp task
                    {
                        L_inv[step] = invers_matrix(L[step][step], block_size);
                        for (int i = step + 1; i < BLOCK_NUM; i++)
                            mat_mat_float(L_inv[step], A[step][i], U[step][i], block_size);
                    }
                }

#pragma omp single
                {
                    // Schur-complement update of the trailing blocks
                    for (int i = step + 1; i < BLOCK_NUM; i++)
                    {
                        for (int j = step + 1; j < BLOCK_NUM; j++)
                        {
#pragma omp task
                            {
                                float **R = allocate_float_matrix(block_size);
                                mat_mat_float(L[i][step], U[step][j], R, block_size);
                                mat_diff_float(A[i][j], R, A[i][j], block_size);

                                free_float_array(R, block_size);
                            }
                        }
                    }
                }
            }
        }

        float det1 = 1.0, det2 = 1.0;
        // NOTE: this worksharing directive is orphaned (no enclosing
        // parallel region), so the loop executes sequentially here
#pragma omp for
        for (int i = 0; i < BLOCK_NUM; i++)
        {
            float **l_block = L[i][i];
            float **u_block = U[i][i];

            for (int j = 0; j < block_size; j++)
            {
                det1 *= l_block[j][j];
                det2 *= u_block[j][j];
            }
        }

        double end = omp_get_wtime() - start;

        time_sum += end;

        printf("time elapsed with %d threads : %f \n", NUM_THREADS, end);

        for (int i = 0; i < BLOCK_NUM; i++)
        {
            for (int j = 0; j < BLOCK_NUM; j++)
            {
                free_float_array(A[i][j], block_size);
            }
        }

        for (int i = 0; i < BLOCK_NUM; i++)
        {
            for (int j = 0; j < BLOCK_NUM; j++)
            {
                free_float_array(L[i][j], block_size);
            }
        }

        for (int i = 0; i < BLOCK_NUM; i++)
        {
            for (int j = 0; j < BLOCK_NUM; j++)
            {
                free_float_array(U[i][j], block_size);
            }
        }
    }

    printf("\n average : %f \n", time_sum / NUM_EXP);

    return 0;
}

void LUDecompose(float **a, float **l, float **u, int size)
{
    for (int i = 0; i < size; i++)
    {
        for (int k = i; k < size; k++)
        {
            float sum = 0;
            for (int j = 0; j < i; j++)
                sum += (l[i][j] * u[j][k]);

            u[i][k] = a[i][k] - sum;
        }
        for (int k = 0; k < size; k++)
        {
            if (i == k)
                l[i][i] = 1;
            else
            {
                float sum = 0.0;
                for (int j = 0; j < i; j++)
                    sum += (l[k][j] * u[j][i]);

                l[k][i] = (a[k][i] - sum) / u[i][i];
            }
        }
    }
}

float **invers_matrix(float **A, int size)
{
    float **matrix = create_augmented_matrix(A, size);

    float temp;

    for (int i = size - 1; i > 0; i--)
    {
        if (matrix[i - 1][0] < matrix[i][0])
        {
            float *tmp_row = matrix[i];
            matrix[i] = matrix[i - 1];
            matrix[i - 1] = tmp_row;
        }
    }

    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
        {
            if (j != i)
            {
                temp = matrix[j][i] / matrix[i][i];
                for (int k = 0; k < 2 * size; k++)
                {
                    matrix[j][k] -= matrix[i][k] * temp;
                }
            }
        }
    }

    for (int i = 0; i < size; i++)
    {
        temp = matrix[i][i];
        for (int j = 0; j < 2 * size; j++)
        {
            matrix[i][j] = matrix[i][j] / temp;
        }
    }

    float **result = allocate_float_matrix(size);

    for (int i = 0; i < size; i++)
        for (int j = size; j < 2 * size; j++)
            result[i][j - size] = matrix[i][j];

    free_float_array(matrix, size * 2);

    return result;
}

float **allocate_float_matrix(int size)
{
    float **mat;
    mat = (float **)malloc(sizeof(*mat) * size);

    for (int i = 0; i < size; i++)
        mat[i] = (float *)malloc(sizeof(*mat[i]) * size);

    return mat;
}

void fill_with_random_float_numbers(float **mat, int size)
{
    for (int i = 0; i < size; i++)
        for (int j = 0; j < size; j++)
            mat[i][j] = (float)(rand() % 10 + 1) / 100;
}

void fill_with_zero(float **mat, int size)
{
    for (int i = 0; i < size; i++)
        for (int j = 0; j < size; j++)
            mat[i][j] = 0;
}

void mat_diff_float(float **A, float **B, float **C, int size)
{
    // element-wise difference, used for the Schur-complement update A - L*U
    for (int i = 0; i < size; i++)
        for (int j = 0; j < size; j++)
            C[i][j] = A[i][j] - B[i][j];
}

void mat_mat_float(float **A, float **B, float **C, int size)
{
    for (int i = 0; i < size; i++)
        for (int j = 0; j < size; j++)
        {
            C[i][j] = 0; // output buffers come from malloc and are uninitialized
            for (int k = 0; k < size; k++)
                C[i][j] += A[i][k] * B[k][j];
        }
}

void print_float_matrix(float **A, int size)
{
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
        {
            printf("%f ", A[i][j]);
        }
        printf("\n");
    }
}

void free_float_array(float **A, int size)
{
    for (int i = 0; i < size; i++)
    {
        float *row = A[i];

        free(row);
    }
    free(A);
}

float **create_augmented_matrix(float **A, int size)
{
    float **_A = allocate_float_matrix(size * 2);

    for (int i = 0; i < size; i++)
        for (int j = 0; j < size; j++)
            _A[i][j] = A[i][j];

    for (int i = 0; i < size; i++)
        for (int j = size; j < size * 2; j++)
        {
            if (j == (i + size))
                _A[i][j] = 1;
        }

    return _A;
}

float **copy(float **A, int size)
{
    float **_A = allocate_float_matrix(size);

    for (int i = 0; i < size; i++)
        for (int j = 0; j < size; j++)
            _A[i][j] = A[i][j];

    return _A;
}
--------------------------------------------------------------------------------
/HW4/det.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/HW4/det.o
--------------------------------------------------------------------------------
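Why det.cpp multiplies diagonal entries: each diagonal block is factored as A = LU with unit-diagonal L, so det(A) = det(L) * det(U) = the product of U's diagonal. A hedged sketch of the direct, unblocked check for one block (function name hypothetical; it uses the helpers defined in det.cpp above):

float lu_det(float **a, int size)
{
    float **l = allocate_float_matrix(size);
    float **u = allocate_float_matrix(size);
    fill_with_zero(l, size);
    fill_with_zero(u, size);

    LUDecompose(a, l, u, size);

    // L has a unit diagonal, so only U's diagonal contributes
    float det = 1.0f;
    for (int i = 0; i < size; i++)
        det *= u[i][i];

    free_float_array(l, size);
    free_float_array(u, size);
    return det;
}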
/HW4/eq.cpp:
--------------------------------------------------------------------------------
#include "mat.h"

#define NUM_EXP 10

int main()
{
#ifndef _OPENMP
    printf("OpenMP is not supported, sorry!\n");
    getchar();
    return 0;
#endif

    double average_running_time = 0.0;

    omp_set_num_threads(NUM_THREADS);

    short **A;
    short **B;
    short **C;

    short **A_t;
    short **C_t;

    A = allocate_matrix();
    fill_with_random_numbers(A);
    B = allocate_matrix();
    fill_with_random_numbers(B);
    C = allocate_matrix();
    fill_with_random_numbers(C);

    A_t = allocate_matrix();
    C_t = allocate_matrix();

    mat_transpose(A, A_t);

    mat_transpose(C, C_t);

    short **R;
    short **R1;
    short **R2;
    short **R3;

    for (int i = 1; i <= NUM_EXP; i++)
    {
        R = allocate_matrix();
        R1 = allocate_matrix();
        R2 = allocate_matrix();
        // R3 = allocate_matrix();

        double start = omp_get_wtime();

        // distributed form: (A^T A + B A) C^T
        // mat_mat(A_t, A, R1);
        // mat_mat(B, A, R2);
        // mat_sum(R1, R2, R3);
        // mat_mat(R3, C_t, R);
        /////////////////////////////////////////

        // factored form: ((A^T + B) A) C^T -- one matrix product fewer
        mat_sum(A_t, B, R1);

        mat_mat(R1, A, R2);

        mat_mat(R2, C_t, R);

        double end = omp_get_wtime();

        double elapsed_time = end - start;

        printf("running time for %d*%d matrices with %d threads : %f\n", MATRIX_SIZE, MATRIX_SIZE, NUM_THREADS, elapsed_time);

        average_running_time += elapsed_time;

        free_array(R);
        free_array(R1);
        free_array(R2);
        // free_array(R3);
    }
    printf("average running time : %f \n", average_running_time / NUM_EXP);

    return 0;
}
--------------------------------------------------------------------------------
/HW4/eq.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/HW4/eq.o
--------------------------------------------------------------------------------
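The two command sequences in eq.cpp compute the same matrix because multiplication distributes over addition: (A^T + B) A C^T = (A^T A + B A) C^T. The factored form needs two full matrix products and one sum; the distributed form needs three products and one sum, which is why the factored form is the one timed. A hedged sketch for checking the equivalence on the small 64x64 case (function name hypothetical; relies on the mat.h helpers in the next file, including mat_mat clearing its output first):

void check_equivalence(short **A, short **B, short **A_t, short **C_t)
{
    short **T1 = allocate_matrix(), **T2 = allocate_matrix(), **T3 = allocate_matrix();
    short **R_fact = allocate_matrix(), **R_dist = allocate_matrix();

    mat_sum(A_t, B, T1);        // factored: ((A^T + B) A) C^T
    mat_mat(T1, A, T2);
    mat_mat(T2, C_t, R_fact);

    mat_mat(A_t, A, T1);        // distributed: (A^T A + B A) C^T
    mat_mat(B, A, T2);
    mat_sum(T1, T2, T3);
    mat_mat(T3, C_t, R_dist);

    int mismatches = 0;
    for (int i = 0; i < MATRIX_SIZE; i++)
        for (int j = 0; j < MATRIX_SIZE; j++)
            if (R_fact[i][j] != R_dist[i][j])
                mismatches++;
    printf("mismatches: %d\n", mismatches);

    free_array(T1); free_array(T2); free_array(T3);
    free_array(R_fact); free_array(R_dist);
}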
/HW4/mat.h:
--------------------------------------------------------------------------------
#pragma once

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define MATRIX_SIZE 64

#define NUM_THREADS 1

short **allocate_matrix()
{
    short **mat;
    mat = (short **)malloc(sizeof(*mat) * MATRIX_SIZE);

    for (int i = 0; i < MATRIX_SIZE; i++)
        mat[i] = (short *)malloc(sizeof(*mat[i]) * MATRIX_SIZE);

    return mat;
}

void fill_with_random_numbers(short **mat)
{
    for (int i = 0; i < MATRIX_SIZE; i++)
        for (int j = 0; j < MATRIX_SIZE; j++)
            mat[i][j] = rand() % 10;
}

void mat_sum(short **A, short **B, short **C)
{
#ifndef _OPENMP
    printf("OpenMP is not supported, sorry!\n");
#endif

#pragma omp parallel for
    for (int i = 0; i < MATRIX_SIZE; i++)
        for (int j = 0; j < MATRIX_SIZE; j++)
            C[i][j] = A[i][j] + B[i][j];
}

void mat_mat(short **A, short **B, short **C)
{
#ifndef _OPENMP
    printf("OpenMP is not supported, sorry!\n");
#endif

#pragma omp parallel for
    for (int i = 0; i < MATRIX_SIZE; i++)
        for (int j = 0; j < MATRIX_SIZE; j++)
        {
            C[i][j] = 0; // buffers from allocate_matrix are uninitialized
            for (int k = 0; k < MATRIX_SIZE; k++)
                C[i][j] += A[i][k] * B[k][j];
        }
}

void mat_transpose(short **mat, short **mat_t)
{
#ifndef _OPENMP
    printf("OpenMP is not supported, sorry!\n");
#endif

#pragma omp parallel for
    for (int i = 0; i < MATRIX_SIZE; i++)
        for (int j = 0; j < MATRIX_SIZE; j++)
            mat_t[i][j] = mat[j][i];
}

void print_matrix(short **A)
{
    for (int i = 0; i < MATRIX_SIZE; i++)
    {
        for (int j = 0; j < MATRIX_SIZE; j++)
        {
            printf("%d ", A[i][j]);
        }
        printf("\n");
    }
}

void free_array(short **A)
{
    for (int i = 0; i < MATRIX_SIZE; i++)
    {
        short *row = A[i];

        free(row);
    }
    free(A);
}
--------------------------------------------------------------------------------
/HW5/.vscode/settings.json:
--------------------------------------------------------------------------------
{
    "files.associations": {
        "condition_variable": "cpp"
    }
}
--------------------------------------------------------------------------------
/HW5/9731107_HW05.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/HW5/9731107_HW05.pdf
--------------------------------------------------------------------------------
/HW5/print_thread_info.cu:
--------------------------------------------------------------------------------
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

void print_info(unsigned int size);

__global__ void print_info_kernel()
{
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    // int p = tid * bid;
    printf("Hello CUDA, I'm thread %d from block %d\n", tid, bid);
}

int main()
{
    print_info(100);

    return 0;
}

void print_info(unsigned int size)
{
    cudaSetDevice(0);

    cudaError_t cudaStatus;
    print_info_kernel<<<4, size>>>();
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "print_info_kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    }

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching print_info_kernel!\n", cudaStatus);
    }
}
--------------------------------------------------------------------------------
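print_thread_info.cu prints only threadIdx.x and blockIdx.x, so two threads in different blocks can report the same number. A unique global index combines the two; a minimal hedged sketch (kernel name hypothetical):

#include <stdio.h>

__global__ void print_global_id()
{
    // global index = offset of this block + position within the block
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    printf("global thread %d (block %d, thread %d)\n", gid, blockIdx.x, threadIdx.x);
}

int main()
{
    print_global_id<<<4, 8>>>();
    cudaDeviceSynchronize(); // flush device-side printf before exit
    return 0;
}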
/HW5/vector_add.cu:
--------------------------------------------------------------------------------
// nvcc -Xcompiler -fopenmp vector_add.cu -o vector_add for compiling

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

#include <omp.h>

#define BLOCK_SIZE 256
#define STRIDE 1
#define VEC_SIZE 10000000

cudaError_t cuda_parallel_vector_add(int *c, int *a, int *b, int size);
void serial_vector_add(int *c, int *a, int *b, int size);
void omp_parallel_vector_add(int *c, int *a, int *b, int size);
int *fill_with_random(int size);
__global__ void addKernel(int *c, int *a, int *b, int *size, int *stride);
int *fill_with_zeros(int size);

int main()
{
    // using cuda

    int *a = fill_with_random(VEC_SIZE);
    int *b = fill_with_random(VEC_SIZE);
    int *c = fill_with_zeros(VEC_SIZE);

    // Add vectors in parallel.
    cudaError_t cudaStatus = cuda_parallel_vector_add(c, a, b, VEC_SIZE);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cuda_parallel_vector_add failed!");
        return 1;
    }

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    free(a);
    free(b);
    free(c);

    // using openmp

    a = fill_with_random(VEC_SIZE);
    b = fill_with_random(VEC_SIZE);
    c = fill_with_zeros(VEC_SIZE);

    double start = omp_get_wtime();
    omp_parallel_vector_add(c, a, b, VEC_SIZE);
    double end = omp_get_wtime() - start;

    printf("time for openMP: %f s \n", end);

    free(a);
    free(b);
    free(c);

    // serial code

    a = fill_with_random(VEC_SIZE);
    b = fill_with_random(VEC_SIZE);
    c = fill_with_zeros(VEC_SIZE);

    start = omp_get_wtime();
    serial_vector_add(c, a, b, VEC_SIZE);
    end = omp_get_wtime() - start;

    printf("time for serial: %f s \n", end);

    free(a);
    free(b);
    free(c);

    return 0;
}

__global__ void addKernel(int *c, int *a, int *b, int *size, int *stride)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    int start = i * (*stride);

    // each thread handles a contiguous chunk of `stride` elements
    for (int j = start; j < start + *stride; j++)
        if (j < *size)
            c[j] = a[j] + b[j];
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t cuda_parallel_vector_add(int *c, int *a, int *b, int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    int *dev_size = 0;
    int *dev_stride = 0;
    cudaError_t cudaStatus;

    int num_threads = BLOCK_SIZE / STRIDE + 1;
    int stride = STRIDE;

    dim3 DimGrid((size - 1) / num_threads + 1, 1, 1);
    dim3 DimBlock(num_threads, 1, 1);

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void **)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
    }

    cudaStatus = cudaMalloc((void **)&dev_stride, sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
    }

    cudaStatus = cudaMalloc((void **)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
    }

    cudaStatus = cudaMalloc((void **)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
    }

    cudaStatus = cudaMalloc((void **)&dev_size, sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed !");
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed !");
    }

    cudaStatus = cudaMemcpy(dev_stride, &stride, sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed !");
    }

    cudaStatus = cudaMemcpy(dev_size, &size, sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed !");
    }
    double start = omp_get_wtime();
    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<DimGrid, DimBlock>>>(dev_c, dev_a, dev_b, dev_size, dev_stride);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    }

    double end = omp_get_wtime() - start;

    printf("time for CUDA: %f s \n", end);

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed !");
    }

    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_size);
    cudaFree(dev_stride);

    return cudaStatus;
}

void serial_vector_add(int *c, int *a, int *b, int size)
{
    for (int i = 0; i < size; i++)
    {
        c[i] = a[i] + b[i];
    }
}

void omp_parallel_vector_add(int *c, int *a, int *b, int size)
{
#pragma omp parallel for
    for (int i = 0; i < size; i++)
    {
        c[i] = a[i] + b[i];
    }
}

int *fill_with_random(int size)
{
    int *a = (int *)malloc(sizeof(int) * size);

    for (int i = 0; i < size; i++)
    {
        a[i] = rand() % 100;
    }

    return a;
}

int *fill_with_zeros(int size)
{
    int *a = (int *)malloc(sizeof(int) * size);

    for (int i = 0; i < size; i++)
    {
        a[i] = 0;
    }

    return a;
}
--------------------------------------------------------------------------------
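vector_add.cu passes the element count and stride through cudaMalloc'ed buffers and gives each thread one contiguous chunk of STRIDE elements. A common alternative is a grid-stride loop: scalar parameters are passed by value (no dev_size/dev_stride buffers needed), and consecutive threads touch consecutive elements on every pass, which keeps global-memory accesses coalesced for any vector length and grid size. A hedged sketch (kernel name hypothetical):

__global__ void addKernelGridStride(int *c, const int *a, const int *b, int n)
{
    // each thread starts at its global index and hops by the total number
    // of threads in the grid until the whole vector is covered
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += gridDim.x * blockDim.x)
        c[i] = a[i] + b[i];
}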
/HW6/HW06_9731107 (1).pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/HW6/HW06_9731107 (1).pdf
--------------------------------------------------------------------------------
/HW6/HW6.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/HW6/HW6.pdf
--------------------------------------------------------------------------------
/HW6/reduction:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/HW6/reduction
--------------------------------------------------------------------------------
/HW6/reduction.cu:
--------------------------------------------------------------------------------
// System includes
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <assert.h>

// CUDA runtime
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define BLOCK_SIZE 128
#define N 4194304
// #define N 2048

__global__ void reduce0(int *g_idata, int *g_odata, int size)
{
    __shared__ int sdata[BLOCK_SIZE];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    // printf("%d\n", i);
    sdata[tid] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem; the tid % (2*s) test leaves most threads
    // of each warp idle (divergent branching)
    for (unsigned int s = 1; s < blockDim.x; s *= 2)
    {
        if (tid % (2 * s) == 0)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

__global__ void reduce1(int *g_idata, int *g_odata, int size)
{
    __shared__ int sdata[BLOCK_SIZE];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem with a strided index: active threads stay
    // contiguous, but the access pattern causes shared-memory bank conflicts
    for (unsigned int s = 1; s < blockDim.x; s *= 2)
    {
        int index = 2 * s * tid;
        if (index < blockDim.x)
        {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

__global__ void reduce2(int *g_idata, int *g_odata, int size)
{
    __shared__ int sdata[BLOCK_SIZE];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem with sequential addressing: no divergence
    // within active warps and no bank conflicts
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

__global__ void reduce3(int *g_idata, int *g_odata, int size)
{
    __shared__ int sdata[BLOCK_SIZE];
    // each thread adds two elements on load, halving the number of blocks
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;

    if (i < size)
    {
        sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
        __syncthreads();
        // do reduction in shared mem
        for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
        {
            if (tid < s)
            {
                sdata[tid] += sdata[tid + s];
            }
            __syncthreads();
        }
        // write result for this block to global mem
        if (tid == 0)
            g_odata[blockIdx.x] = sdata[0];
    }
}

__global__ void reduce4(int *g_idata, int *g_odata, int size)
{
    __shared__ int sdata[BLOCK_SIZE];
    // each thread adds two elements on load, halving the number of blocks
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;

    if (i < size)
    {
        sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
        __syncthreads();
        // do reduction in shared mem down to the last warp
        for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
        {
            if (tid < s)
            {
                sdata[tid] += sdata[tid + s];
            }
            __syncthreads();
        }

        if (tid < 32)
        {
            // warp-synchronous unrolling of the last warp: the accesses must
            // go through a volatile pointer instead of __syncthreads(),
            // which would be illegal inside this divergent branch
            volatile int *vsmem = sdata;
            vsmem[tid] += vsmem[tid + 32];
            vsmem[tid] += vsmem[tid + 16];
            vsmem[tid] += vsmem[tid + 8];
            vsmem[tid] += vsmem[tid + 4];
            vsmem[tid] += vsmem[tid + 2];
            vsmem[tid] += vsmem[tid + 1];
        }

        // write result for this block to global mem
        if (tid == 0)
            g_odata[blockIdx.x] = sdata[0];
    }
}

#define KERNEL_NUM 5

void (*KERNELS[KERNEL_NUM])(int *, int *, int) = {
    reduce0, reduce1, reduce2, reduce3, reduce4};

void constantInit(int *data, int size, int val)
{
    for (int i = 0; i < size; ++i)
    {
        data[i] = val;
    }
}

void checkError(cudaError_t error, int line)
{
    if (error != cudaSuccess)
    {
        printf("### error occurred in line %d \n error : %s", line, cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
}

float serial_reduction()
{
    clock_t beginn = clock();

    int *A = (int *)malloc(sizeof(int) * N);
    constantInit(A, N, 1);

    long int sum = 0;
    clock_t begin = clock();
    for (int i = 0; i < N; i++)
        sum += A[i];
    clock_t end = clock();
    float time_spent = ((float)(end - begin) / CLOCKS_PER_SEC) * 1000;

    printf("serial execution : %f ms\n", time_spent);

    clock_t endd = clock();
    float time_spentt = ((float)(endd - beginn) / CLOCKS_PER_SEC) * 1000;

    printf("total serial execution : %f ms\n", time_spentt);

    return time_spent;
}

/**
 * Run a simple reduction test using CUDA
 */
int reduction(int argc, char **argv, int n, int func_index)
{
    // Allocate host memory for the input array
    unsigned int msize = n;
    unsigned int mem_size = sizeof(int) * msize;
    int *h_in = (int *)malloc(mem_size);

    constantInit(h_in, msize, 1);

    // Allocate device memory
    int *d_in;
    int *d_out;

    int grid_size = (n - 1) / BLOCK_SIZE + 1;
    if (func_index >= 3)
        grid_size /= 2;

    cudaError_t error;

    clock_t begin = clock();

    error = cudaMalloc((void **)&d_in, mem_size);
    checkError(error, __LINE__);

    int output_size = grid_size * sizeof(int);

    error = cudaMalloc((void **)&d_out, output_size);
    checkError(error, __LINE__);

    // copy host memory to device
    error = cudaMemcpy(d_in, h_in, mem_size, cudaMemcpyHostToDevice);
    checkError(error, __LINE__);

    float total_time = 0.0f;
    printf("grid size : %d block size : %d number of threads : %d \n", grid_size, BLOCK_SIZE, grid_size * BLOCK_SIZE);

    int stride = 1;
    int size = N;

    while (grid_size >= 1)
    {
        output_size = grid_size * sizeof(int);

        cudaEvent_t start;
        error = cudaEventCreate(&start);
        checkError(error, __LINE__);

        cudaEvent_t stop;
        error = cudaEventCreate(&stop);
        checkError(error, __LINE__);

        // Record the start event
        error = cudaEventRecord(start, NULL);
        checkError(error, __LINE__);

        dim3 threads(BLOCK_SIZE, 1, 1);
        dim3 grid(grid_size, 1, 1);

        KERNELS[func_index]<<<grid, threads>>>(d_in, d_out, size);

        error = cudaGetLastError();
        checkError(error, __LINE__);

        // Record the stop event
        error = cudaEventRecord(stop, NULL);
        checkError(error, __LINE__);

        // Wait for the stop event to complete
        error = cudaEventSynchronize(stop);
        checkError(error, __LINE__);

        float msecTotal = 0.0f;
        error = cudaEventElapsedTime(&msecTotal, start, stop);
        checkError(error, __LINE__);
        total_time += msecTotal;

        // each launch shrinks the array by BLOCK_SIZE (kernels 0-2) or
        // 2*BLOCK_SIZE (kernels 3-4, which add two elements on load)
        int factor = (func_index >= 3) ? 2 * BLOCK_SIZE : BLOCK_SIZE;
        size = grid_size; // this launch produced one partial sum per block
        grid_size /= factor;
        stride *= factor;
        cudaFree(d_in);
        d_in = d_out;
        error = cudaMalloc((void **)&d_out, output_size);
        checkError(error, __LINE__);
    }

    int *h_out = (int *)malloc(output_size);

    // Copy the remaining partial sums from device to host
    error = cudaMemcpy(h_out, d_in, output_size, cudaMemcpyDeviceToHost);
    checkError(error, __LINE__);

    int total_sum = 0;

    for (int i = 0; i < output_size / sizeof(int); i++)
        total_sum += h_out[i];

    printf("Elapsed time in msec = %f and bandwidth %f GB/s result = %d \n", total_time, mem_size / (total_time * 1e6), total_sum);

    // Clean up memory
    free(h_in);
    free(h_out);
    cudaFree(d_in);
    cudaFree(d_out);

    clock_t end = clock();
    float time_spent = ((float)(end - begin) / CLOCKS_PER_SEC) * 1000;

    printf("execution + memory allocations : %f ms\n", time_spent);

    return EXIT_SUCCESS;
}

/**
 * Program main
 */
int main(int argc, char **argv)
{
    printf("[Matrix Reduction Using CUDA] - Starting...\n");

    // By default, we use device 0
    int devID = 0;
    cudaSetDevice(devID);

    cudaError_t error;
    cudaDeviceProp deviceProp;
    error = cudaGetDevice(&devID);
    checkError(error, __LINE__);

    error = cudaGetDeviceProperties(&deviceProp, devID);
    checkError(error, __LINE__);

    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
    }
    else
    {
        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
    }

    int n = N;

    printf("Array with size (%d)\n", n);

    // serial_reduction();

    for (size_t i = 0; i < KERNEL_NUM; i++)
    {
        printf("\n num implementation : %d \n", (int)i + 1);
        reduction(argc, argv, n, i);
    }

    return 0;
}
--------------------------------------------------------------------------------
/HW6/reduction.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/HW6/reduction.pdf
--------------------------------------------------------------------------------
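reduce4 above finishes the last 32 elements with warp-synchronous shared-memory adds. On newer architectures the same stage is usually written with warp shuffles, which exchange registers directly and need no shared memory at all. A hedged sketch, not one of the five handout kernels:

__inline__ __device__ int warpReduceSum(int val)
{
    // each step folds the upper half of the active lanes onto the lower
    // half; after five steps lane 0 holds the sum of all 32 lanes
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}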
/LAB1/Lab1_9731107.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/LAB1/Lab1_9731107.pdf
--------------------------------------------------------------------------------
/LAB1/Manual 1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/LAB1/Manual 1.pdf
--------------------------------------------------------------------------------
/LAB1/lab1.cpp:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <math.h>
#include <omp.h>

const long int VERYBIG = 100000;
// ***********************************************************************
int main(void)
{
    // checks if OpenMP is available
#ifndef _OPENMP
    printf("OpenMP is not supported, sorry!\n");
    getchar();
    return 0;
#endif

    int i;
    long int j, k, sum;
    double sumx, sumy, total;
    double starttime, elapsedtime;
    // -----------------------------------------------------------------------
    // Output a start message
    printf("parallel Timings for %ld iterations\n\n", VERYBIG);
    // repeat experiment several times
    for (i = 0; i < 10; i++)
    {
        // get starting time
        starttime = omp_get_wtime();
        // reset check sum & running total
        sum = 0;
        total = 0.0;
        // Work Loop, do some work by looping VERYBIG times
        //#pragma omp parallel for private(k, sumx, sumy) reduction(+:sum, total) num_threads(32)
#pragma omp parallel for private(k, sumx, sumy)
        for (j = 0; j < VERYBIG; j++)
        {
            int num_threads = omp_get_num_threads();
            //printf("num threads : %d\n", num_threads);

            // increment check sum
            //printf("thread id : %d \n", omp_get_thread_num());
#pragma omp critical
            sum += 1;
            // Calculate first arithmetic series
            sumx = 0.0;
            for (k = 0; k < j; k++)
                sumx = sumx + (double)k;
            // Calculate second arithmetic series
            sumy = 0.0;
            for (k = j; k > 0; k--)
                sumy = sumy + (double)k;
            if (sumx > 0.0) {
#pragma omp critical
                total = total + 1.0 / sqrt(sumx);
            }

            if (sumy > 0.0) {
#pragma omp critical
                total = total + 1.0 / sqrt(sumy);
            }
        }
        // get ending time and use it to determine elapsed time
        elapsedtime = omp_get_wtime() - starttime;
        // report elapsed time
        printf("Time Elapsed: %f Secs, Total = %lf, Check Sum = %ld , iteration : %d\n",
            elapsedtime, total, sum, i
        );
    }
    // return integer as required by function header
    getchar();
    return 0;
}
--------------------------------------------------------------------------------
/LAB2/LAB2_9731107.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/LAB2/LAB2_9731107.pdf
--------------------------------------------------------------------------------
/LAB2/Manual 2-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/LAB2/Manual 2-1.pdf
--------------------------------------------------------------------------------
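lab1.cpp protects every update of sum and total with a critical section, serializing the threads at each iteration. The commented-out pragma at the top of the loop shows the reduction alternative: each thread accumulates into a private copy and the copies are combined once at the end, with no per-iteration locking. A self-contained sketch of that version (same workload, standard OpenMP only):

#include <math.h>
#include <omp.h>
#include <stdio.h>

int main(void)
{
    long int j, k, sum = 0;
    double sumx, sumy, total = 0.0;

    // private per-thread partials, merged once at the end of the loop
#pragma omp parallel for private(k, sumx, sumy) reduction(+ : sum, total)
    for (j = 0; j < 100000; j++)
    {
        sum += 1;
        sumx = 0.0;
        for (k = 0; k < j; k++) sumx += (double)k;
        sumy = 0.0;
        for (k = j; k > 0; k--) sumy += (double)k;
        if (sumx > 0.0) total += 1.0 / sqrt(sumx);
        if (sumy > 0.0) total += 1.0 / sqrt(sumy);
    }
    printf("Total = %lf Check Sum = %ld\n", total, sum);
    return 0;
}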
/LAB2/block_parallel.cpp:
--------------------------------------------------------------------------------
/*
 * In His Exalted Name
 * Matrix Addition - Sequential Code
 * Ahmad Siavashi, Email: siavashi@aut.ac.ir
 * 15/04/2018
 */

// Let it be.
#define _CRT_SECURE_NO_WARNINGS

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <omp.h>

typedef struct {
    int* A, * B, * C;
    int n, m;
} DataSet;

void fillDataSet(DataSet* dataSet);
void printDataSet(DataSet dataSet);
void closeDataSet(DataSet dataSet);
void add(DataSet dataSet);

#define BLOCK_SIZE 32
#define EXP_NUM 10
#define NUM_THREADS 8

int main(int argc, char* argv[]) {
#ifndef _OPENMP
    printf("OpenMP is not supported, sorry!\n");
    getchar();
    return 0;
#endif
    double elapsed_time_sum = 0.0;
    DataSet dataSet;
    if (argc < 3) {
        printf("[-] Invalid No. of arguments.\n");
        printf("[-] Try -> <n> <m>\n");
        printf(">>> ");
        scanf("%d %d", &dataSet.n, &dataSet.m);
    }
    else {
        dataSet.n = atoi(argv[1]);
        dataSet.m = atoi(argv[2]);
    }
    fillDataSet(&dataSet);
    for (int i = 0; i < EXP_NUM; i++) {
        double starttime = omp_get_wtime();
        add(dataSet);
        double elapsedtime = omp_get_wtime() - starttime;
        printf("Time Elapsed: %f Secs , for adding two (%d , %d ) dimensional matrices\n",
            elapsedtime, dataSet.n, dataSet.m
        );
        elapsed_time_sum += elapsedtime;
        //printDataSet(dataSet);
    }
    printf("average running time : %f\n", elapsed_time_sum / EXP_NUM);
    closeDataSet(dataSet);
    //system("PAUSE");
    return EXIT_SUCCESS;
}

void fillDataSet(DataSet* dataSet) {
    int i, j;

    dataSet->A = (int*)malloc(sizeof(int) * dataSet->n * dataSet->m);
    dataSet->B = (int*)malloc(sizeof(int) * dataSet->n * dataSet->m);
    dataSet->C = (int*)malloc(sizeof(int) * dataSet->n * dataSet->m);

    srand(time(NULL));

    for (i = 0; i < dataSet->n; i++) {
        for (j = 0; j < dataSet->m; j++) {
            dataSet->A[i * dataSet->m + j] = rand() % 100;
            dataSet->B[i * dataSet->m + j] = rand() % 100;
        }
    }
}

void printDataSet(DataSet dataSet) {
    int i, j;

    printf("[-] Matrix A\n");
    for (i = 0; i < dataSet.n; i++) {
        for (j = 0; j < dataSet.m; j++) {
            printf("%-4d", dataSet.A[i * dataSet.m + j]);
        }
        putchar('\n');
    }

    printf("[-] Matrix B\n");
    for (i = 0; i < dataSet.n; i++) {
        for (j = 0; j < dataSet.m; j++) {
            printf("%-4d", dataSet.B[i * dataSet.m + j]);
        }
        putchar('\n');
    }

    printf("[-] Matrix C\n");
    for (i = 0; i < dataSet.n; i++) {
        for (j = 0; j < dataSet.m; j++) {
            printf("%-8d", dataSet.C[i * dataSet.m + j]);
        }
        putchar('\n');
    }
}

void closeDataSet(DataSet dataSet) {
    free(dataSet.A);
    free(dataSet.B);
    free(dataSet.C);
}

void add_block(DataSet dataSet, int i, int j) {
    for (int k = i * BLOCK_SIZE; k < (i + 1) * BLOCK_SIZE && k < dataSet.n; k++) {
        for (int p = j * BLOCK_SIZE; p < (j + 1) * BLOCK_SIZE && p < dataSet.m; p++) {
            dataSet.C[k * dataSet.m + p] = dataSet.A[k * dataSet.m + p] + dataSet.B[k * dataSet.m + p];
            //printf("%d ", dataSet.C[k * dataSet.m + p]);
        }
        //printf("\n");
    }
}

void add(DataSet dataSet) {
    int row_block = (int)ceil(dataSet.n / (double)BLOCK_SIZE);
    int col_block = (int)ceil(dataSet.m / (double)BLOCK_SIZE);
#pragma omp parallel for num_threads(NUM_THREADS) collapse(2)
    for (int i = 0; i < row_block; i++) {
        for (int j = 0; j < col_block; j++) {
            //printf("block : %d -- %d\n", i, j);
            add_block(dataSet, i, j);
        }
    }
}
--------------------------------------------------------------------------------
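block_parallel.cpp tiles the matrix into BLOCK_SIZE x BLOCK_SIZE blocks and collapses the two tile loops. For a purely element-wise operation like addition, the 2-D structure is not essential: a hedged sketch of the simplest partition (function name hypothetical, DataSet as defined above) treats the n*m elements as one flat array, giving each thread a long contiguous, cache-friendly range.

void add_flat(DataSet dataSet)
{
    int total = dataSet.n * dataSet.m;
    // one contiguous range per thread: purely sequential memory accesses
#pragma omp parallel for num_threads(NUM_THREADS)
    for (int i = 0; i < total; i++)
        dataSet.C[i] = dataSet.A[i] + dataSet.B[i];
}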
/LAB2/row_parallel.cpp:
--------------------------------------------------------------------------------
/*
 * In His Exalted Name
 * Matrix Addition - Sequential Code
 * Ahmad Siavashi, Email: siavashi@aut.ac.ir
 * 15/04/2018
 */

// Let it be.
#define _CRT_SECURE_NO_WARNINGS

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

typedef struct {
    int* A, * B, * C;
    int n, m;
} DataSet;

void fillDataSet(DataSet* dataSet);
void printDataSet(DataSet dataSet);
void closeDataSet(DataSet dataSet);
void add(DataSet dataSet);

#define EXP_NUM 10
#define NUM_THREADS 8

int main(int argc, char* argv[]) {
#ifndef _OPENMP
    printf("OpenMP is not supported, sorry!\n");
    getchar();
    return 0;
#endif
    double elapsed_time_sum = 0.0;
    DataSet dataSet;
    if (argc < 3) {
        printf("[-] Invalid No. of arguments.\n");
        printf("[-] Try -> <n> <m>\n");
        printf(">>> ");
        scanf("%d %d", &dataSet.n, &dataSet.m);
    }
    else {
        dataSet.n = atoi(argv[1]);
        dataSet.m = atoi(argv[2]);
    }
    fillDataSet(&dataSet);
    for (int i = 0; i < EXP_NUM; i++) {
        double starttime = omp_get_wtime();
        add(dataSet);
        double elapsedtime = omp_get_wtime() - starttime;
        printf("Time Elapsed: %f Secs , for adding two (%d , %d ) dimensional matrices\n",
            elapsedtime, dataSet.n, dataSet.m
        );
        elapsed_time_sum += elapsedtime;
        //printDataSet(dataSet);
    }
    printf("average running time : %f\n", elapsed_time_sum / EXP_NUM);
    closeDataSet(dataSet);
    //system("PAUSE");
    return EXIT_SUCCESS;
}

void fillDataSet(DataSet* dataSet) {
    int i, j;

    dataSet->A = (int*)malloc(sizeof(int) * dataSet->n * dataSet->m);
    dataSet->B = (int*)malloc(sizeof(int) * dataSet->n * dataSet->m);
    dataSet->C = (int*)malloc(sizeof(int) * dataSet->n * dataSet->m);

    srand(time(NULL));

    for (i = 0; i < dataSet->n; i++) {
        for (j = 0; j < dataSet->m; j++) {
            dataSet->A[i * dataSet->m + j] = rand() % 100;
            dataSet->B[i * dataSet->m + j] = rand() % 100;
        }
    }
}

void printDataSet(DataSet dataSet) {
    int i, j;

    printf("[-] Matrix A\n");
    for (i = 0; i < dataSet.n; i++) {
        for (j = 0; j < dataSet.m; j++) {
            printf("%-4d", dataSet.A[i * dataSet.m + j]);
        }
        putchar('\n');
    }

    printf("[-] Matrix B\n");
    for (i = 0; i < dataSet.n; i++) {
        for (j = 0; j < dataSet.m; j++) {
            printf("%-4d", dataSet.B[i * dataSet.m + j]);
        }
        putchar('\n');
    }

    printf("[-] Matrix C\n");
    for (i = 0; i < dataSet.n; i++) {
        for (j = 0; j < dataSet.m; j++) {
            printf("%-8d", dataSet.C[i * dataSet.m + j]);
        }
        putchar('\n');
    }
}

void closeDataSet(DataSet dataSet) {
    free(dataSet.A);
    free(dataSet.B);
    free(dataSet.C);
}

void add(DataSet dataSet) {
#pragma omp parallel for num_threads(NUM_THREADS)
    for (int i = 0; i < dataSet.n; i++) {
        for (int j = 0; j < dataSet.m; j++) {
            dataSet.C[i * dataSet.m + j] = dataSet.A[i * dataSet.m + j] + dataSet.B[i * dataSet.m + j];
        }
    }
}
--------------------------------------------------------------------------------
/LAB3/LAB3.cpp:
--------------------------------------------------------------------------------
// Example Program
// Optimizes code for maximum speed
#pragma optimize( "2", on )
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <windows.h>
#include <mmsystem.h>
// Adds an additional library so that timeGetTime() can be used
#pragma comment(lib, "winmm.lib")
const long int VERYBIG = 100000;
// ***********************************************************************
int main(void)
{
    int i;
    long int j, k, sum;
    double sumx, sumy, total;
    DWORD starttime, elapsedtime;
    // -----------------------------------------------------------------------
    // Output a start message
    printf("Parallel Timings for %ld iterations\n\n", VERYBIG);
    // repeat experiment several times
    for (i = 0; i < 1; i++)
    {
        // get starting time
        starttime = timeGetTime();
        // reset check sum & running total
        sum = 0;
        total = 0.0;
        // Work Loop, do some work by looping VERYBIG times
#pragma omp parallel for num_threads(8) private(sumx, sumy, k) reduction(+ : sum, total) schedule(dynamic, 2000)
        for (j = 0; j < VERYBIG; j++)
        {
            // increment check sum
            sum += 1;
            // Calculate first arithmetic series
            sumx = 0.0;
            for (k = 0; k < j; k++)
                sumx = sumx + (double)k;
            // Calculate second arithmetic series
            sumy = 0.0;
            for (k = j; k > 0; k--)
                sumy = sumy + (double)k;
            if (sumx > 0.0)total = total + 1.0 / sqrt(sumx);
            if (sumy > 0.0)total = total + 1.0 / sqrt(sumy);
        }
        // get ending time and use it to determine elapsed time
        elapsedtime = timeGetTime() - starttime;
        // report elapsed time
        printf("Time Elapsed %10d mSecs Total = %lf Check Sum = %ld\n",
            (int)elapsedtime, total, sum);
    }
    // return integer as required by function header
    return 0;
}
// **********************************************************************
--------------------------------------------------------------------------------
/LAB3/LAB3_9731107.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/LAB3/LAB3_9731107.pdf
--------------------------------------------------------------------------------
/LAB3/Manual 3-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/LAB3/Manual 3-1.pdf
--------------------------------------------------------------------------------
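LAB3.cpp requests schedule(dynamic, 2000). Because iteration j does O(j) work, the workload is triangular: with the default static schedule the thread holding the last chunk does far more work than the first, while dynamic hands out 2000-iteration chunks on demand, keeping all threads busy until the end. A self-contained sketch for comparing clauses (timings are machine-dependent; swap in schedule(static) or schedule(guided) to see the difference):

#include <omp.h>
#include <stdio.h>

int main(void)
{
    const long n = 100000;
    double total = 0.0;
    double t0 = omp_get_wtime();
    // triangular workload like LAB3's: iteration j costs O(j)
#pragma omp parallel for reduction(+ : total) schedule(dynamic, 2000)
    for (long j = 0; j < n; j++)
        for (long k = 0; k < j; k++)
            total += 1e-9;
    printf("%f secs (total = %f)\n", omp_get_wtime() - t0, total);
    return 0;
}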
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "gcc-9 - Build and debug active file", 9 | "type": "cppdbg", 10 | "request": "launch", 11 | "program": "${fileDirname}/${fileBasenameNoExtension}", 12 | "args": [], 13 | "stopAtEntry": false, 14 | "cwd": "${fileDirname}", 15 | "environment": [], 16 | "externalConsole": false, 17 | "MIMode": "gdb", 18 | "setupCommands": [ 19 | { 20 | "description": "Enable pretty-printing for gdb", 21 | "text": "-enable-pretty-printing", 22 | "ignoreFailures": true 23 | } 24 | ], 25 | "preLaunchTask": "C/C++: gcc-9 build active file", 26 | "miDebuggerPath": "/usr/bin/gdb" 27 | } 28 | ] 29 | } -------------------------------------------------------------------------------- /LAB4/.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": [ 3 | { 4 | "type": "cppbuild", 5 | "label": "C/C++: gcc-9 build active file", 6 | "command": "/usr/bin/gcc-9", 7 | "args": [ 8 | "-g", 9 | "${file}", 10 | "-o", 11 | "${fileDirname}/${fileBasenameNoExtension}" 12 | ], 13 | "options": { 14 | "cwd": "${fileDirname}" 15 | }, 16 | "problemMatcher": [ 17 | "$gcc" 18 | ], 19 | "group": { 20 | "kind": "build", 21 | "isDefault": true 22 | }, 23 | "detail": "Task generated by Debugger." 24 | } 25 | ], 26 | "version": "2.0.0" 27 | } -------------------------------------------------------------------------------- /LAB4/LAB4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/LAB4/LAB4.pdf -------------------------------------------------------------------------------- /LAB4/hist.c: -------------------------------------------------------------------------------- 1 | // Let it be. 
2 | #define _CRT_SECURE_NO_WARNINGS 3 | 4 | #include <stdio.h> 5 | #include <stdlib.h> 6 | #include <string.h> 7 | #include <omp.h> 8 | 9 | void fill_array(int *a, size_t n); 10 | void prefix_sum(int *a, size_t n); 11 | void print_array(int *a, size_t n); 12 | void s_prefix_sum(int *a, size_t n); 13 | void p_prefix_sum(int *a, size_t n); 14 | void h_prefix_sum(int *a, size_t n); 15 | 16 | #define NUM_THREADS 8 17 | #define EXP_NUM 5 18 | 19 | int main(int argc, char *argv[]) 20 | { 21 | 22 | #ifndef _OPENMP 23 | printf("OpenMP is not supported, sorry!\n"); 24 | getchar(); 25 | return 0; 26 | #endif 27 | 28 | unsigned int p; 29 | 30 | printf("[-] enter 0 for the serial scan, 1 for the chunked parallel scan, or 2 for the Hillis-Steele scan : \n"); 31 | scanf("%u", &p); 32 | 33 | unsigned int n = 0; 34 | printf("[-] Please enter N: "); 35 | scanf("%u", &n); 36 | double summ = 0; 37 | 38 | for(int i = 0 ; i < EXP_NUM ; i++) 39 | { 40 | int *a = (int *)malloc(n * sizeof *a); 41 | 42 | fill_array(a, n); 43 | 44 | if (p == 1) 45 | { 46 | double start = omp_get_wtime(); 47 | p_prefix_sum(a, n); 48 | double end = omp_get_wtime() - start; 49 | printf("parallel time for %d threads : %lf\n", NUM_THREADS, end); 50 | summ += end; 51 | } 52 | else if(p == 0) 53 | { 54 | double start = omp_get_wtime(); 55 | s_prefix_sum(a, n); 56 | double end = omp_get_wtime() - start; 57 | printf("serial time : %lf\n", end); 58 | summ += end; 59 | 60 | }else if(p == 2){ 61 | double start = omp_get_wtime(); 62 | h_prefix_sum(a, n); 63 | double end = omp_get_wtime() - start; 64 | printf("parallel time for %d threads (h&s): %lf\n", NUM_THREADS, end); 65 | summ += end; 66 | } 67 | 68 | free(a); 69 | 70 | } 71 | printf("average time : %f \n" , (summ/EXP_NUM)); 72 | 73 | return EXIT_SUCCESS; 74 | } 75 | 76 | void p_prefix_sum(int *a, size_t n) 77 | { 78 | int end[NUM_THREADS]; 79 | 80 | #pragma omp parallel num_threads(NUM_THREADS) 81 | { 82 | int th_num = omp_get_thread_num(); 83 | 84 | int index = -1; 85 | #pragma omp for schedule(static) 86 | for (int i = 1; i < n; i++) 87 | { 88 | a[i] = a[i] + a[i - 1]; 89 | index = i; 90 | } 91 | end[th_num] = index; // last index this thread scanned (assumes n > NUM_THREADS so every thread gets a chunk) 92 | 93 | #pragma omp barrier 94 | 95 | #pragma omp single 96 | { 97 | // carry each chunk's total into the next chunk's last element, left to 98 | // right, so every end[k] becomes a true global prefix sum 99 | for (int k = 0; k < NUM_THREADS - 1; k++) 100 | { 101 | a[end[k + 1]] += a[end[k]]; 102 | } 103 | 104 | } 105 | 106 | #pragma omp for 107 | for (int k = 0; k < NUM_THREADS - 1; k++) 108 | { 109 | int start = end[k] + 1; 110 | int endd = end[k + 1] - 1; 111 | int last_val = a[end[k]]; 112 | for (int j = start; j <= endd; j++) // include endd itself; end[k+1] was already fixed above 113 | a[j] += last_val; 114 | } 115 | } 116 | } 117 | void s_prefix_sum(int *a, size_t n) 118 | { 119 | for (int i = 1; i < n; ++i) 120 | { 121 | a[i] = a[i] + a[i - 1]; 122 | } 123 | } 124 | 125 | void h_prefix_sum(int *a, size_t n) 126 | { 127 | for(long step = 1 ; step < n ; step *= 2){ 128 | int tmp[n]; // VLA scratch buffer; fine for modest n, but would overflow the stack for very large n 129 | 130 | #pragma omp parallel num_threads(NUM_THREADS) 131 | { 132 | #pragma omp single 133 | { 134 | for(int i = 0 ; i < n - step ; i++){ 135 | #pragma omp task 136 | { 137 | tmp[i + step] = a[i] + a[i + step]; 138 | } 139 | } 140 | } 141 | 142 | #pragma omp barrier 143 | 144 | #pragma omp for 145 | for(int i = step ; i < n ; i++) // a[0..step-1] are unchanged by this sweep, so only copy back from index step 146 | { 147 | a[i] = tmp[i]; 148 | } 149 | 150 | } 151 | } 152 | } 153 | 154 | void print_array(int *a, size_t n) 155 | { 156 | int i; 157 | printf("[-] array: "); 158 | for (i = 0; i < n; ++i) 159 | { 160 | printf("%d, ", a[i]); 161 | } 162 | printf("\b\b \n"); 163 | } 164 | 165 | void fill_array(int *a, size_t n) 166 | { 167 | int i; 168 | for (i = 0; i < n; ++i) 169 | { 170 |
a[i] = i + 1; 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /LAB5/deviceQuery.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2007 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO USER: 5 | * 6 | * This source code is subject to NVIDIA ownership rights under U.S. and 7 | * international Copyright laws. Users and possessors of this source code 8 | * are hereby granted a nonexclusive, royalty-free license to use this code 9 | * in individual and commercial software. 10 | * 11 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 12 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 13 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 14 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 15 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 16 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 17 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 18 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 19 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 20 | * OR PERFORMANCE OF THIS SOURCE CODE. 21 | * 22 | * U.S. Government End Users. This source code is a "commercial item" as 23 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 24 | * "commercial computer software" and "commercial computer software 25 | * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 26 | * and is provided to the U.S. Government only as a commercial end item. 27 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 28 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 29 | * source code with only those rights set forth herein. 30 | * 31 | * Any use of this source code in individual and commercial software must 32 | * include, in the user documentation and internal comments to the code, 33 | * the above Disclaimer and U.S. Government End Users Notice. 34 | */ 35 | 36 | /* This sample queries the properties of the CUDA devices present in the system. 
*/ 37 | 38 | // includes, system 39 | #include <stdlib.h> 40 | #include <stdio.h> 41 | #include <string.h> 42 | 43 | // includes, project 44 | //#include <cutil.h> 45 | 46 | //////////////////////////////////////////////////////////////////////////////// 47 | // Program main 48 | //////////////////////////////////////////////////////////////////////////////// 49 | int 50 | main(int argc, char** argv) 51 | { 52 | int deviceCount; 53 | cudaGetDeviceCount(&deviceCount); 54 | if (deviceCount == 0) 55 | printf("There is no device supporting CUDA\n"); 56 | int dev; 57 | for (dev = 0; dev < deviceCount; ++dev) { 58 | cudaDeviceProp deviceProp; 59 | cudaGetDeviceProperties(&deviceProp, dev); 60 | if (dev == 0) { 61 | if (deviceProp.major == 9999 && deviceProp.minor == 9999) 62 | printf("There is no device supporting CUDA.\n"); 63 | else if (deviceCount == 1) 64 | printf("There is 1 device supporting CUDA\n"); 65 | else 66 | printf("There are %d devices supporting CUDA\n", deviceCount); 67 | } 68 | printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); 69 | printf(" Major revision number: %d\n", 70 | deviceProp.major); 71 | printf(" Minor revision number: %d\n", 72 | deviceProp.minor); 73 | printf(" Total amount of global memory: %u bytes\n", 74 | (unsigned int)deviceProp.totalGlobalMem); 75 | #if CUDART_VERSION >= 2000 76 | printf(" Number of multiprocessors: %d\n", 77 | deviceProp.multiProcessorCount); 78 | printf(" Number of cores: %d\n", 79 | 8 * deviceProp.multiProcessorCount); 80 | #endif 81 | printf(" Total amount of constant memory: %u bytes\n", 82 | (unsigned int)deviceProp.totalConstMem); 83 | printf(" Total amount of shared memory per block: %u bytes\n", 84 | (unsigned int)deviceProp.sharedMemPerBlock); 85 | printf(" Total number of registers available per block: %d\n", 86 | deviceProp.regsPerBlock); 87 | printf(" Warp size: %d\n", 88 | deviceProp.warpSize); 89 | printf(" Maximum number of threads per block: %d\n", 90 | deviceProp.maxThreadsPerBlock); 91 | printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", 92 | deviceProp.maxThreadsDim[0], 93 | deviceProp.maxThreadsDim[1], 94 | deviceProp.maxThreadsDim[2]); 95 | printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", 96 | deviceProp.maxGridSize[0], 97 | deviceProp.maxGridSize[1], 98 | deviceProp.maxGridSize[2]); 99 | printf(" Maximum memory pitch: %u bytes\n", 100 | (unsigned int)deviceProp.memPitch); 101 | printf(" Texture alignment: %u bytes\n", 102 | (unsigned int)deviceProp.textureAlignment); 103 | printf(" Clock rate: %.2f GHz\n", 104 | deviceProp.clockRate * 1e-6f); 105 | #if CUDART_VERSION >= 2000 106 | printf(" Concurrent copy and execution: %s\n", 107 | deviceProp.deviceOverlap ?
"Yes" : "No"); 108 | #endif 109 | } 110 | printf("\nTEST PASSED\n"); 111 | } 112 | -------------------------------------------------------------------------------- /LAB5/گزارش.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/LAB5/گزارش.pdf -------------------------------------------------------------------------------- /LAB6/LAB6_9731107.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/LAB6/LAB6_9731107.pdf -------------------------------------------------------------------------------- /LAB6/Manual 6-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/LAB6/Manual 6-1.pdf -------------------------------------------------------------------------------- /LAB6/matmul: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/LAB6/matmul -------------------------------------------------------------------------------- /LAB6/matmul.cu: -------------------------------------------------------------------------------- 1 | // System includes 2 | #include 3 | #include 4 | #include 5 | 6 | // CUDA runtime 7 | #include 8 | #include 9 | 10 | 11 | /** 12 | * Matrix multiplication (CUDA Kernel) on the device: C = A * B 13 | */ 14 | #define TILE_WIDTH 16 15 | __global__ void 16 | matrixMulCUDA(float *C, float *A, float *B, int n) 17 | { 18 | int k; 19 | int row = threadIdx.y , col = threadIdx.x; 20 | float sum = 0.0f; 21 | for(k = 0 ; k < n ; ++k){ 22 | sum += A[row * n + k] * B[k * n + col]; 23 | } 24 | C[row * n + col] = sum; 25 | } 26 | 27 | void constantInit(float *data, int size, float val) 28 | { 29 | for (int i = 0; i < size; ++i) 30 | { 31 | data[i] = val; 32 | } 33 | } 34 | 35 | /** 36 | * Run a simple test of matrix multiplication using CUDA 37 | */ 38 | int matrixMultiply(int argc, char **argv, int n) 39 | { 40 | // Allocate host memory for matrices A and B 41 | unsigned int size_A = n * n; 42 | unsigned int mem_size_A = sizeof(float)* size_A; 43 | float *h_A = (float *)malloc(mem_size_A); 44 | unsigned int size_B = n * n; 45 | unsigned int mem_size_B = sizeof(float)* size_B; 46 | float *h_B = (float *)malloc(mem_size_B); 47 | 48 | // Initialize host memory 49 | const float valB = 0.01f; 50 | constantInit(h_A, size_A, 1.0f); 51 | constantInit(h_B, size_B, valB); 52 | 53 | // Allocate device memory 54 | float *d_A, *d_B, *d_C; 55 | 56 | // Allocate host matrix C 57 | unsigned int mem_size_C = n * n * sizeof(float); 58 | float *h_C = (float *)malloc(mem_size_C); 59 | 60 | if (h_C == NULL) 61 | { 62 | fprintf(stderr, "Failed to allocate host matrix C!\n"); 63 | exit(EXIT_FAILURE); 64 | } 65 | 66 | cudaError_t error; 67 | 68 | error = cudaMalloc((void **)&d_A, mem_size_A); 69 | 70 | if (error != cudaSuccess) 71 | { 72 | printf("cudaMalloc d_A returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 73 | exit(EXIT_FAILURE); 74 | } 75 | 76 | error = cudaMalloc((void **)&d_B, mem_size_B); 77 | 78 | if (error != cudaSuccess) 79 | { 80 | printf("cudaMalloc d_B returned error %s (code %d), line(%d)\n", 
cudaGetErrorString(error), error, __LINE__); 81 | exit(EXIT_FAILURE); 82 | } 83 | 84 | error = cudaMalloc((void **)&d_C, mem_size_C); 85 | 86 | if (error != cudaSuccess) 87 | { 88 | printf("cudaMalloc d_C returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 89 | exit(EXIT_FAILURE); 90 | } 91 | 92 | // copy host memory to device 93 | error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); 94 | 95 | if (error != cudaSuccess) 96 | { 97 | printf("cudaMemcpy (d_A,h_A) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 98 | exit(EXIT_FAILURE); 99 | } 100 | 101 | error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); 102 | 103 | if (error != cudaSuccess) 104 | { 105 | printf("cudaMemcpy (d_B,h_B) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 106 | exit(EXIT_FAILURE); 107 | } 108 | 109 | // Setup execution parameters 110 | dim3 threads(32, 32,1); // a single 32x32 block... 111 | dim3 grid(1,1,1); // ...so for n > 32 only a 32x32 tile of the result is computed 112 | 113 | // Create and start timer 114 | printf("Computing result using CUDA Kernel...\n"); 115 | 116 | // Allocate CUDA events that we'll use for timing 117 | cudaEvent_t start; 118 | error = cudaEventCreate(&start); 119 | 120 | if (error != cudaSuccess) 121 | { 122 | fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error)); 123 | exit(EXIT_FAILURE); 124 | } 125 | 126 | cudaEvent_t stop; 127 | error = cudaEventCreate(&stop); 128 | 129 | if (error != cudaSuccess) 130 | { 131 | fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error)); 132 | exit(EXIT_FAILURE); 133 | } 134 | 135 | // Record the start event 136 | error = cudaEventRecord(start, NULL); 137 | 138 | if (error != cudaSuccess) 139 | { 140 | fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error)); 141 | exit(EXIT_FAILURE); 142 | } 143 | 144 | // Execute the kernel 145 | matrixMulCUDA<<<grid, threads>>>(d_C, d_A, d_B, n); 146 | 147 | error = cudaGetLastError(); 148 | if (error != cudaSuccess) 149 | { 150 | fprintf(stderr, "Failed to launch kernel! (error: %s)\n", cudaGetErrorString(error)); 151 | exit(EXIT_FAILURE); 152 | } 153 | 154 | // Record the stop event 155 | error = cudaEventRecord(stop, NULL); 156 | 157 | if (error != cudaSuccess) 158 | { 159 | fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error)); 160 | exit(EXIT_FAILURE); 161 | } 162 | 163 | // Wait for the stop event to complete 164 | error = cudaEventSynchronize(stop); 165 | 166 | if (error != cudaSuccess) 167 | { 168 | fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error)); 169 | exit(EXIT_FAILURE); 170 | } 171 | 172 | float msecTotal = 0.0f; 173 | error = cudaEventElapsedTime(&msecTotal, start, stop); 174 | 175 | printf("Elapsed time in msec = %f\n", msecTotal); 176 | 177 | if (error != cudaSuccess) 178 | { 179 | fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error)); 180 | exit(EXIT_FAILURE); 181 | } 182 | 183 | // Copy result from device to host 184 | error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); 185 | 186 | if (error != cudaSuccess) 187 | { 188 | printf("cudaMemcpy (h_C,d_C) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 189 | exit(EXIT_FAILURE); 190 | } 191 | 192 | 193 | // Clean up memory 194 | free(h_A); 195 | free(h_B); 196 | free(h_C); 197 | cudaFree(d_A); 198 |
cudaFree(d_B); 199 | cudaFree(d_C); 200 | 201 | return EXIT_SUCCESS; 202 | 203 | } 204 | 205 | 206 | /** 207 | * Program main 208 | */ 209 | int main(int argc, char **argv) 210 | { 211 | printf("[Matrix Multiply Using CUDA] - Starting...\n"); 212 | 213 | // By default, we use device 0 214 | int devID = 0; 215 | cudaSetDevice(devID); 216 | 217 | cudaError_t error; 218 | cudaDeviceProp deviceProp; 219 | error = cudaGetDevice(&devID); 220 | 221 | if (error != cudaSuccess) 222 | { 223 | printf("cudaGetDevice returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 224 | } 225 | 226 | error = cudaGetDeviceProperties(&deviceProp, devID); 227 | 228 | if (deviceProp.computeMode == cudaComputeModeProhibited) 229 | { 230 | fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n"); 231 | exit(EXIT_SUCCESS); 232 | } 233 | 234 | if (error != cudaSuccess) 235 | { 236 | printf("cudaGetDeviceProperties returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 237 | } 238 | else 239 | { 240 | printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); 241 | } 242 | 243 | // Size of square matrices 244 | int n = 0; 245 | printf("[-] N = "); 246 | scanf("%d", &n); 247 | 248 | printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", n, n, n, n); 249 | 250 | int matrix_result = matrixMultiply(argc, argv, n); 251 | 252 | exit(matrix_result); 253 | } 254 | -------------------------------------------------------------------------------- /LAB6/matmul1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/LAB6/matmul1 -------------------------------------------------------------------------------- /LAB6/matmul1.cu: -------------------------------------------------------------------------------- 1 | // System includes 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | #include <assert.h> 5 | 6 | // CUDA runtime 7 | #include <cuda_runtime.h> 8 | #include <device_launch_parameters.h> 9 | 10 | /** 11 | * Matrix multiplication (CUDA Kernel) on the device: C = A * B 12 | */ 13 | 14 | #define N 2048 15 | 16 | #define BLOCK_SIZE 16 17 | 18 | #define TILE_WIDTH (N/BLOCK_SIZE) // = 128: each thread computes a 128x128 tile of C 19 | __global__ void 20 | matrixMulCUDA(float *C, float *A, float *B, int n) 21 | { 22 | int start_row = threadIdx.y * TILE_WIDTH; 23 | int end_row = start_row + TILE_WIDTH; 24 | int start_col = threadIdx.x * TILE_WIDTH; 25 | int end_col = start_col + TILE_WIDTH; 26 | for (int row = start_row; row < end_row; row++) 27 | { 28 | for (int col = start_col; col < end_col; col++) 29 | { 30 | float C_val = 0; 31 | for (int k = 0; k < n; ++k) 32 | { 33 | float A_elem = A[row * n + k]; 34 | float B_elem = B[k * n + col]; 35 | C_val += A_elem * B_elem; 36 | } 37 | C[row * n + col] = C_val; 38 | } 39 | } 40 | } 41 | 42 | void constantInit(float *data, int size, float val) 43 | { 44 | for (int i = 0; i < size; ++i) 45 | { 46 | data[i] = val; 47 | } 48 | } 49 | 50 | /** 51 | * Run a simple test of matrix multiplication using CUDA 52 | */ 53 | int matrixMultiply(int argc, char **argv, int n) 54 | { 55 | // Allocate host memory for matrices A and B 56 | unsigned int size_A = n * n; 57 | unsigned int mem_size_A = sizeof(float) * size_A; 58 | float *h_A = (float *)malloc(mem_size_A); 59 | unsigned int size_B = n * n; 60 | unsigned int mem_size_B = sizeof(float) * size_B; 61 | float *h_B = (float *)malloc(mem_size_B); 62 | 63 | // Initialize host memory 64 | const
float valB = 0.01f; 65 | constantInit(h_A, size_A, 1.0f); 66 | constantInit(h_B, size_B, valB); 67 | 68 | // Allocate device memory 69 | float *d_A, *d_B, *d_C; 70 | 71 | // Allocate host matrix C 72 | unsigned int mem_size_C = n * n * sizeof(float); 73 | float *h_C = (float *)malloc(mem_size_C); 74 | 75 | if (h_C == NULL) 76 | { 77 | fprintf(stderr, "Failed to allocate host matrix C!\n"); 78 | exit(EXIT_FAILURE); 79 | } 80 | 81 | cudaError_t error; 82 | 83 | error = cudaMalloc((void **)&d_A, mem_size_A); 84 | 85 | if (error != cudaSuccess) 86 | { 87 | printf("cudaMalloc d_A returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 88 | exit(EXIT_FAILURE); 89 | } 90 | 91 | error = cudaMalloc((void **)&d_B, mem_size_B); 92 | 93 | if (error != cudaSuccess) 94 | { 95 | printf("cudaMalloc d_B returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 96 | exit(EXIT_FAILURE); 97 | } 98 | 99 | error = cudaMalloc((void **)&d_C, mem_size_C); 100 | 101 | if (error != cudaSuccess) 102 | { 103 | printf("cudaMalloc d_C returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 104 | exit(EXIT_FAILURE); 105 | } 106 | 107 | // copy host memory to device 108 | error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); 109 | 110 | if (error != cudaSuccess) 111 | { 112 | printf("cudaMemcpy (d_A,h_A) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 113 | exit(EXIT_FAILURE); 114 | } 115 | 116 | error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); 117 | 118 | if (error != cudaSuccess) 119 | { 120 | printf("cudaMemcpy (d_B,h_B) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 121 | exit(EXIT_FAILURE); 122 | } 123 | 124 | // Setup execution parameters 125 | dim3 threads(BLOCK_SIZE, BLOCK_SIZE, 1); 126 | dim3 grid(1, 1, 1); // one 16x16 block; each thread covers a 128x128 tile, so the full 2048x2048 C is computed 127 | 128 | // Create and start timer 129 | printf("Computing result using CUDA Kernel...\n"); 130 | 131 | // Allocate CUDA events that we'll use for timing 132 | cudaEvent_t start; 133 | error = cudaEventCreate(&start); 134 | 135 | if (error != cudaSuccess) 136 | { 137 | fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error)); 138 | exit(EXIT_FAILURE); 139 | } 140 | 141 | cudaEvent_t stop; 142 | error = cudaEventCreate(&stop); 143 | 144 | if (error != cudaSuccess) 145 | { 146 | fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error)); 147 | exit(EXIT_FAILURE); 148 | } 149 | 150 | // Record the start event 151 | error = cudaEventRecord(start, NULL); 152 | 153 | if (error != cudaSuccess) 154 | { 155 | fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error)); 156 | exit(EXIT_FAILURE); 157 | } 158 | 159 | // Execute the kernel 160 | matrixMulCUDA<<<grid, threads>>>(d_C, d_A, d_B, n); 161 | 162 | error = cudaGetLastError(); 163 | if (error != cudaSuccess) 164 | { 165 | fprintf(stderr, "Failed to launch kernel! (error: %s)\n", cudaGetErrorString(error)); 166 | exit(EXIT_FAILURE); 167 | } 168 | 169 | // Record the stop event 170 | error = cudaEventRecord(stop, NULL); 171 | 172 | if (error != cudaSuccess) 173 | { 174 | fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error)); 175 | exit(EXIT_FAILURE); 176 | } 177 | 178 | // Wait for the stop event to complete 179 | error = cudaEventSynchronize(stop); 180 | 181 | if (error != cudaSuccess) 182 | { 183 | fprintf(stderr, "Failed to synchronize on the stop
event (error code %s)!\n", cudaGetErrorString(error)); 184 | exit(EXIT_FAILURE); 185 | } 186 | 187 | float msecTotal = 0.0f; 188 | error = cudaEventElapsedTime(&msecTotal, start, stop); 189 | 190 | printf("Elapsed time in msec = %f\n", msecTotal); 191 | 192 | if (error != cudaSuccess) 193 | { 194 | fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error)); 195 | exit(EXIT_FAILURE); 196 | } 197 | 198 | // Copy result from device to host 199 | error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); 200 | 201 | if (error != cudaSuccess) 202 | { 203 | printf("cudaMemcpy (h_C,d_C) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 204 | exit(EXIT_FAILURE); 205 | } 206 | 207 | // Clean up memory 208 | free(h_A); 209 | free(h_B); 210 | free(h_C); 211 | cudaFree(d_A); 212 | cudaFree(d_B); 213 | cudaFree(d_C); 214 | 215 | return EXIT_SUCCESS; 216 | } 217 | 218 | /** 219 | * Program main 220 | */ 221 | int main(int argc, char **argv) 222 | { 223 | printf("[Matrix Multiply Using CUDA] - Starting...\n"); 224 | 225 | // By default, we use device 0 226 | int devID = 0; 227 | cudaSetDevice(devID); 228 | 229 | cudaError_t error; 230 | cudaDeviceProp deviceProp; 231 | error = cudaGetDevice(&devID); 232 | 233 | if (error != cudaSuccess) 234 | { 235 | printf("cudaGetDevice returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 236 | } 237 | 238 | error = cudaGetDeviceProperties(&deviceProp, devID); 239 | 240 | if (deviceProp.computeMode == cudaComputeModeProhibited) 241 | { 242 | fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n"); 243 | exit(EXIT_SUCCESS); 244 | } 245 | 246 | if (error != cudaSuccess) 247 | { 248 | printf("cudaGetDeviceProperties returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 249 | } 250 | else 251 | { 252 | printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); 253 | } 254 | 255 | // Size of square matrices 256 | int n = N; 257 | // printf("[-] N = "); 258 | // scanf("%u", &n); 259 | 260 | printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", n, n, n, n); 261 | 262 | int matrix_result = matrixMultiply(argc, argv, n); 263 | 264 | exit(matrix_result); 265 | } 266 | -------------------------------------------------------------------------------- /LAB6/matmul2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/LAB6/matmul2 -------------------------------------------------------------------------------- /LAB6/matmul2.cu: -------------------------------------------------------------------------------- 1 | // System includes 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | #include <assert.h> 5 | 6 | // CUDA runtime 7 | #include <cuda_runtime.h> 8 | #include <device_launch_parameters.h> 9 | 10 | /** 11 | * Matrix multiplication (CUDA Kernel) on the device: C = A * B 12 | */ 13 | #define BLOCK_SIZE 16 14 | #define N 2048 15 | 16 | __global__ void 17 | matrixMulCUDA(float *C, float *A, float *B, int n) 18 | { 19 | int row = blockIdx.y * blockDim.y + threadIdx.y; // one thread per C element across the whole grid 20 | int col = blockIdx.x * blockDim.x + threadIdx.x; 21 | float C_val = 0; 22 | for (int k = 0; k < n; ++k) 23 | { 24 | float A_elem = A[row * n + k]; 25 | float B_elem = B[k * n + col]; 26 | C_val += A_elem * B_elem; 27 | } 28 | C[row * n + col] = C_val; // no bounds check, so n is assumed to be a multiple of BLOCK_SIZE 29 | } 30 | 31 | void constantInit(float *data, int size,
float val) 32 | { 33 | for (int i = 0; i < size; ++i) 34 | { 35 | data[i] = val; 36 | } 37 | } 38 | 39 | /** 40 | * Run a simple test of matrix multiplication using CUDA 41 | */ 42 | int matrixMultiply(int argc, char **argv, int n) 43 | { 44 | // Allocate host memory for matrices A and B 45 | unsigned int size_A = n * n; 46 | unsigned int mem_size_A = sizeof(float) * size_A; 47 | float *h_A = (float *)malloc(mem_size_A); 48 | unsigned int size_B = n * n; 49 | unsigned int mem_size_B = sizeof(float) * size_B; 50 | float *h_B = (float *)malloc(mem_size_B); 51 | 52 | // Initialize host memory 53 | const float valB = 0.01f; 54 | constantInit(h_A, size_A, 1.0f); 55 | constantInit(h_B, size_B, valB); 56 | 57 | // Allocate device memory 58 | float *d_A, *d_B, *d_C; 59 | 60 | // Allocate host matrix C 61 | unsigned int mem_size_C = n * n * sizeof(float); 62 | float *h_C = (float *)malloc(mem_size_C); 63 | 64 | if (h_C == NULL) 65 | { 66 | fprintf(stderr, "Failed to allocate host matrix C!\n"); 67 | exit(EXIT_FAILURE); 68 | } 69 | 70 | cudaError_t error; 71 | 72 | error = cudaMalloc((void **)&d_A, mem_size_A); 73 | 74 | if (error != cudaSuccess) 75 | { 76 | printf("cudaMalloc d_A returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 77 | exit(EXIT_FAILURE); 78 | } 79 | 80 | error = cudaMalloc((void **)&d_B, mem_size_B); 81 | 82 | if (error != cudaSuccess) 83 | { 84 | printf("cudaMalloc d_B returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 85 | exit(EXIT_FAILURE); 86 | } 87 | 88 | error = cudaMalloc((void **)&d_C, mem_size_C); 89 | 90 | if (error != cudaSuccess) 91 | { 92 | printf("cudaMalloc d_C returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 93 | exit(EXIT_FAILURE); 94 | } 95 | 96 | // copy host memory to device 97 | error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); 98 | 99 | if (error != cudaSuccess) 100 | { 101 | printf("cudaMemcpy (d_A,h_A) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 102 | exit(EXIT_FAILURE); 103 | } 104 | 105 | error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); 106 | 107 | if (error != cudaSuccess) 108 | { 109 | printf("cudaMemcpy (d_B,h_B) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 110 | exit(EXIT_FAILURE); 111 | } 112 | 113 | // Setup execution parameters 114 | dim3 threads(BLOCK_SIZE, BLOCK_SIZE, 1); 115 | dim3 grid((n-1) / BLOCK_SIZE + 1, (n-1) / BLOCK_SIZE + 1, 1); // ceiling division: ceil(n / BLOCK_SIZE) blocks per dimension, e.g. 128 for n = 2048 116 | 117 | // Create and start timer 118 | printf("Computing result using CUDA Kernel...\n"); 119 | 120 | // Allocate CUDA events that we'll use for timing 121 | cudaEvent_t start; 122 | error = cudaEventCreate(&start); 123 | 124 | if (error != cudaSuccess) 125 | { 126 | fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error)); 127 | exit(EXIT_FAILURE); 128 | } 129 | 130 | cudaEvent_t stop; 131 | error = cudaEventCreate(&stop); 132 | 133 | if (error != cudaSuccess) 134 | { 135 | fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error)); 136 | exit(EXIT_FAILURE); 137 | } 138 | 139 | // Record the start event 140 | error = cudaEventRecord(start, NULL); 141 | 142 | if (error != cudaSuccess) 143 | { 144 | fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error)); 145 | exit(EXIT_FAILURE); 146 | } 147 | 148 | // Execute the kernel 149 | matrixMulCUDA<<<grid, threads>>>(d_C, d_A, d_B, n); 150 | 151 |
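/* A possible sanity check (a sketch, not in the original): with h_A filled with 1.0f and h_B with valB, every element of C should equal n * valB, since each dot product sums n copies of 1.0f * valB. After the device-to-host copy below one could verify: float expected = n * valB; for (int i = 0; i < n * n; i++) if (fabsf(h_C[i] - expected) > 1e-2f) { printf("mismatch at %d\n", i); break; } (fabsf requires math.h; the 1e-2f tolerance is an arbitrary allowance for single-precision rounding). */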
error = cudaGetLastError(); 152 | if (error != cudaSuccess) 153 | { 154 | fprintf(stderr, "Failed to launch kernel! (error: %s)\n", cudaGetErrorString(error)); 155 | exit(EXIT_FAILURE); 156 | } 157 | 158 | // Record the stop event 159 | error = cudaEventRecord(stop, NULL); 160 | 161 | if (error != cudaSuccess) 162 | { 163 | fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error)); 164 | exit(EXIT_FAILURE); 165 | } 166 | 167 | // Wait for the stop event to complete 168 | error = cudaEventSynchronize(stop); 169 | 170 | if (error != cudaSuccess) 171 | { 172 | fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error)); 173 | exit(EXIT_FAILURE); 174 | } 175 | 176 | float msecTotal = 0.0f; 177 | error = cudaEventElapsedTime(&msecTotal, start, stop); 178 | 179 | printf("Elapsed time in msec = %f\n", msecTotal); 180 | 181 | if (error != cudaSuccess) 182 | { 183 | fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error)); 184 | exit(EXIT_FAILURE); 185 | } 186 | 187 | // Copy result from device to host 188 | error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); 189 | 190 | if (error != cudaSuccess) 191 | { 192 | printf("cudaMemcpy (h_C,d_C) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 193 | exit(EXIT_FAILURE); 194 | } 195 | 196 | // Clean up memory 197 | free(h_A); 198 | free(h_B); 199 | free(h_C); 200 | cudaFree(d_A); 201 | cudaFree(d_B); 202 | cudaFree(d_C); 203 | 204 | return EXIT_SUCCESS; 205 | } 206 | 207 | /** 208 | * Program main 209 | */ 210 | int main(int argc, char **argv) 211 | { 212 | printf("[Matrix Multiply Using CUDA] - Starting...\n"); 213 | 214 | // By default, we use device 0 215 | int devID = 0; 216 | cudaSetDevice(devID); 217 | 218 | cudaError_t error; 219 | cudaDeviceProp deviceProp; 220 | error = cudaGetDevice(&devID); 221 | 222 | if (error != cudaSuccess) 223 | { 224 | printf("cudaGetDevice returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 225 | } 226 | 227 | error = cudaGetDeviceProperties(&deviceProp, devID); 228 | 229 | if (deviceProp.computeMode == cudaComputeModeProhibited) 230 | { 231 | fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n"); 232 | exit(EXIT_SUCCESS); 233 | } 234 | 235 | if (error != cudaSuccess) 236 | { 237 | printf("cudaGetDeviceProperties returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 238 | } 239 | else 240 | { 241 | printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); 242 | } 243 | 244 | // Size of square matrices 245 | int n = N; 246 | // printf("[-] N = "); 247 | // scanf("%u", &n); 248 | 249 | printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", n, n, n, n); 250 | 251 | int matrix_result = matrixMultiply(argc, argv, n); 252 | 253 | exit(matrix_result); 254 | } 255 | -------------------------------------------------------------------------------- /LAB6/matmul3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/LAB6/matmul3 -------------------------------------------------------------------------------- /LAB6/matmul3.cu: -------------------------------------------------------------------------------- 1 | // System includes 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | #include <assert.h> 5 | 6
| // CUDA runtime 7 | #include <cuda_runtime.h> 8 | #include <device_launch_parameters.h> 9 | 10 | /** 11 | * Matrix multiplication (CUDA Kernel) on the device: C = A * B 12 | */ 13 | #define TILE_WIDTH 16 14 | #define BLOCK_SIZE 16 15 | #define N 2048 16 | 17 | __global__ void 18 | matrixMulCUDA(float *C, float *A, float *B, int n) 19 | { 20 | int start_row = (blockDim.y * blockIdx.y + threadIdx.y) * TILE_WIDTH; // each thread owns one TILE_WIDTH x TILE_WIDTH tile of C 21 | int end_row = start_row + TILE_WIDTH; 22 | int start_col = (blockDim.x * blockIdx.x + threadIdx.x) * TILE_WIDTH; 23 | int end_col = start_col + TILE_WIDTH; 24 | for (int row = start_row; row < end_row; row++) 25 | { 26 | for (int col = start_col; col < end_col; col++) 27 | { 28 | float C_val = 0; 29 | for (int k = 0; k < n; ++k) 30 | { 31 | float A_elem = A[row * n + k]; 32 | float B_elem = B[k * n + col]; 33 | C_val += A_elem * B_elem; 34 | } 35 | C[row * n + col] = C_val; 36 | } 37 | } 38 | } 39 | 40 | void constantInit(float *data, int size, float val) 41 | { 42 | for (int i = 0; i < size; ++i) 43 | { 44 | data[i] = val; 45 | } 46 | } 47 | 48 | /** 49 | * Run a simple test of matrix multiplication using CUDA 50 | */ 51 | int matrixMultiply(int argc, char **argv, int n) 52 | { 53 | // Allocate host memory for matrices A and B 54 | unsigned int size_A = n * n; 55 | unsigned int mem_size_A = sizeof(float) * size_A; 56 | float *h_A = (float *)malloc(mem_size_A); 57 | unsigned int size_B = n * n; 58 | unsigned int mem_size_B = sizeof(float) * size_B; 59 | float *h_B = (float *)malloc(mem_size_B); 60 | 61 | // Initialize host memory 62 | const float valB = 0.01f; 63 | constantInit(h_A, size_A, 1.0f); 64 | constantInit(h_B, size_B, valB); 65 | 66 | // Allocate device memory 67 | float *d_A, *d_B, *d_C; 68 | 69 | // Allocate host matrix C 70 | unsigned int mem_size_C = n * n * sizeof(float); 71 | float *h_C = (float *)malloc(mem_size_C); 72 | 73 | if (h_C == NULL) 74 | { 75 | fprintf(stderr, "Failed to allocate host matrix C!\n"); 76 | exit(EXIT_FAILURE); 77 | } 78 | 79 | cudaError_t error; 80 | 81 | error = cudaMalloc((void **)&d_A, mem_size_A); 82 | 83 | if (error != cudaSuccess) 84 | { 85 | printf("cudaMalloc d_A returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 86 | exit(EXIT_FAILURE); 87 | } 88 | 89 | error = cudaMalloc((void **)&d_B, mem_size_B); 90 | 91 | if (error != cudaSuccess) 92 | { 93 | printf("cudaMalloc d_B returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 94 | exit(EXIT_FAILURE); 95 | } 96 | 97 | error = cudaMalloc((void **)&d_C, mem_size_C); 98 | 99 | if (error != cudaSuccess) 100 | { 101 | printf("cudaMalloc d_C returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 102 | exit(EXIT_FAILURE); 103 | } 104 | 105 | // copy host memory to device 106 | error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); 107 | 108 | if (error != cudaSuccess) 109 | { 110 | printf("cudaMemcpy (d_A,h_A) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 111 | exit(EXIT_FAILURE); 112 | } 113 | 114 | error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); 115 | 116 | if (error != cudaSuccess) 117 | { 118 | printf("cudaMemcpy (d_B,h_B) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 119 | exit(EXIT_FAILURE); 120 | } 121 | 122 | // Setup execution parameters 123 | dim3 threads(BLOCK_SIZE, BLOCK_SIZE, 1); 124 | dim3 grid((((n-1) / BLOCK_SIZE + 1) - 1) / TILE_WIDTH + 1 ,(((n-1) / BLOCK_SIZE + 1) - 1) / TILE_WIDTH + 1, 1); // ceil(ceil(n/BLOCK_SIZE)/TILE_WIDTH) blocks per dimension, e.g. 8 for n = 2048 125 | 126 | // Create
and start timer 127 | printf("Computing result using CUDA Kernel...\n"); 128 | 129 | // Allocate CUDA events that we'll use for timing 130 | cudaEvent_t start; 131 | error = cudaEventCreate(&start); 132 | 133 | if (error != cudaSuccess) 134 | { 135 | fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error)); 136 | exit(EXIT_FAILURE); 137 | } 138 | 139 | cudaEvent_t stop; 140 | error = cudaEventCreate(&stop); 141 | 142 | if (error != cudaSuccess) 143 | { 144 | fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error)); 145 | exit(EXIT_FAILURE); 146 | } 147 | 148 | // Record the start event 149 | error = cudaEventRecord(start, NULL); 150 | 151 | if (error != cudaSuccess) 152 | { 153 | fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error)); 154 | exit(EXIT_FAILURE); 155 | } 156 | 157 | // Execute the kernel 158 | matrixMulCUDA<<<grid, threads>>>(d_C, d_A, d_B, n); 159 | 160 | error = cudaGetLastError(); 161 | if (error != cudaSuccess) 162 | { 163 | fprintf(stderr, "Failed to launch kernel! (error: %s)\n", cudaGetErrorString(error)); 164 | exit(EXIT_FAILURE); 165 | } 166 | 167 | // Record the stop event 168 | error = cudaEventRecord(stop, NULL); 169 | 170 | if (error != cudaSuccess) 171 | { 172 | fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error)); 173 | exit(EXIT_FAILURE); 174 | } 175 | 176 | // Wait for the stop event to complete 177 | error = cudaEventSynchronize(stop); 178 | 179 | if (error != cudaSuccess) 180 | { 181 | fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error)); 182 | exit(EXIT_FAILURE); 183 | } 184 | 185 | float msecTotal = 0.0f; 186 | error = cudaEventElapsedTime(&msecTotal, start, stop); 187 | 188 | printf("Elapsed time in msec = %f\n", msecTotal); 189 | 190 | if (error != cudaSuccess) 191 | { 192 | fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error)); 193 | exit(EXIT_FAILURE); 194 | } 195 | 196 | // Copy result from device to host 197 | error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); 198 | 199 | if (error != cudaSuccess) 200 | { 201 | printf("cudaMemcpy (h_C,d_C) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 202 | exit(EXIT_FAILURE); 203 | } 204 | 205 | // Clean up memory 206 | free(h_A); 207 | free(h_B); 208 | free(h_C); 209 | cudaFree(d_A); 210 | cudaFree(d_B); 211 | cudaFree(d_C); 212 | 213 | return EXIT_SUCCESS; 214 | } 215 | 216 | /** 217 | * Program main 218 | */ 219 | int main(int argc, char **argv) 220 | { 221 | printf("[Matrix Multiply Using CUDA] - Starting...\n"); 222 | 223 | // By default, we use device 0 224 | int devID = 0; 225 | cudaSetDevice(devID); 226 | 227 | cudaError_t error; 228 | cudaDeviceProp deviceProp; 229 | error = cudaGetDevice(&devID); 230 | 231 | if (error != cudaSuccess) 232 | { 233 | printf("cudaGetDevice returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 234 | } 235 | 236 | error = cudaGetDeviceProperties(&deviceProp, devID); 237 | 238 | if (deviceProp.computeMode == cudaComputeModeProhibited) 239 | { 240 | fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n"); 241 | exit(EXIT_SUCCESS); 242 | } 243 | 244 | if (error != cudaSuccess) 245 | { 246 | printf("cudaGetDeviceProperties returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 247 | } 248 | else 249 | {
250 | printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); 251 | } 252 | 253 | // Size of square matrices 254 | int n = N; 255 | // printf("[-] N = "); 256 | // scanf("%u", &n); 257 | 258 | printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", n, n, n, n); 259 | 260 | int matrix_result = matrixMultiply(argc, argv, n); 261 | 262 | exit(matrix_result); 263 | } 264 | -------------------------------------------------------------------------------- /LAB6/matmul3v2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlirezaAK2000/Multicore-Programming/9f828cec63c03dd2ccebf6ef489ca270b0aeb3c8/LAB6/matmul3v2 -------------------------------------------------------------------------------- /LAB6/matmul3v2.cu: -------------------------------------------------------------------------------- 1 | // System includes 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | #include <assert.h> 5 | 6 | // CUDA runtime 7 | #include <cuda_runtime.h> 8 | #include <device_launch_parameters.h> 9 | 10 | /** 11 | * Matrix multiplication (CUDA Kernel) on the device: C = A * B 12 | */ 13 | #define TILE_WIDTH 16 14 | #define BLOCK_SIZE 16 15 | #define N 2048 16 | 17 | __global__ void 18 | matrixMulCUDA(float *C, float *A, float *B, int n) 19 | { 20 | 21 | __shared__ float s_a[TILE_WIDTH][TILE_WIDTH]; 22 | __shared__ float s_b[TILE_WIDTH][TILE_WIDTH]; 23 | 24 | int start_row = (blockDim.y * blockIdx.y + threadIdx.y) * TILE_WIDTH; // each thread owns one TILE_WIDTH x TILE_WIDTH tile of C 25 | int end_row = start_row + TILE_WIDTH; 26 | int start_col = (blockDim.x * blockIdx.x + threadIdx.x) * TILE_WIDTH; 27 | int end_col = start_col + TILE_WIDTH; 28 | 29 | int tx = threadIdx.x; 30 | int ty = threadIdx.y; 31 | 32 | for (int row = start_row; row < end_row; row++) 33 | { 34 | for (int col = start_col; col < end_col; col++) 35 | { 36 | float C_val = 0; 37 | for (int i = 0; i < n / (TILE_WIDTH * BLOCK_SIZE); i++) 38 | { 39 | for (int j = 0; j < TILE_WIDTH; j++) 40 | { 41 | s_a[ty][tx] = A[(row * n) + (i * TILE_WIDTH * BLOCK_SIZE) + (j * TILE_WIDTH) + tx]; 42 | s_b[ty][tx] = B[( (i * TILE_WIDTH * BLOCK_SIZE) + (j * TILE_WIDTH) + ty ) * n + col]; 43 | 44 | __syncthreads(); 45 | 46 | for(int p = 0; p < TILE_WIDTH;p++) 47 | { 48 | C_val += s_a[ty][p] * s_b[p][tx]; 49 | } 50 | __syncthreads(); 51 | } 52 | } 53 | C[row * n + col] = C_val; 54 | } 55 | } 56 | } 57 | 58 | void constantInit(float *data, int size, float val) 59 | { 60 | for (int i = 0; i < size; ++i) 61 | { 62 | data[i] = val; 63 | } 64 | } 65 | 66 | /** 67 | * Run a simple test of matrix multiplication using CUDA 68 | */ 69 | int matrixMultiply(int argc, char **argv, int n) 70 | { 71 | // Allocate host memory for matrices A and B 72 | unsigned int size_A = n * n; 73 | unsigned int mem_size_A = sizeof(float) * size_A; 74 | float *h_A = (float *)malloc(mem_size_A); 75 | unsigned int size_B = n * n; 76 | unsigned int mem_size_B = sizeof(float) * size_B; 77 | float *h_B = (float *)malloc(mem_size_B); 78 | 79 | // Initialize host memory 80 | const float valB = 0.01f; 81 | constantInit(h_A, size_A, 1.0f); 82 | constantInit(h_B, size_B, valB); 83 | 84 | // Allocate device memory 85 | float *d_A, *d_B, *d_C; 86 | 87 | // Allocate host matrix C 88 | unsigned int mem_size_C = n * n * sizeof(float); 89 | float *h_C = (float *)malloc(mem_size_C); 90 | 91 | if (h_C == NULL) 92 | { 93 | fprintf(stderr, "Failed to allocate host matrix C!\n"); 94 | exit(EXIT_FAILURE); 95 | } 96 | 97 | cudaError_t error; 98 | 99 | error = cudaMalloc((void **)&d_A, mem_size_A); 100 | 101 | if (error != cudaSuccess)
102 | { 103 | printf("cudaMalloc d_A returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 104 | exit(EXIT_FAILURE); 105 | } 106 | 107 | error = cudaMalloc((void **)&d_B, mem_size_B); 108 | 109 | if (error != cudaSuccess) 110 | { 111 | printf("cudaMalloc d_B returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 112 | exit(EXIT_FAILURE); 113 | } 114 | 115 | error = cudaMalloc((void **)&d_C, mem_size_C); 116 | 117 | if (error != cudaSuccess) 118 | { 119 | printf("cudaMalloc d_C returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 120 | exit(EXIT_FAILURE); 121 | } 122 | 123 | // copy host memory to device 124 | error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); 125 | 126 | if (error != cudaSuccess) 127 | { 128 | printf("cudaMemcpy (d_A,h_A) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 129 | exit(EXIT_FAILURE); 130 | } 131 | 132 | error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); 133 | 134 | if (error != cudaSuccess) 135 | { 136 | printf("cudaMemcpy (d_B,h_B) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 137 | exit(EXIT_FAILURE); 138 | } 139 | 140 | // Setup execution parameters 141 | dim3 threads(BLOCK_SIZE, BLOCK_SIZE, 1); 142 | dim3 grid((((n - 1) / BLOCK_SIZE + 1) - 1) / TILE_WIDTH + 1, (((n - 1) / BLOCK_SIZE + 1) - 1) / TILE_WIDTH + 1, 1); // ceil(ceil(n/BLOCK_SIZE)/TILE_WIDTH) blocks per dimension 143 | 144 | // Create and start timer 145 | printf("Computing result using CUDA Kernel...\n"); 146 | 147 | // Allocate CUDA events that we'll use for timing 148 | cudaEvent_t start; 149 | error = cudaEventCreate(&start); 150 | 151 | if (error != cudaSuccess) 152 | { 153 | fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error)); 154 | exit(EXIT_FAILURE); 155 | } 156 | 157 | cudaEvent_t stop; 158 | error = cudaEventCreate(&stop); 159 | 160 | if (error != cudaSuccess) 161 | { 162 | fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error)); 163 | exit(EXIT_FAILURE); 164 | } 165 | 166 | // Record the start event 167 | error = cudaEventRecord(start, NULL); 168 | 169 | if (error != cudaSuccess) 170 | { 171 | fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error)); 172 | exit(EXIT_FAILURE); 173 | } 174 | 175 | // Execute the kernel 176 | matrixMulCUDA<<<grid, threads>>>(d_C, d_A, d_B, n); 177 | 178 | error = cudaGetLastError(); 179 | if (error != cudaSuccess) 180 | { 181 | fprintf(stderr, "Failed to launch kernel! (error: %s)\n", cudaGetErrorString(error)); 182 | exit(EXIT_FAILURE); 183 | } 184 | 185 | // Record the stop event 186 | error = cudaEventRecord(stop, NULL); 187 | 188 | if (error != cudaSuccess) 189 | { 190 | fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error)); 191 | exit(EXIT_FAILURE); 192 | } 193 | 194 | // Wait for the stop event to complete 195 | error = cudaEventSynchronize(stop); 196 | 197 | if (error != cudaSuccess) 198 | { 199 | fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error)); 200 | exit(EXIT_FAILURE); 201 | } 202 | 203 | float msecTotal = 0.0f; 204 | error = cudaEventElapsedTime(&msecTotal, start, stop); 205 | 206 | printf("Elapsed time in msec = %f\n", msecTotal); 207 | 208 | if (error != cudaSuccess) 209 | { 210 | fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error)); 211 | exit(EXIT_FAILURE); 212 |
} 213 | 214 | // Copy result from device to host 215 | error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); 216 | 217 | if (error != cudaSuccess) 218 | { 219 | printf("cudaMemcpy (h_C,d_C) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 220 | exit(EXIT_FAILURE); 221 | } 222 | 223 | // Clean up memory 224 | free(h_A); 225 | free(h_B); 226 | free(h_C); 227 | cudaFree(d_A); 228 | cudaFree(d_B); 229 | cudaFree(d_C); 230 | 231 | return EXIT_SUCCESS; 232 | } 233 | 234 | /** 235 | * Program main 236 | */ 237 | int main(int argc, char **argv) 238 | { 239 | printf("[Matrix Multiply Using CUDA] - Starting...\n"); 240 | 241 | // By default, we use device 0 242 | int devID = 0; 243 | cudaSetDevice(devID); 244 | 245 | cudaError_t error; 246 | cudaDeviceProp deviceProp; 247 | error = cudaGetDevice(&devID); 248 | 249 | if (error != cudaSuccess) 250 | { 251 | printf("cudaGetDevice returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 252 | } 253 | 254 | error = cudaGetDeviceProperties(&deviceProp, devID); 255 | 256 | if (deviceProp.computeMode == cudaComputeModeProhibited) 257 | { 258 | fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n"); 259 | exit(EXIT_SUCCESS); 260 | } 261 | 262 | if (error != cudaSuccess) 263 | { 264 | printf("cudaGetDeviceProperties returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); 265 | } 266 | else 267 | { 268 | printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); 269 | } 270 | 271 | // Size of square matrices 272 | int n = N; 273 | // printf("[-] N = "); 274 | // scanf("%u", &n); 275 | 276 | printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", n, n, n, n); 277 | 278 | int matrix_result = matrixMultiply(argc, argv, n); 279 | 280 | exit(matrix_result); 281 | } 282 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multicore-Programming 2 | Multicore programming course materials (Spring 2021)
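3 | 4 | ## Building the examples (untested sketch) 5 | 6 | The OpenMP sources should build with something like `gcc -O2 -fopenmp LAB4/hist.c -o hist` (use `g++ -fopenmp` for the `.cpp` files; `LAB3/LAB3.cpp` targets MSVC/Windows because of `timeGetTime()` and `winmm.lib`), and the CUDA sources with `nvcc LAB6/matmul.cu -o matmul`. Exact flags and toolchain versions are guesses here, not taken from the course handouts. --------------------------------------------------------------------------------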