├── README.md
├── Assignment 1_ Parallel DFS.cpp
├── Assignment 1_ Parallel BFS.cpp
├── Assignment 4_ Addition of two large vectors.cpp
├── Assignment 4_ Matrix Multiplication using CUDA C.cpp
├── Assignment 3_ Implement Min, Max, Sum and Average operations using Parallel Reduction.cpp
└── Assignment 2_ Merge and Bubble sort.cpp

/README.md:
--------------------------------------------------------------------------------
# HPC-Practical
Practical assignments of High Performance Computing.

--------------------------------------------------------------------------------
/Assignment 1_ Parallel DFS.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <vector>
#include <omp.h>

using namespace std;

struct Node {
    int data;
    vector<Node*> neighbors;
};

// Recursively visit `node`, exploring its unvisited neighbors in parallel.
// The caller marks the start vertex as visited before the first call.
void parallel_DFS(Node* node, vector<bool>& visited) {
    cout << node->data << " ";

    // Parallel exploration of unvisited neighbors. The critical section makes
    // the check-and-mark of `visited` atomic, so two threads cannot claim the
    // same neighbor (vector<bool> is also not safe for concurrent element writes).
    #pragma omp parallel for
    for (int i = 0; i < (int)node->neighbors.size(); ++i) {
        Node* neighbor = node->neighbors[i];
        bool claimed = false;
        #pragma omp critical
        {
            if (!visited[neighbor->data]) {
                visited[neighbor->data] = true;
                claimed = true;
            }
        }
        if (claimed) {
            parallel_DFS(neighbor, visited);
        }
    }
}

int main() {
    // Create a sample undirected graph (adjacency list)
    vector<Node> graph(5);
    for (int i = 0; i < 5; ++i) {
        graph[i].data = i;   // label each vertex with its index
    }

    graph[0].neighbors = {&graph[1], &graph[2]};
    graph[1].neighbors = {&graph[0], &graph[3]};
    graph[2].neighbors = {&graph[0], &graph[4]};
    graph[3].neighbors = {&graph[1]};
    graph[4].neighbors = {&graph[2]};

    vector<bool> visited(graph.size(), false);
    visited[0] = true;   // claim the start vertex before the traversal
    parallel_DFS(&graph[0], visited);
    cout << endl;

    return 0;
}

--------------------------------------------------------------------------------
/Assignment 1_ Parallel BFS.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <vector>
#include <queue>
#include <omp.h>

using namespace std;

struct Node {
    int data;
    vector<Node*> children;
};

// Level-synchronous parallel BFS: each level is pulled out of the queue and
// visited by an OpenMP team, then its children are enqueued for the next level.
void parallel_BFS(Node* root) {
    queue<Node*> q;
    q.push(root);

    while (!q.empty()) {
        // Number of nodes in the current level
        int level_size = q.size();

        // Extract the whole level so it can be visited in parallel
        // without concurrent access to the queue.
        vector<Node*> level(level_size);
        for (int i = 0; i < level_size; ++i) {
            level[i] = q.front();
            q.pop();
        }

        // Visit the nodes of the current level in parallel
        #pragma omp parallel for
        for (int i = 0; i < level_size; ++i) {
            #pragma omp critical
            cout << level[i]->data << " ";
        }

        // Enqueue the children of this level (sequentially, to keep the queue consistent)
        for (Node* node : level) {
            for (Node* child : node->children) {
                q.push(child);
            }
        }
    }
}

int main() {
    // Create a sample tree using initializer lists
    Node* root = new Node{1, {new Node{2}, new Node{3, {new Node{4}}}}};

    parallel_BFS(root);
    cout << endl;

    return 0;
}
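
// Build/run sketch for the Assignment 1 programs (assumed g++ toolchain; the
// assignment does not prescribe one, so treat these commands as illustrative):
//   g++ -fopenmp -O2 "Assignment 1_ Parallel DFS.cpp" -o parallel_dfs
//   g++ -fopenmp -O2 "Assignment 1_ Parallel BFS.cpp" -o parallel_bfs
//   OMP_NUM_THREADS=4 ./parallel_bfs
// For the sample tree above this prints level 1, then 2 3, then 4; the order
// of nodes inside a level may vary between runs because a level is visited by
// several threads.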

--------------------------------------------------------------------------------
/Assignment 4_ Addition of two large vectors.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <cuda_runtime.h>

using namespace std;

__global__ void addVectors(int* A, int* B, int* C, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        C[i] = A[i] + B[i];
    }
}

int main() {
    int n = 1000000;
    int *A, *B, *h_C;                 // host pointers; h_C holds the result on the host
    size_t size = n * sizeof(int);

    // Allocate memory on the host (pinned for A and B, pageable for h_C)
    cudaMallocHost(&A, size);
    cudaMallocHost(&B, size);
    h_C = (int*)malloc(size);

    // Initialize the vectors
    for (int i = 0; i < n; i++) {
        A[i] = i;
        B[i] = i * 2;
    }

    // Allocate memory on the device
    int *dev_A, *dev_B, *dev_C;
    cudaMalloc(&dev_A, size);
    cudaMalloc(&dev_B, size);
    cudaMalloc(&dev_C, size);

    // Copy data from host to device
    cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice);

    // Launch the kernel: one thread per element
    int blockSize = 256;
    int numBlocks = (n + blockSize - 1) / blockSize;
    addVectors<<<numBlocks, blockSize>>>(dev_A, dev_B, dev_C, n);

    // Copy the result from device to host
    cudaMemcpy(h_C, dev_C, size, cudaMemcpyDeviceToHost);

    // Print the results (limited to the first 10 elements)
    for (int i = 0; i < 10; i++) {
        cout << h_C[i] << " ";
    }
    cout << endl;

    // Free memory
    cudaFree(dev_A);
    cudaFree(dev_B);
    cudaFree(dev_C);
    cudaFreeHost(A);
    cudaFreeHost(B);
    free(h_C);   // host memory allocated with malloc

    return 0;
}

--------------------------------------------------------------------------------
/Assignment 4_ Matrix Multiplication using CUDA C.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <cuda_runtime.h>

__global__ void matmul(int *A, int *B, int *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Check for valid matrix indices within bounds
    if (row < N && col < N) {
        int sum = 0;
        for (int k = 0; k < N; ++k) {
            // Row-major indexing into global memory
            sum += A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}

int main() {
    int N = 512;
    size_t size = N * N * sizeof(int);

    // Allocate pinned memory on the host
    int *h_A, *h_B, *h_C;
    cudaMallocHost(&h_A, size);
    cudaMallocHost(&h_B, size);
    cudaMallocHost(&h_C, size);

    // Allocate memory on the device
    int *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    // Initialize matrices A and B (example values)
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            h_A[i * N + j] = i * N + j;
            h_B[i * N + j] = j * N + i;
        }
    }

    // Copy data from host to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Launch the kernel: a 2D grid of 16x16 blocks, one thread per output element
    int threadsPerBlock = 16;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    dim3 dimBlock(threadsPerBlock, threadsPerBlock);
    dim3 dimGrid(blocksPerGrid, blocksPerGrid);
    matmul<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);

    // Copy the result from device to host
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Print the result (limited to a 10x10 sub-matrix for large N)
    for (int i = 0; i < 10; ++i) {
        for (int j = 0; j < 10; ++j) {
            std::cout << h_C[i * N + j] << " ";
        }
        std::cout << std::endl;
    }

    // Free memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFreeHost(h_A);
    cudaFreeHost(h_B);
    cudaFreeHost(h_C);

    return 0;
}
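
// Build sketch for the two CUDA programs (assumed nvcc toolchain; because the
// files use the .cpp extension, the language is forced with -x cu):
//   nvcc -x cu "Assignment 4_ Addition of two large vectors.cpp" -o vector_add
//   nvcc -x cu "Assignment 4_ Matrix Multiplication using CUDA C.cpp" -o matmul
// Both programs assume a CUDA-capable GPU; error checking on the CUDA API
// calls is omitted for brevity.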

--------------------------------------------------------------------------------
/Assignment 3_ Implement Min, Max, Sum and Average operations using Parallel Reduction.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <climits>
#include <omp.h>

using namespace std;

// Find the minimum value in an array using parallel reduction
void min_reduction(int arr[], int n) {
    int min_value = INT_MAX;   // start from the largest possible int

    // OpenMP parallel for loop with a min reduction
    #pragma omp parallel for reduction(min: min_value)
    for (int i = 0; i < n; i++) {
        if (arr[i] < min_value) {
            min_value = arr[i];   // update if a smaller element is found
        }
    }

    cout << "Minimum value: " << min_value << endl;
}

// Find the maximum value in an array using parallel reduction
void max_reduction(int arr[], int n) {
    int max_value = INT_MIN;   // start from the smallest possible int

    // OpenMP parallel for loop with a max reduction
    #pragma omp parallel for reduction(max: max_value)
    for (int i = 0; i < n; i++) {
        if (arr[i] > max_value) {
            max_value = arr[i];   // update if a larger element is found
        }
    }

    cout << "Maximum value: " << max_value << endl;
}

// Calculate the sum of the elements in an array using parallel reduction
void sum_reduction(int arr[], int n) {
    int sum = 0;

    // OpenMP parallel for loop with a + reduction
    #pragma omp parallel for reduction(+: sum)
    for (int i = 0; i < n; i++) {
        sum += arr[i];   // add each element to the sum
    }

    cout << "Sum: " << sum << endl;
}

// Calculate the average of the elements in an array using parallel reduction
void average_reduction(int arr[], int n) {
    int sum = 0;

    // OpenMP parallel for loop with a + reduction
    #pragma omp parallel for reduction(+: sum)
    for (int i = 0; i < n; i++) {
        sum += arr[i];   // add each element to the sum
    }

    // The average is the reduced sum divided by the number of elements
    double average = (double)sum / n;
    cout << "Average: " << average << endl;
}

int main() {
    int n;
    cout << "\nEnter the total number of elements: ";
    cin >> n;

    int *arr = new int[n];   // allocate memory for the array

    cout << "\nEnter the elements: ";
    for (int i = 0; i < n; i++) {
        cin >> arr[i];
    }

    min_reduction(arr, n);
    max_reduction(arr, n);
    sum_reduction(arr, n);
    average_reduction(arr, n);

    delete[] arr;   // deallocate memory after use
    return 0;
}
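
// Build/run sketch (assumed g++ toolchain):
//   g++ -fopenmp -O2 "Assignment 3_ Implement Min, Max, Sum and Average operations using Parallel Reduction.cpp" -o reduction
// Illustrative session: for 5 elements 4 1 7 3 5 the program prints
//   Minimum value: 1
//   Maximum value: 7
//   Sum: 20
//   Average: 4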

--------------------------------------------------------------------------------
/Assignment 2_ Merge and Bubble sort.cpp:
--------------------------------------------------------------------------------
// Write a program to implement Parallel Bubble Sort and Merge Sort using OpenMP.
// Use existing algorithms and measure the performance of the sequential and
// parallel versions.

#include <iostream>
#include <vector>
#include <chrono>
#include <omp.h>

using namespace std;

// Swap two elements
void swap(int& a, int& b) {
    int temp = a;
    a = b;
    b = temp;
}

// Sequential Bubble Sort
void bubbleSortSequential(vector<int>& arr) {
    int n = arr.size();
    for (int i = 0; i < n - 1; ++i) {
        bool swapped = false;
        for (int j = 0; j < n - i - 1; ++j) {
            if (arr[j] > arr[j + 1]) {
                swap(arr[j], arr[j + 1]);
                swapped = true;
            }
        }
        if (!swapped) {
            break;   // early termination: no swaps means the array is sorted
        }
    }
}

// Merge step for Merge Sort
void merge(vector<int>& arr, int left, int mid, int right) {
    int n1 = mid - left + 1;
    int n2 = right - mid;

    vector<int> leftArr(n1);
    vector<int> rightArr(n2);

    for (int i = 0; i < n1; ++i) {
        leftArr[i] = arr[left + i];
    }
    for (int j = 0; j < n2; ++j) {
        rightArr[j] = arr[mid + 1 + j];
    }

    int i = 0, j = 0, k = left;
    while (i < n1 && j < n2) {
        if (leftArr[i] <= rightArr[j]) {
            arr[k] = leftArr[i];
            i++;
        } else {
            arr[k] = rightArr[j];
            j++;
        }
        k++;
    }

    while (i < n1) {
        arr[k] = leftArr[i];
        i++;
        k++;
    }

    while (j < n2) {
        arr[k] = rightArr[j];
        j++;
        k++;
    }
}

// Recursive (sequential) Merge Sort
void mergeSortRecursive(vector<int>& arr, int left, int right) {
    if (left < right) {
        int mid = left + (right - left) / 2;
        mergeSortRecursive(arr, left, mid);
        mergeSortRecursive(arr, mid + 1, right);
        merge(arr, left, mid, right);
    }
}

// Parallel Merge Sort: the two halves are sorted by independent OpenMP tasks;
// small sub-ranges fall back to the sequential version.
void mergeSortParallel(vector<int>& arr, int left, int right) {
    if (right - left <= 100) {   // threshold for sequential execution (tune as needed)
        mergeSortRecursive(arr, left, right);
    } else {
        int mid = left + (right - left) / 2;
        #pragma omp task shared(arr)
        mergeSortParallel(arr, left, mid);
        #pragma omp task shared(arr)
        mergeSortParallel(arr, mid + 1, right);
        #pragma omp taskwait   // both halves must be sorted before merging
        merge(arr, left, mid, right);
    }
}

// Parallel Bubble Sort, implemented as odd-even transposition sort:
// within one phase the compared pairs are disjoint, so the inner loop can be
// parallelized without data races.
void bubbleSortParallel(vector<int>& arr) {
    int n = arr.size();
    for (int phase = 0; phase < n; ++phase) {
        int start = phase % 2;   // even phases compare (0,1),(2,3),...; odd phases (1,2),(3,4),...
        #pragma omp parallel for
        for (int j = start; j < n - 1; j += 2) {
            if (arr[j] > arr[j + 1]) {
                swap(arr[j], arr[j + 1]);
            }
        }
        // n phases are always enough; a swap-free phase alone does not
        // guarantee the array is sorted, so no early termination here.
    }
}
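
// Note: mergeSortParallel spawns OpenMP tasks, so it only runs in parallel
// when called from inside a parallel region with a single construct, e.g.
//   #pragma omp parallel
//   #pragma omp single
//   mergeSortParallel(arr, 0, N - 1);
// main() below launches it this way.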

int main() {
    int N;

    // Get the vector size from the user
    cout << "Enter the size of the vector: ";
    cin >> N;

    vector<int> arr(N);

    // Get the elements from the user
    cout << "Enter the elements of the vector (space-separated): ";
    for (int i = 0; i < N; ++i) {
        cin >> arr[i];
    }

    // Keep a copy of the input so every sort starts from the same data
    vector<int> original = arr;

    // Measure performance of sequential bubble sort
    auto start = chrono::high_resolution_clock::now();
    bubbleSortSequential(arr);
    auto end = chrono::high_resolution_clock::now();
    double sequentialBubbleSortTime =
        chrono::duration_cast<chrono::nanoseconds>(end - start).count() / 1e6;
    cout << "Sequential Bubble Sort Time (ms): " << sequentialBubbleSortTime << endl;

    // Optionally print the sorted vector after each sort:
    // for (int i = 0; i < N; ++i) cout << arr[i] << " ";
    // cout << endl;

    // Measure performance of parallel bubble sort
    arr = original;
    start = chrono::high_resolution_clock::now();
    bubbleSortParallel(arr);
    end = chrono::high_resolution_clock::now();
    double parallelBubbleSortTime =
        chrono::duration_cast<chrono::nanoseconds>(end - start).count() / 1e6;
    cout << "Parallel Bubble Sort Time (ms): " << parallelBubbleSortTime << endl;

    // Measure performance of sequential merge sort
    arr = original;
    start = chrono::high_resolution_clock::now();
    mergeSortRecursive(arr, 0, N - 1);
    end = chrono::high_resolution_clock::now();
    double sequentialMergeSortTime =
        chrono::duration_cast<chrono::nanoseconds>(end - start).count() / 1e6;
    cout << "Sequential Merge Sort Time (ms): " << sequentialMergeSortTime << endl;

    // Measure performance of parallel merge sort
    // (the tasks need an enclosing parallel region with a single construct)
    arr = original;
    start = chrono::high_resolution_clock::now();
    #pragma omp parallel
    #pragma omp single
    mergeSortParallel(arr, 0, N - 1);
    end = chrono::high_resolution_clock::now();
    double parallelMergeSortTime =
        chrono::duration_cast<chrono::nanoseconds>(end - start).count() / 1e6;
    cout << "Parallel Merge Sort Time (ms): " << parallelMergeSortTime << endl;

    return 0;
}

--------------------------------------------------------------------------------