├── README.md
├── Assignment 1_ Parallel DFS.cpp
├── Assignment 1_ Parallel BFS.cpp
├── Assignment 4_ Addition of two large vectors.cpp
├── Assignment 4_ Matrix Multiplication using CUDA C.cpp
├── Assignment 3_ Implement Min, Max, Sum and Average operations using Parallel Reduction.cpp
└── Assignment 2_ Merge and Bubble sort.cpp
/README.md:
--------------------------------------------------------------------------------
# HPC-Practical
Practical assignments for High Performance Computing.
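
Build notes (indicative, adjust for your setup): the OpenMP assignments should compile with something like `g++ -fopenmp -O2 "Assignment 1_ Parallel BFS.cpp" -o bfs`, and the CUDA assignments with `nvcc -x cu "Assignment 4_ Addition of two large vectors.cpp" -o vecadd` (`-x cu` because the sources use a `.cpp` extension).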
--------------------------------------------------------------------------------
/Assignment 1_ Parallel DFS.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <vector>
#include <omp.h>

using namespace std;

struct Node {
    int data;
    vector<Node*> neighbors;
};

void parallel_DFS(Node* node, vector<bool>& visited) {
    bool first_visit = false;

    // Claim the node inside a critical section so two tasks cannot visit it twice
    #pragma omp critical
    {
        if (!visited[node->data]) {
            visited[node->data] = true;
            first_visit = true;
            cout << node->data << " ";
        }
    }
    if (!first_visit) return;

    // Parallel exploration of unvisited neighbors: each neighbor becomes an OpenMP task
    for (Node* neighbor : node->neighbors) {
        #pragma omp task shared(visited)
        parallel_DFS(neighbor, visited);
    }
    #pragma omp taskwait
}

int main() {
    // Create a sample undirected graph (adjacency list)
    vector<Node> graph(5);
    for (int i = 0; i < 5; ++i) {
        graph[i].data = i;
    }

    graph[0].neighbors = {&graph[1], &graph[2]};
    graph[1].neighbors = {&graph[0], &graph[3]};
    graph[2].neighbors = {&graph[0], &graph[4]};
    graph[3].neighbors = {&graph[1]};
    graph[4].neighbors = {&graph[2]};

    vector<bool> visited(graph.size(), false);

    // The traversal starts from a single task inside one parallel region
    #pragma omp parallel
    #pragma omp single
    parallel_DFS(&graph[0], visited);

    cout << endl;
    return 0;
}
--------------------------------------------------------------------------------
/Assignment 1_ Parallel BFS.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <vector>
#include <omp.h>

using namespace std;

struct Node {
    int data;
    vector<Node*> children;
};

void parallel_BFS(Node* root) {
    // Level-synchronous BFS: the nodes of one level are processed in parallel,
    // and the next level is collected before moving on.
    vector<Node*> current_level;
    current_level.push_back(root);

    while (!current_level.empty()) {
        vector<Node*> next_level;
        int level_size = current_level.size();

        // Parallel processing of the nodes in the current level
        #pragma omp parallel for
        for (int i = 0; i < level_size; ++i) {
            Node* current = current_level[i];

            // Visit current node (critical section keeps the output readable)
            #pragma omp critical(output)
            cout << current->data << " ";

            // Collect the children of this node for the next level
            #pragma omp critical(collect)
            for (Node* child : current->children) {
                next_level.push_back(child);
            }
        }

        current_level = next_level;
    }
}

int main() {
    // Create a sample tree using an initializer list
    Node* root = new Node{1, {new Node{2}, new Node{3, {new Node{4}}}}};

    parallel_BFS(root);
    cout << endl;

    return 0;
}
--------------------------------------------------------------------------------
/Assignment 4_ Addition of two large vectors.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <cuda_runtime.h>

using namespace std;

__global__ void addVectors(int* A, int* B, int* C, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        C[i] = A[i] + B[i];
    }
}

int main() {
    int n = 1000000;
    int *A, *B, *h_C; // The result vector stays in ordinary host memory
    int size = n * sizeof(int);

    // Allocate pinned memory on the host for the input vectors
    cudaMallocHost(&A, size);
    cudaMallocHost(&B, size);
    h_C = (int*)malloc(size);

    // Initialize the vectors
    for (int i = 0; i < n; i++) {
        A[i] = i;
        B[i] = i * 2;
    }

    // Allocate memory on the device
    int *dev_A, *dev_B, *dev_C;
    cudaMalloc(&dev_A, size);
    cudaMalloc(&dev_B, size);
    cudaMalloc(&dev_C, size);

    // Copy data from host to device
    cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice);

    // Launch the kernel
    int blockSize = 256;
    int numBlocks = (n + blockSize - 1) / blockSize;
    addVectors<<<numBlocks, blockSize>>>(dev_A, dev_B, dev_C, n);

    // Copy data from device to host (this also waits for the kernel to finish)
    cudaMemcpy(h_C, dev_C, size, cudaMemcpyDeviceToHost);

    // Print the results (limited to first 10 elements)
    for (int i = 0; i < 10; i++) {
        cout << h_C[i] << " ";
    }
    cout << endl;

    // Free memory
    cudaFree(dev_A);
    cudaFree(dev_B);
    cudaFree(dev_C);
    cudaFreeHost(A);
    cudaFreeHost(B);
    free(h_C); // Host memory allocated with malloc is released with free

    return 0;
}
--------------------------------------------------------------------------------
/Assignment 4_ Matrix Multiplication using CUDA C.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <cuda_runtime.h>

__global__ void matmul(int *A, int *B, int *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Check for valid matrix indices within bounds
    if (row < N && col < N) {
        int sum = 0;
        for (int k = 0; k < N; ++k) {
            // Access elements using global memory indexing
            sum += A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}

int main() {
    int N = 512;
    int size = N * N * sizeof(int);

    // Allocate memory on host
    int *h_A, *h_B, *h_C;
    cudaMallocHost(&h_A, size);
    cudaMallocHost(&h_B, size);
    cudaMallocHost(&h_C, size);

    // Allocate memory on device
    int *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    // Initialize matrices A and B (example)
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            h_A[i * N + j] = i * N + j;
            h_B[i * N + j] = j * N + i;
        }
    }

    // Copy data from host to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Launch the kernel
    int threadsPerBlock = 16;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    dim3 dimBlock(threadsPerBlock, threadsPerBlock);
    dim3 dimGrid(blocksPerGrid, blocksPerGrid);
    matmul<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);

    // Copy data from device to host
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Print the result (limited to a sub-matrix for large N)
    for (int i = 0; i < 10; ++i) {
        for (int j = 0; j < 10; ++j) {
            std::cout << h_C[i * N + j] << " ";
        }
        std::cout << std::endl;
    }

    // Free memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFreeHost(h_A);
    cudaFreeHost(h_B);
    cudaFreeHost(h_C);

    return 0;
}
--------------------------------------------------------------------------------
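Note on the two CUDA assignments above: the runtime calls are used unchecked. Below is a minimal sketch of how their return codes could be inspected, using only standard CUDA runtime functions (cudaGetErrorString, cudaGetLastError, cudaDeviceSynchronize); the CUDA_CHECK macro name is illustrative.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Print the error and abort if a CUDA runtime call does not return cudaSuccess
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err = (call);                                           \
        if (err != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",                    \
                    cudaGetErrorString(err), __FILE__, __LINE__);           \
            exit(1);                                                        \
        }                                                                   \
    } while (0)

// Usage sketch around the calls that appear in the assignments:
//   CUDA_CHECK(cudaMalloc(&dev_A, size));
//   addVectors<<<numBlocks, blockSize>>>(dev_A, dev_B, dev_C, n);
//   CUDA_CHECK(cudaGetLastError());      // catches launch-configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize()); // catches errors raised inside the kernel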
/Assignment 3_ Implement Min, Max, Sum and Average operations using Parallel Reduction.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <climits>
#include <omp.h>

using namespace std;

// Function to find the minimum value in an array using parallel reduction
void min_reduction(int arr[], int n) {
    int min_value = INT_MAX; // Initialize min_value to the largest representable int

    // Use OpenMP parallel for loop with reduction clause (min)
    #pragma omp parallel for reduction(min: min_value)
    for (int i = 0; i < n; i++) {
        if (arr[i] < min_value) {
            min_value = arr[i]; // Update min_value if a smaller element is found
        }
    }

    cout << "Minimum value: " << min_value << endl;
}

// Function to find the maximum value in an array using parallel reduction
void max_reduction(int arr[], int n) {
    int max_value = INT_MIN; // Initialize max_value to the smallest representable int

    // Use OpenMP parallel for loop with reduction clause (max)
    #pragma omp parallel for reduction(max: max_value)
    for (int i = 0; i < n; i++) {
        if (arr[i] > max_value) {
            max_value = arr[i]; // Update max_value if a larger element is found
        }
    }

    cout << "Maximum value: " << max_value << endl;
}

// Function to calculate the sum of elements in an array using parallel reduction
void sum_reduction(int arr[], int n) {
    int sum = 0;

    // Use OpenMP parallel for loop with reduction clause (+)
    #pragma omp parallel for reduction(+: sum)
    for (int i = 0; i < n; i++) {
        sum += arr[i]; // Add each element to the sum
    }

    cout << "Sum: " << sum << endl;
}

// Function to calculate the average of elements in an array using parallel reduction
void average_reduction(int arr[], int n) {
    int sum = 0;

    // Use OpenMP parallel for loop with reduction clause (+)
    #pragma omp parallel for reduction(+: sum)
    for (int i = 0; i < n; i++) {
        sum += arr[i]; // Add each element to the sum
    }

    // The average of n elements is the reduced sum divided by n
    double average = (double)sum / n;
    cout << "Average: " << average << endl;
}

int main() {
    int n;
    cout << "\nEnter the total number of elements: ";
    cin >> n;

    int *arr = new int[n]; // Allocate memory for the array

    cout << "\nEnter the elements: ";
    for (int i = 0; i < n; i++) {
        cin >> arr[i];
    }

    min_reduction(arr, n);
    max_reduction(arr, n);
    sum_reduction(arr, n);
    average_reduction(arr, n);

    delete[] arr; // Deallocate memory after use
    return 0;
}
--------------------------------------------------------------------------------
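Side note on the reduction assignment above: OpenMP permits several reduction clauses on a single loop, so the same statistics can also be gathered in one pass. A brief sketch under that assumption, reusing the arr/n convention of the file; the all_reductions helper name is illustrative.

#include <iostream>
#include <climits>
#include <omp.h>

using namespace std;

// Single-pass variant: one parallel loop carrying min, max and sum reductions
void all_reductions(const int arr[], int n) {
    int min_value = INT_MAX;
    int max_value = INT_MIN;
    long long sum = 0;

    #pragma omp parallel for reduction(min: min_value) reduction(max: max_value) reduction(+: sum)
    for (int i = 0; i < n; i++) {
        if (arr[i] < min_value) min_value = arr[i];
        if (arr[i] > max_value) max_value = arr[i];
        sum += arr[i];
    }

    cout << "Minimum value: " << min_value << "\nMaximum value: " << max_value
         << "\nSum: " << sum << "\nAverage: " << (double)sum / n << endl;
}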
/Assignment 2_ Merge and Bubble sort.cpp:
--------------------------------------------------------------------------------
// Write a program to implement Parallel Bubble Sort and Merge Sort using OpenMP.
// Use existing algorithms and measure the performance of the sequential and parallel versions.

#include <iostream>
#include <vector>
#include <chrono>
#include <omp.h>

using namespace std;

// Function to swap two elements
void swap(int& a, int& b) {
    int temp = a;
    a = b;
    b = temp;
}

// Sequential Bubble Sort
void bubbleSortSequential(vector<int>& arr) {
    int n = arr.size();
    for (int i = 0; i < n - 1; ++i) {
        bool swapped = false;
        for (int j = 0; j < n - i - 1; ++j) {
            if (arr[j] > arr[j + 1]) {
                swap(arr[j], arr[j + 1]);
                swapped = true;
            }
        }
        if (!swapped) {
            break; // Early termination if no swaps occurred
        }
    }
}

// Merge function for Merge Sort
void merge(vector<int>& arr, int left, int mid, int right) {
    int n1 = mid - left + 1;
    int n2 = right - mid;

    vector<int> leftArr(n1);
    vector<int> rightArr(n2);

    for (int i = 0; i < n1; ++i) {
        leftArr[i] = arr[left + i];
    }
    for (int j = 0; j < n2; ++j) {
        rightArr[j] = arr[mid + 1 + j];
    }

    int i = 0, j = 0, k = left;
    while (i < n1 && j < n2) {
        if (leftArr[i] <= rightArr[j]) {
            arr[k] = leftArr[i];
            i++;
        } else {
            arr[k] = rightArr[j];
            j++;
        }
        k++;
    }

    while (i < n1) {
        arr[k] = leftArr[i];
        i++;
        k++;
    }

    while (j < n2) {
        arr[k] = rightArr[j];
        j++;
        k++;
    }
}

// Recursive Merge Sort
void mergeSortRecursive(vector<int>& arr, int left, int right) {
    if (left < right) {
        int mid = left + (right - left) / 2;
        mergeSortRecursive(arr, left, mid);
        mergeSortRecursive(arr, mid + 1, right);
        merge(arr, left, mid, right);
    }
}

// Parallel Merge Sort (falls back to mergeSortRecursive for small ranges)
void mergeSortParallel(vector<int>& arr, int left, int right) {
    if (right - left <= 100) { // Threshold for sequential execution (adjust as needed)
        mergeSortRecursive(arr, left, right);
    } else {
        int mid = left + (right - left) / 2;
        // Sort the two halves as independent tasks that share the same vector
        #pragma omp task shared(arr)
        mergeSortParallel(arr, left, mid);
        #pragma omp task shared(arr)
        mergeSortParallel(arr, mid + 1, right);
        #pragma omp taskwait
        merge(arr, left, mid, right);
    }
}

// Parallel Bubble Sort (odd-even transposition variant): in each phase the
// compared pairs are disjoint, so the swaps can run in parallel without races
void bubbleSortParallel(vector<int>& arr) {
    int n = arr.size();
    for (int phase = 0; phase < n; ++phase) {
        int first = phase % 2; // even phases compare (0,1),(2,3),...; odd phases (1,2),(3,4),...
        #pragma omp parallel for
        for (int j = first; j < n - 1; j += 2) {
            if (arr[j] > arr[j + 1]) {
                swap(arr[j], arr[j + 1]);
            }
        }
    }
}

int main() {
    int N;

    // Get vector size from user
    cout << "Enter the size of the vector: ";
    cin >> N;

    vector<int> input(N);

    // Get elements from user
    cout << "Enter the elements of the vector (space-separated): ";
    for (int i = 0; i < N; ++i) {
        cin >> input[i];
    }

    // Each sort runs on a fresh copy of the input so the timings are comparable
    vector<int> arr = input;

    // Measure performance of sequential bubble sort
    auto start = chrono::high_resolution_clock::now();
    bubbleSortSequential(arr);
    auto end = chrono::high_resolution_clock::now();
    double sequentialBubbleSortTime = chrono::duration_cast<chrono::nanoseconds>(end - start).count() / 1e6;
    cout << "Sequential Bubble Sort Time (ms): " << sequentialBubbleSortTime << endl;

    // Print the sorted vector (optional; the same pattern can be repeated after the other sorts)
    // for (int i = 0; i < N; ++i) {
    //     cout << arr[i] << " ";
    // }
    // cout << endl;

    // Measure performance of parallel bubble sort
    arr = input;
    start = chrono::high_resolution_clock::now();
    bubbleSortParallel(arr);
    end = chrono::high_resolution_clock::now();
    double parallelBubbleSortTime = chrono::duration_cast<chrono::nanoseconds>(end - start).count() / 1e6;
    cout << "Parallel Bubble Sort Time (ms): " << parallelBubbleSortTime << endl;

    // Measure performance of sequential merge sort
    arr = input;
    start = chrono::high_resolution_clock::now();
    mergeSortRecursive(arr, 0, N - 1);
    end = chrono::high_resolution_clock::now();
    double sequentialMergeSortTime = chrono::duration_cast<chrono::nanoseconds>(end - start).count() / 1e6;
    cout << "Sequential Merge Sort Time (ms): " << sequentialMergeSortTime << endl;

    // Measure performance of parallel merge sort
    arr = input;
    start = chrono::high_resolution_clock::now();
    // Tasks need an enclosing parallel region; one thread starts the recursion
    #pragma omp parallel
    #pragma omp single
    mergeSortParallel(arr, 0, N - 1);
    end = chrono::high_resolution_clock::now();
    double parallelMergeSortTime = chrono::duration_cast<chrono::nanoseconds>(end - start).count() / 1e6;
    cout << "Parallel Merge Sort Time (ms): " << parallelMergeSortTime << endl;

    return 0;
}
--------------------------------------------------------------------------------