├── OpenMP
│   ├── OpenMPNEON.cpp
│   ├── OpenMPSSE.cpp
│   └── OpenMPSpecialGauss.cpp
├── Pthread
│   ├── PthreadNEON.cpp
│   ├── PthreadSSE AVX.cpp
│   └── PthreadSpecialGauss.cpp
├── SIMD
│   ├── Guass.cpp
│   ├── NEON.cpp
│   └── SpecialGuass.cpp
├── cuda_learning
│   ├── 0.cu
│   ├── 1.cu
│   ├── 3.cu
│   └── 4.cu
├── final
│   ├── big_scale_IO.cpp
│   ├── big_scale_IO2.cpp
│   ├── bitmap_store.cpp
│   ├── sparse_store.cpp
│   ├── wrong_pthread.cpp
│   └── wrong_pthread_improved.cpp
├── homework1
│   ├── main1.cpp
│   └── main2.cpp
├── mpi
│   ├── mpi.cpp
│   ├── mpi_improved.cpp
│   ├── mpi_omp.cpp
│   ├── mpi_omp_simd.cpp
│   ├── mpi_pipeline.cpp
│   └── mpi_specialGauss.cpp
└── readme.md

/OpenMP/OpenMPNEON.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MoonLight0123/parallel_repository/4f3979fa2e1dca5a20350d4d2c01ecc85a7c6ce1/OpenMP/OpenMPNEON.cpp
--------------------------------------------------------------------------------
/OpenMP/OpenMPSSE.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MoonLight0123/parallel_repository/4f3979fa2e1dca5a20350d4d2c01ecc85a7c6ce1/OpenMP/OpenMPSSE.cpp
--------------------------------------------------------------------------------
/OpenMP/OpenMPSpecialGauss.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MoonLight0123/parallel_repository/4f3979fa2e1dca5a20350d4d2c01ecc85a7c6ce1/OpenMP/OpenMPSpecialGauss.cpp
--------------------------------------------------------------------------------
/Pthread/PthreadNEON.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MoonLight0123/parallel_repository/4f3979fa2e1dca5a20350d4d2c01ecc85a7c6ce1/Pthread/PthreadNEON.cpp
--------------------------------------------------------------------------------
/Pthread/PthreadSSE AVX.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MoonLight0123/parallel_repository/4f3979fa2e1dca5a20350d4d2c01ecc85a7c6ce1/Pthread/PthreadSSE AVX.cpp
--------------------------------------------------------------------------------
/Pthread/PthreadSpecialGauss.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MoonLight0123/parallel_repository/4f3979fa2e1dca5a20350d4d2c01ecc85a7c6ce1/Pthread/PthreadSpecialGauss.cpp
--------------------------------------------------------------------------------
/SIMD/Guass.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MoonLight0123/parallel_repository/4f3979fa2e1dca5a20350d4d2c01ecc85a7c6ce1/SIMD/Guass.cpp
--------------------------------------------------------------------------------
/SIMD/NEON.cpp:
--------------------------------------------------------------------------------
#include <iostream>   // assumed: the dump stripped every <...> include target; cout/endl are used below
#include <arm_neon.h> // assumed: NEON intrinsics, per the file name
#include <sys/time.h> // assumed: gettimeofday is used below
using namespace std;
const int N=1024;
float a[N][N];
void init()
{
    for(int i=0;i<N;i++)
    // ... (the rest of init() and the SequentialAlgorithm, ParallelAlgorithm,
    // and AlignedParallelAlgorithm definitions were lost when this dump was
    // generated; only the timing harness in main() survives) ...

int main()
{
    // assumed declarations: start and stop are dereferenced with -> in the
    // surviving lines below, so they must be timeval pointers
    timeval *start = new timeval, *stop = new timeval;
    double durationTime;

    init();
    gettimeofday(start,NULL);
    SequentialAlgorithm();
    gettimeofday(stop,NULL);
    durationTime = stop->tv_sec*1000+double(stop->tv_usec)/1000-start->tv_sec*1000-double(start->tv_usec)/1000;
    cout << " SequentialAlgorithm time: " << double(durationTime) << " ms" << endl;

    init();
    gettimeofday(start,NULL);
    ParallelAlgorithm();
    gettimeofday(stop,NULL);
    durationTime = stop->tv_sec*1000+double(stop->tv_usec)/1000-start->tv_sec*1000-double(start->tv_usec)/1000;
    cout << " ParallelAlgorithm time: " << double(durationTime) << " ms" << endl;

    init();
    gettimeofday(start,NULL);
    AlignedParallelAlgorithm();
    gettimeofday(stop,NULL);
    durationTime = stop->tv_sec*1000+double(stop->tv_usec)/1000-start->tv_sec*1000-double(start->tv_usec)/1000;
    cout << " AlignedParallelAlgorithm time: " << double(durationTime) << " ms" << endl;

    return 0;
}
--------------------------------------------------------------------------------
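Note: the three elimination routines timed in NEON.cpp were lost from this dump.
For orientation only, a minimal sketch of a NEON-vectorized Gaussian elimination
row update follows; the function name eliminateRow and its exact layout are
assumptions, not code recovered from the repository:

#include <arm_neon.h>

// Hypothetical sketch (not the repository's code): subtract a multiple of
// pivot row i from row j, processing four floats per 128-bit NEON register.
void eliminateRow(float a[][1024], int i, int j, int n)
{
    float f = a[j][i] / a[i][i];              // scalar multiplier for this row pair
    float32x4_t factor = vdupq_n_f32(f);      // broadcast it across a vector register
    int k = i + 1;
    for (; k + 4 <= n; k += 4)
    {
        float32x4_t ai = vld1q_f32(&a[i][k]); // load 4 pivot-row values
        float32x4_t aj = vld1q_f32(&a[j][k]); // load 4 target-row values
        aj = vmlsq_f32(aj, factor, ai);       // aj -= factor * ai
        vst1q_f32(&a[j][k], aj);              // store the updated values
    }
    for (; k < n; k++)                        // scalar tail for the last n % 4 columns
        a[j][k] -= f * a[i][k];
    a[j][i] = 0;
}

The AlignedParallelAlgorithm variant timed in main() presumably differs by using
aligned loads and stores on 16-byte-aligned rows, as the "Aligned" prefix suggests.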
<< " ms" << endl; 149 | 150 | init(); 151 | gettimeofday(start,NULL); 152 | AlignedParallelAlgorithm(); 153 | gettimeofday(stop,NULL); 154 | durationTime =stop->tv_sec*1000+double(stop->tv_usec)/1000-start->tv_sec*1000-double(start->tv_usec)/1000; 155 | cout << " AlignedParallelAlgorithm time: " << double(durationTime) << " ms" << endl; 156 | 157 | return 0; 158 | } 159 | -------------------------------------------------------------------------------- /SIMD/SpecialGuass.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonLight0123/parallel_repository/4f3979fa2e1dca5a20350d4d2c01ecc85a7c6ce1/SIMD/SpecialGuass.cpp -------------------------------------------------------------------------------- /cuda_learning/0.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | /* 4 | * Initialize array values on the host. 5 | */ 6 | 7 | void init(int *a, int N) 8 | { 9 | int i; 10 | for (i = 0; i < N; ++i) 11 | { 12 | a[i] = i; 13 | } 14 | } 15 | 16 | /* 17 | * Double elements in parallel on the GPU. 18 | */ 19 | 20 | __global__ 21 | void doubleElements(int *a, int N) 22 | { 23 | int i; 24 | i = blockIdx.x * blockDim.x + threadIdx.x; 25 | if (i < N) 26 | { 27 | a[i] *= 2; 28 | } 29 | } 30 | 31 | /* 32 | * Check all elements have been doubled on the host. 33 | */ 34 | 35 | bool checkElementsAreDoubled(int *a, int N) 36 | { 37 | int i; 38 | for (i = 0; i < N; ++i) 39 | { 40 | if (a[i] != i*2) return false; 41 | } 42 | return true; 43 | } 44 | 45 | int main() 46 | { 47 | int N = 100; 48 | int *a; 49 | 50 | size_t size = N * sizeof(int); 51 | 52 | /* 53 | * Refactor this memory allocation to provide a pointer 54 | * `a` that can be used on both the host and the device. 55 | */ 56 | 57 | //a = (int *)malloc(size); 58 | cudaMallocManaged(&a, size); 59 | init(a, N); 60 | 61 | size_t threads_per_block = 10; 62 | size_t number_of_blocks = 10; 63 | 64 | /* 65 | * This launch will not work until the pointer `a` is also 66 | * available to the device. 67 | */ 68 | 69 | doubleElements<<>>(a, N); 70 | cudaDeviceSynchronize(); 71 | 72 | bool areDoubled = checkElementsAreDoubled(a, N); 73 | printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE"); 74 | 75 | /* 76 | * Refactor to free memory that has been allocated to be 77 | * accessed by both the host and the device. 78 | */ 79 | 80 | cudaFree(a); 81 | } 82 | -------------------------------------------------------------------------------- /cuda_learning/1.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | /* 4 | * Refactor firstParallel so that it can run on the GPU. 5 | */ 6 | 7 | __global__ void firstParallel() 8 | { 9 | printf("This should be running in parallel.\n"); 10 | } 11 | 12 | int main() 13 | { 14 | /* 15 | * Refactor this call to firstParallel to execute in parallel 16 | * on the GPU. 17 | */ 18 | 19 | firstParallel<<<3,3>>>(); 20 | cudaDeviceSynchronize(); 21 | /* 22 | * Some code is needed below so that the CPU will wait 23 | * for the GPU kernels to complete before proceeding. 24 | */ 25 | 26 | } 27 | 28 | #include 29 | 30 | __global__ void printSuccessForCorrectExecutionConfiguration() 31 | { 32 | 33 | if(threadIdx.x == 1023 && blockIdx.x == 255) 34 | { 35 | printf("Success!\n"); 36 | } 37 | } 38 | 39 | int main() 40 | { 41 | /* 42 | * This is one possible execution context that will make 43 | * the kernel launch print its success message. 
/cuda_learning/1.cu:
--------------------------------------------------------------------------------
#include <stdio.h> // assumed: the dump stripped the include target

/*
 * firstParallel is marked __global__ so that it runs on the GPU.
 */

__global__ void firstParallel()
{
  printf("This should be running in parallel.\n");
}

int main()
{
  /*
   * Launch firstParallel on the GPU with 3 blocks of 3 threads.
   */

  firstParallel<<<3,3>>>();

  /*
   * Kernel launches are asynchronous, so the CPU must wait for
   * the GPU kernel to complete before proceeding.
   */

  cudaDeviceSynchronize();
}

/*
 * Second exercise, kept in the same file. (Note: a second main()
 * will not compile in the same translation unit; build the two
 * halves separately.)
 */

#include <stdio.h>

__global__ void printSuccessForCorrectExecutionConfiguration()
{
  if(threadIdx.x == 1023 && blockIdx.x == 255)
  {
    printf("Success!\n");
  }
}

int main()
{
  /*
   * This is one possible execution configuration that will make the
   * kernel print its success message: 256 blocks of 1024 threads, so
   * a thread with blockIdx.x == 255 and threadIdx.x == 1023 exists.
   */

  printSuccessForCorrectExecutionConfiguration<<<256, 1024>>>();

  /*
   * Kernel execution is asynchronous, so sync on its completion.
   */

  cudaDeviceSynchronize();
}
--------------------------------------------------------------------------------
/cuda_learning/3.cu:
--------------------------------------------------------------------------------
#include <stdio.h>  // assumed: the dump stripped the include target; printf is used below
#include <stdlib.h> // added: exit() is used below

/*
 * Host function that initializes every element of `a` to `num`.
 */

void initWith(float num, float *a, int N)
{
  for(int i = 0; i < N; ++i)
  {
    a[i] = num;
  }
}

/*
 * Device kernel stores into `result` the sum of each same-indexed
 * value of `a` and `b`. The grid-stride loop lets any grid size
 * cover all N elements.
 */

__global__
void addVectorsInto(float *result, float *a, float *b, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for(int i = index; i < N; i += stride)
  {
    result[i] = a[i] + b[i];
  }
}

/*
 * Host function to confirm that every value in `vector` equals
 * the same `target` value.
 */

void checkElementsAre(float target, float *vector, int N)
{
  for(int i = 0; i < N; i++)
  {
    if(vector[i] != target)
    {
      printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
      exit(1);
    }
  }
  printf("Success! All values calculated correctly.\n");
}

int main()
{
  const int N = 2<<24;
  size_t size = N * sizeof(float);

  float *a;
  float *b;
  float *c;

  cudaMallocManaged(&a, size);
  cudaMallocManaged(&b, size);
  cudaMallocManaged(&c, size);

  initWith(3, a, N);
  initWith(4, b, N);
  initWith(0, c, N);

  size_t threadsPerBlock;
  size_t numberOfBlocks;

  /*
   * nsys should register performance changes when the execution
   * configuration is updated.
   */

  threadsPerBlock = 1000;
  numberOfBlocks = 1024;

  cudaError_t addVectorsErr;
  cudaError_t asyncErr;

  addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

  addVectorsErr = cudaGetLastError();
  if(addVectorsErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(addVectorsErr));

  asyncErr = cudaDeviceSynchronize();
  if(asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));

  checkElementsAre(7, c, N);

  cudaFree(a);
  cudaFree(b);
  cudaFree(c);
}
--------------------------------------------------------------------------------
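3.cu hard-codes 1000 threads per block and 1024 blocks, and its comment invites
profiling the effect of different configurations with nsys. Note that 1000 is not
a multiple of the 32-thread warp size, so the last warp of every block runs
partially idle. A common refinement (an assumption here, not code from this file)
is to derive the configuration from the device's streaming-multiprocessor count:

// Hypothetical execution configuration derived from device properties rather
// than hard-coded; the grid then scales to whatever GPU it runs on.
int deviceId;
cudaGetDevice(&deviceId);
cudaDeviceProp props;
cudaGetDeviceProperties(&props, deviceId);
size_t threadsPerBlock = 256;                           // a multiple of the warp size
size_t numberOfBlocks = 32 * props.multiProcessorCount; // several blocks per SM
addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

Because addVectorsInto uses a grid-stride loop, any of these configurations still
covers all N elements.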
/cuda_learning/4.cu:
--------------------------------------------------------------------------------
#include <math.h>   // assumed: the dump stripped the include targets; rsqrtf is used below
#include <stdio.h>  // assumed
#include <stdlib.h> // assumed: atoi is used below
#include "timer.h"
#include "files.h"

#define SOFTENING 1e-9f

/*
 * Each body contains x, y, and z coordinate positions,
 * as well as velocities in the x, y, and z directions.
 */

typedef struct { float x, y, z, vx, vy, vz; } Body;

/*
 * Advance each body's position by its velocity over one timestep dt.
 */

__global__
void integratePosition(Body *p, float dt, int n) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = index; i < n; i += stride) {
    p[i].x += p[i].vx*dt;
    p[i].y += p[i].vy*dt;
    p[i].z += p[i].vz*dt;
  }
}

/*
 * Calculate the gravitational impact of all bodies in the system
 * on all others, accumulating the net force into each body's velocity.
 */

__global__
void bodyForce(Body *p, float dt, int n) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = index; i < n; i += stride) {
    float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
    for (int j = 0; j < n; j++) {
      float dx = p[j].x - p[i].x;
      float dy = p[j].y - p[i].y;
      float dz = p[j].z - p[i].z;
      float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; // SOFTENING avoids division by zero when i == j
      float invDist = rsqrtf(distSqr);
      float invDist3 = invDist * invDist * invDist;
      Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
    }

    p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
  }
}


int main(const int argc, const char** argv) {

  // The assessment will test against both 2<<11 and 2<<15 bodies.
  // Feel free to pass the command line argument 15 when you generate ./nbody report files
  int nBodies = 2<<11;
  if (argc > 1) nBodies = 2<<atoi(argv[1]); // assumed completion; the dump is truncated mid-statement here
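The dump is cut off in the middle of 4.cu's main(). For context, the two kernels
above are typically alternated once per timestep; the driver sketch below is an
assumption (dt, nIters, and the managed allocation of p are illustrative, not
recovered from this file):

// Hypothetical driver loop, not the repository's code.
Body *p;
cudaMallocManaged(&p, nBodies * sizeof(Body)); // bodies visible to host and device

size_t threadsPerBlock = 256;
size_t numberOfBlocks = (nBodies + threadsPerBlock - 1) / threadsPerBlock;
const float dt = 0.01f; // illustrative timestep
const int nIters = 10;  // illustrative iteration count

for (int iter = 0; iter < nIters; iter++) {
  bodyForce<<<numberOfBlocks, threadsPerBlock>>>(p, dt, nBodies);         // velocities first
  integratePosition<<<numberOfBlocks, threadsPerBlock>>>(p, dt, nBodies); // then positions
}
cudaDeviceSynchronize();
cudaFree(p);

Both launches go to the default stream, so each iteration's integratePosition
cannot start until its bodyForce has finished; one synchronization after the
loop is enough.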