├── .gitignore ├── HW1 ├── part1 │ ├── Makefile │ ├── PPintrin.cpp │ ├── PPintrin.h │ ├── def.h │ ├── logger.cpp │ ├── logger.h │ ├── main.cpp │ ├── serialOP.cpp │ └── vectorOP.cpp ├── part2 │ ├── Makefile │ ├── assembly │ │ └── test2.vec.s │ ├── fasttime.h │ ├── main.cpp │ ├── test.h │ ├── test1.cpp │ ├── test2.cpp │ ├── test2.cpp.patch │ └── test3.cpp └── submit │ ├── url.txt │ └── vectorOP.cpp ├── HW2 ├── part1 │ ├── Makefile │ ├── pi.c │ ├── report.txt │ └── shishua-avx2.h ├── part2 │ ├── Makefile │ ├── common │ │ ├── CycleTimer.h │ │ └── ppm.cpp │ ├── main.cpp │ ├── mandelbrotSerial.cpp │ ├── mandelbrotThread.cpp │ └── report.txt └── submit │ ├── part1 │ ├── Makefile │ ├── pi.c │ └── shishua-avx2.h │ ├── part2 │ └── mandelbrotThread.cpp │ └── url.txt ├── HW3 ├── part1 │ ├── Makefile │ ├── README │ ├── cg.c │ ├── cg_impl.c │ ├── cg_impl.h │ ├── common │ │ ├── c_timers.c │ │ ├── randdp.c │ │ ├── randdp.h │ │ ├── timers.h │ │ ├── type.h │ │ ├── wtime.c │ │ ├── wtime.h │ │ └── wtime_sgi64.c │ ├── def_cg.a │ ├── globals.h │ ├── grade.c │ ├── make.common │ ├── ref_cg.a │ └── report.txt ├── part2 │ ├── breadth_first_search │ │ ├── Makefile │ │ ├── bfs.cpp │ │ ├── bfs.h │ │ ├── grade.cpp │ │ ├── main.cpp │ │ ├── ref_bfs.a │ │ └── report.txt │ ├── common │ │ ├── CycleTimer.h │ │ ├── contracts.h │ │ ├── grade.h │ │ ├── graph.cpp │ │ ├── graph.h │ │ └── graph_internal.h │ ├── doc │ │ └── bfs.pdf │ ├── graphs │ │ └── README.md │ ├── page_rank │ │ ├── Makefile │ │ ├── grade.cpp │ │ ├── main.cpp │ │ ├── page_rank.cpp │ │ ├── page_rank.h │ │ ├── ref_pr.a │ │ └── report.txt │ └── tools │ │ ├── Makefile │ │ ├── graphTools.cpp │ │ └── plaintext.graph └── submit │ ├── bfs.cpp │ ├── cg_impl.c │ └── page_rank.cpp ├── HW4 ├── part1 │ ├── Makefile │ ├── hello.cc │ ├── hosts_mpi.txt │ ├── hosts_part1.txt │ ├── pi_block_linear.cc │ ├── pi_block_tree.cc │ ├── pi_gather.cc │ ├── pi_nonblock_linear.cc │ ├── pi_reduce.cc │ ├── report.txt │ └── test.py ├── part2 │ ├── Makefile │ ├── 
hosts_part2_4slots.txt │ ├── hosts_part2_7slots.txt │ ├── main.cc │ ├── matrix.cc │ ├── report.txt │ ├── test.py │ └── testdata │ │ ├── ans0_1 │ │ ├── ans0_2 │ │ ├── data0_1 │ │ └── data0_2 ├── setting │ ├── config │ └── hosts.txt └── submit │ ├── part1 │ ├── hello.cc │ ├── pi_block_linear.cc │ ├── pi_block_tree.cc │ ├── pi_gather.cc │ ├── pi_nonblock_linear.cc │ └── pi_reduce.cc │ ├── part2 │ ├── Makefile │ ├── main.cc │ └── matrix.cc │ └── url.txt ├── HW5 ├── Makefile ├── common │ ├── CycleTimer.h │ └── ppm.cpp ├── kernel.h ├── kernel1.cu ├── kernel2.cu ├── kernel3.cu ├── kernel4.cu ├── main.cpp ├── mandelbrotSerial.cpp ├── mandelbrotThread.cpp ├── mandelbrotThreadRef.a ├── mandelbrotThreadRef50.a └── mandelbrotThreadRefAll.a ├── HW6 ├── CycleTimer.h ├── Makefile ├── bmpfuncs.c ├── bmpfuncs.h ├── filter1.csv ├── filter2.csv ├── filter3.csv ├── helper.c ├── helper.h ├── hostFE.c ├── hostFE.h ├── input.bmp ├── kernel.cl ├── main.c ├── output.bmp ├── ref.bmp ├── serialConv.c └── serialConv.h └── Readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | # HW6 2 | HW6/conv 3 | 4 | # HW5 5 | HW5/mandelbrot 6 | HW5/*.ppm 7 | 8 | # HW4 9 | HW4/part1/mpi_hello 10 | HW4/part1/pi_block_linear 11 | HW4/part1/pi_block_tree 12 | HW4/part1/pi_gather 13 | HW4/part1/pi_nonblock_linear 14 | HW4/part1/pi_reduce 15 | HW4/part2/matmul 16 | 17 | # HW3 18 | HW3/part1/cg 19 | HW3/part1/cg_grader 20 | HW3/part2/breadth_first_search/bfs 21 | HW3/part2/breadth_first_search/bfs_grader 22 | HW3/part2/page_rank/pr 23 | HW3/part2/page_rank/pr_grader 24 | 25 | # HW2 26 | HW2/part1/pi.out 27 | HW2/part2/mandelbrot 28 | HW2/part2/*.ppm 29 | 30 | # HW1 31 | HW1/part1/myexp 32 | 33 | # Profiling file 34 | gmon.out 35 | profiling_result 36 | perf.data 37 | perf.data.old 38 | 39 | # Common Extensions 40 | *.o 41 | *.zip 42 | *.graph -------------------------------------------------------------------------------- /HW1/part1/Makefile: 
CXX := g++
CXXFLAGS := -I./common -O3 -std=c++17 -Wall

ifeq (/usr/bin/g++-10,$(wildcard /usr/bin/g++-10*))
CXX=g++-10
endif

all: myexp

logger.o: logger.cpp logger.h PPintrin.h PPintrin.cpp def.h
	$(CXX) $(CXXFLAGS) -c logger.cpp

PPintrin.o: PPintrin.cpp PPintrin.h logger.cpp logger.h def.h
	$(CXX) $(CXXFLAGS) -c PPintrin.cpp

# Use $(CXX)/$(CXXFLAGS) for the link step too: the rule previously
# hard-coded plain `g++ -I./common`, so the g++-10 override and the
# -O3/-std=c++17/-Wall flags were silently skipped for main.cpp,
# serialOP.cpp and vectorOP.cpp.
myexp: PPintrin.o logger.o main.cpp serialOP.cpp vectorOP.cpp
	$(CXX) $(CXXFLAGS) logger.o PPintrin.o main.cpp serialOP.cpp vectorOP.cpp -o myexp

clean:
	rm -f *.o *.s myexp *~
| // Count the number of 1s in maska 45 | int _pp_cntbits(__pp_mask &maska); 46 | 47 | // Set register to value if vector lane is active 48 | // otherwise keep the old value 49 | void _pp_vset_float(__pp_vec_float &vecResult, float value, __pp_mask &mask); 50 | void _pp_vset_int(__pp_vec_int &vecResult, int value, __pp_mask &mask); 51 | // For user's convenience, returns a vector register with all lanes initialized to value 52 | __pp_vec_float _pp_vset_float(float value); 53 | __pp_vec_int _pp_vset_int(int value); 54 | 55 | // Copy values from vector register src to vector register dest if vector lane active 56 | // otherwise keep the old value 57 | void _pp_vmove_float(__pp_vec_float &dest, __pp_vec_float &src, __pp_mask &mask); 58 | void _pp_vmove_int(__pp_vec_int &dest, __pp_vec_int &src, __pp_mask &mask); 59 | 60 | // Load values from array src to vector register dest if vector lane active 61 | // otherwise keep the old value 62 | void _pp_vload_float(__pp_vec_float &dest, float* src, __pp_mask &mask); 63 | void _pp_vload_int(__pp_vec_int &dest, int* src, __pp_mask &mask); 64 | 65 | // Store values from vector register src to array dest if vector lane active 66 | // otherwise keep the old value 67 | void _pp_vstore_float(float* dest, __pp_vec_float &src, __pp_mask &mask); 68 | void _pp_vstore_int(int* dest, __pp_vec_int &src, __pp_mask &mask); 69 | 70 | // Return calculation of (veca + vecb) if vector lane active 71 | // otherwise keep the old value 72 | void _pp_vadd_float(__pp_vec_float &vecResult, __pp_vec_float &veca, __pp_vec_float &vecb, __pp_mask &mask); 73 | void _pp_vadd_int(__pp_vec_int &vecResult, __pp_vec_int &veca, __pp_vec_int &vecb, __pp_mask &mask); 74 | 75 | // Return calculation of (veca - vecb) if vector lane active 76 | // otherwise keep the old value 77 | void _pp_vsub_float(__pp_vec_float &vecResult, __pp_vec_float &veca, __pp_vec_float &vecb, __pp_mask &mask); 78 | void _pp_vsub_int(__pp_vec_int &vecResult, __pp_vec_int &veca, 
__pp_vec_int &vecb, __pp_mask &mask); 79 | 80 | // Return calculation of (veca * vecb) if vector lane active 81 | // otherwise keep the old value 82 | void _pp_vmult_float(__pp_vec_float &vecResult, __pp_vec_float &veca, __pp_vec_float &vecb, __pp_mask &mask); 83 | void _pp_vmult_int(__pp_vec_int &vecResult, __pp_vec_int &veca, __pp_vec_int &vecb, __pp_mask &mask); 84 | 85 | // Return calculation of (veca / vecb) if vector lane active 86 | // otherwise keep the old value 87 | void _pp_vdiv_float(__pp_vec_float &vecResult, __pp_vec_float &veca, __pp_vec_float &vecb, __pp_mask &mask); 88 | void _pp_vdiv_int(__pp_vec_int &vecResult, __pp_vec_int &veca, __pp_vec_int &vecb, __pp_mask &mask); 89 | 90 | 91 | // Return calculation of absolute value abs(veca) if vector lane active 92 | // otherwise keep the old value 93 | void _pp_vabs_float(__pp_vec_float &vecResult, __pp_vec_float &veca, __pp_mask &mask); 94 | void _pp_vabs_int(__pp_vec_int &vecResult, __pp_vec_int &veca, __pp_mask &mask); 95 | 96 | // Return a mask of (veca > vecb) if vector lane active 97 | // otherwise keep the old value 98 | void _pp_vgt_float(__pp_mask &vecResult, __pp_vec_float &veca, __pp_vec_float &vecb, __pp_mask &mask); 99 | void _pp_vgt_int(__pp_mask &vecResult, __pp_vec_int &veca, __pp_vec_int &vecb, __pp_mask &mask); 100 | 101 | // Return a mask of (veca < vecb) if vector lane active 102 | // otherwise keep the old value 103 | void _pp_vlt_float(__pp_mask &vecResult, __pp_vec_float &veca, __pp_vec_float &vecb, __pp_mask &mask); 104 | void _pp_vlt_int(__pp_mask &vecResult, __pp_vec_int &veca, __pp_vec_int &vecb, __pp_mask &mask); 105 | 106 | // Return a mask of (veca == vecb) if vector lane active 107 | // otherwise keep the old value 108 | void _pp_veq_float(__pp_mask &vecResult, __pp_vec_float &veca, __pp_vec_float &vecb, __pp_mask &mask); 109 | void _pp_veq_int(__pp_mask &vecResult, __pp_vec_int &veca, __pp_vec_int &vecb, __pp_mask &mask); 110 | 111 | // Adds up adjacent pairs of elements, 
so 112 | // [0 1 2 3] -> [0+1 0+1 2+3 2+3] 113 | void _pp_hadd_float(__pp_vec_float &vecResult, __pp_vec_float &vec); 114 | 115 | // Performs an even-odd interleaving where all even-indexed elements move to front half 116 | // of the array and odd-indexed to the back half, so 117 | // [0 1 2 3 4 5 6 7] -> [0 2 4 6 1 3 5 7] 118 | void _pp_interleave_float(__pp_vec_float &vecResult, __pp_vec_float &vec); 119 | 120 | // Add a customized log to help debugging 121 | void addUserLog(const char * logStr); 122 | 123 | #endif 124 | -------------------------------------------------------------------------------- /HW1/part1/def.h: -------------------------------------------------------------------------------- 1 | // Define vector unit width here 2 | #define VECTOR_WIDTH 16 3 | #define EXP_MAX 10 4 | -------------------------------------------------------------------------------- /HW1/part1/logger.cpp: -------------------------------------------------------------------------------- 1 | #include "logger.h" 2 | #include "PPintrin.h" 3 | 4 | void Logger::addLog(const char *instruction, __pp_mask mask, int N) 5 | { 6 | Log newLog; 7 | strcpy(newLog.instruction, instruction); 8 | newLog.mask = 0; 9 | for (int i = 0; i < N; i++) 10 | { 11 | if (mask.value[i]) 12 | { 13 | newLog.mask |= (((unsigned long long)1) << i); 14 | stats.utilized_lane++; 15 | } 16 | } 17 | stats.total_lane += N; 18 | stats.total_instructions += (N > 0); 19 | log.push_back(newLog); 20 | } 21 | 22 | void Logger::printStats() 23 | { 24 | printf("****************** Printing Vector Unit Statistics *******************\n"); 25 | printf("Vector Width: %d\n", VECTOR_WIDTH); 26 | printf("Total Vector Instructions: %lld\n", stats.total_instructions); 27 | printf("Vector Utilization: %.1f%%\n", (double)stats.utilized_lane / stats.total_lane * 100); 28 | printf("Utilized Vector Lanes: %lld\n", stats.utilized_lane); 29 | printf("Total Vector Lanes: %lld\n", stats.total_lane); 30 | } 31 | 32 | void Logger::printLog() 33 | 
{ 34 | printf("***************** Printing Vector Unit Execution Log *****************\n"); 35 | printf(" Instruction | Vector Lane Occupancy ('*' for active, '_' for inactive)\n"); 36 | printf("------------- --------------------------------------------------------\n"); 37 | for (int i = 0; i < log.size(); i++) 38 | { 39 | printf("%12s | ", log[i].instruction); 40 | for (int j = 0; j < VECTOR_WIDTH; j++) 41 | { 42 | if (log[i].mask & (((unsigned long long)1) << j)) 43 | { 44 | printf("*"); 45 | } 46 | else 47 | { 48 | printf("_"); 49 | } 50 | } 51 | printf("\n"); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /HW1/part1/logger.h: -------------------------------------------------------------------------------- 1 | #ifndef LOGGER_H_ 2 | #define LOGGER_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | #define MAX_INST_LEN 32 10 | 11 | struct __pp_mask; 12 | 13 | struct Log { 14 | char instruction[MAX_INST_LEN]; 15 | unsigned long long mask; // support vector width up to 64 16 | }; 17 | 18 | struct Statistics { 19 | unsigned long long utilized_lane; 20 | unsigned long long total_lane; 21 | unsigned long long total_instructions; 22 | }; 23 | 24 | class Logger { 25 | private: 26 | vector log; 27 | Statistics stats; 28 | 29 | public: 30 | void addLog(const char * instruction, __pp_mask mask, int N = 0); 31 | void printStats(); 32 | void printLog(); 33 | void refresh() { 34 | stats.total_instructions = 0; 35 | stats.total_lane = 0; 36 | stats.utilized_lane = 0; 37 | }; 38 | }; 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /HW1/part1/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "PPintrin.h" 6 | #include "logger.h" 7 | #include 8 | #include "def.h" 9 | using namespace std; 10 | 11 | Logger PPLogger; 12 | 13 | void usage(const char 
*progname); 14 | void initValue(float *values, int *exponents, float *output, float *gold, unsigned int N); 15 | void absSerial(float *values, float *output, int N); 16 | void absVector(float *values, float *output, int N); 17 | void clampedExpSerial(float *values, int *exponents, float *output, int N); 18 | void clampedExpVector(float *values, int *exponents, float *output, int N); 19 | float arraySumSerial(float *values, int N); 20 | float arraySumVector(float *values, int N); 21 | bool verifyResult(float *values, int *exponents, float *output, float *gold, int N); 22 | 23 | int main(int argc, char *argv[]) 24 | { 25 | int N = 16; 26 | bool printLog = false; 27 | 28 | // parse commandline options //////////////////////////////////////////// 29 | int opt; 30 | static struct option long_options[] = { 31 | {"size", 1, 0, 's'}, 32 | {"log", 0, 0, 'l'}, 33 | {"help", 0, 0, '?'}, 34 | {0, 0, 0, 0}}; 35 | 36 | while ((opt = getopt_long(argc, argv, "s:l?", long_options, NULL)) != EOF) 37 | { 38 | 39 | switch (opt) 40 | { 41 | case 's': 42 | N = atoi(optarg); 43 | if (N <= 0) 44 | { 45 | printf("Error: Workload size is set to %d (<0).\n", N); 46 | return -1; 47 | } 48 | break; 49 | case 'l': 50 | printLog = true; 51 | break; 52 | case '?': 53 | default: 54 | usage(argv[0]); 55 | return 1; 56 | } 57 | } 58 | 59 | float *values = new float[N + VECTOR_WIDTH]; 60 | int *exponents = new int[N + VECTOR_WIDTH]; 61 | float *output = new float[N + VECTOR_WIDTH]; 62 | float *gold = new float[N + VECTOR_WIDTH]; 63 | initValue(values, exponents, output, gold, N); 64 | 65 | clampedExpSerial(values, exponents, gold, N); 66 | clampedExpVector(values, exponents, output, N); 67 | 68 | //absSerial(values, gold, N); 69 | //absVector(values, output, N); 70 | 71 | printf("\e[1;31mCLAMPED EXPONENT\e[0m (required) \n"); 72 | bool clampedCorrect = verifyResult(values, exponents, output, gold, N); 73 | if (printLog) 74 | PPLogger.printLog(); 75 | PPLogger.printStats(); 76 | 77 | 
printf("************************ Result Verification *************************\n"); 78 | if (!clampedCorrect) 79 | { 80 | printf("@@@ ClampedExp Failed!!!\n"); 81 | } 82 | else 83 | { 84 | printf("ClampedExp Passed!!!\n"); 85 | } 86 | 87 | PPLogger.refresh(); 88 | 89 | printf("\n\e[1;31mARRAY SUM\e[0m (bonus) \n"); 90 | if (N % VECTOR_WIDTH == 0) 91 | { 92 | float sumGold = arraySumSerial(values, N); 93 | float sumOutput = arraySumVector(values, N); 94 | 95 | if (printLog) 96 | PPLogger.printLog(); 97 | PPLogger.printStats(); 98 | 99 | printf("************************ Result Verification *************************\n"); 100 | 101 | float epsilon = 0.1; 102 | bool sumCorrect = abs(sumGold - sumOutput) < epsilon * 2; 103 | if (!sumCorrect) 104 | { 105 | printf("Expected %f, got %f\n.", sumGold, sumOutput); 106 | printf("@@@ ArraySum Failed!!!\n"); 107 | } 108 | else 109 | { 110 | printf("ArraySum Passed!!!\n"); 111 | } 112 | } 113 | else 114 | { 115 | printf("Must have N %% VECTOR_WIDTH == 0 for this problem (VECTOR_WIDTH is %d)\n", VECTOR_WIDTH); 116 | } 117 | 118 | delete[] values; 119 | delete[] exponents; 120 | delete[] output; 121 | delete[] gold; 122 | 123 | return 0; 124 | } 125 | 126 | void usage(const char *progname) 127 | { 128 | printf("Usage: %s [options]\n", progname); 129 | printf("Program Options:\n"); 130 | printf(" -s --size Use workload size N (Default = 16)\n"); 131 | printf(" -l --log Print vector unit execution log\n"); 132 | printf(" -? 
--help This message\n"); 133 | } 134 | 135 | void initValue(float *values, int *exponents, float *output, float *gold, unsigned int N) 136 | { 137 | 138 | for (unsigned int i = 0; i < N + VECTOR_WIDTH; i++) 139 | { 140 | // random input values 141 | values[i] = -1.f + 4.f * static_cast(rand()) / RAND_MAX; 142 | exponents[i] = rand() % EXP_MAX; 143 | output[i] = 0.f; 144 | gold[i] = 0.f; 145 | } 146 | } 147 | 148 | bool verifyResult(float *values, int *exponents, float *output, float *gold, int N) 149 | { 150 | int incorrect = -1; 151 | float epsilon = 0.00001; 152 | for (int i = 0; i < N + VECTOR_WIDTH; i++) 153 | { 154 | if (abs(output[i] - gold[i]) > epsilon) 155 | { 156 | incorrect = i; 157 | break; 158 | } 159 | } 160 | 161 | if (incorrect != -1) 162 | { 163 | if (incorrect >= N) 164 | printf("You have written to out of bound value!\n"); 165 | printf("Wrong calculation at value[%d]!\n", incorrect); 166 | printf("value = "); 167 | for (int i = 0; i < N; i++) 168 | { 169 | printf("% f ", values[i]); 170 | } 171 | printf("\n"); 172 | 173 | printf("exp = "); 174 | for (int i = 0; i < N; i++) 175 | { 176 | printf("% 9d ", exponents[i]); 177 | } 178 | printf("\n"); 179 | 180 | printf("output = "); 181 | for (int i = 0; i < N; i++) 182 | { 183 | printf("% f ", output[i]); 184 | } 185 | printf("\n"); 186 | 187 | printf("gold = "); 188 | for (int i = 0; i < N; i++) 189 | { 190 | printf("% f ", gold[i]); 191 | } 192 | printf("\n"); 193 | return false; 194 | } 195 | printf("Results matched with answer!\n"); 196 | return true; 197 | } 198 | -------------------------------------------------------------------------------- /HW1/part1/serialOP.cpp: -------------------------------------------------------------------------------- 1 | // computes the absolute value of all elements in the input array 2 | // values, stores result in output 3 | void absSerial(float *values, float *output, int N) 4 | { 5 | for (int i = 0; i < N; i++) 6 | { 7 | float x = values[i]; 8 | if (x < 0) 9 | 
/* Serial reference: for each element, raise values[i] to the integer power
 * exponents[i] by repeated multiplication, clamping the result from above
 * at 9.999999, and store it in output[i].
 *
 * Starting the accumulator at 1.0 and multiplying `power` times yields
 * bit-identical results to the original "start at x, multiply y-1 times"
 * formulation (1.0f * x == x exactly in IEEE arithmetic), and handles the
 * exponent-zero case without a special branch. */
void clampedExpSerial(float *values, int *exponents, float *output, int N)
{
    for (int i = 0; i < N; i++)
    {
        float base = values[i];
        int power = exponents[i];

        float acc = 1.f;
        while (power-- > 0)
            acc *= base;

        if (acc > 9.999999f)
            acc = 9.999999f;

        output[i] = acc;
    }
}
14 | for (int i = 0; i < N; i += VECTOR_WIDTH) 15 | { 16 | 17 | // All ones 18 | maskAll = _pp_init_ones(); 19 | 20 | // All zeros 21 | maskIsNegative = _pp_init_ones(0); 22 | 23 | // Load vector of values from contiguous memory addresses 24 | _pp_vload_float(x, values + i, maskAll); // x = values[i]; 25 | 26 | // Set mask according to predicate 27 | _pp_vlt_float(maskIsNegative, x, zero, maskAll); // if (x < 0) { 28 | 29 | // Execute instruction using mask ("if" clause) 30 | _pp_vsub_float(result, zero, x, maskIsNegative); // output[i] = -x; 31 | 32 | // Inverse maskIsNegative to generate "else" mask 33 | maskIsNotNegative = _pp_mask_not(maskIsNegative); // } else { 34 | 35 | // Execute instruction ("else" clause) 36 | _pp_vload_float(result, values + i, maskIsNotNegative); // output[i] = x; } 37 | 38 | // Write results back to memory 39 | _pp_vstore_float(output + i, result, maskAll); 40 | } 41 | } 42 | 43 | void clampedExpVector(float *values, int *exponents, float *output, int N) 44 | { 45 | // 46 | // PP STUDENTS TODO: Implement your vectorized version of 47 | // clampedExpSerial() here. 48 | // 49 | // Your solution should work for any value of 50 | // N and VECTOR_WIDTH, not just when VECTOR_WIDTH divides N 51 | // 52 | __pp_vec_int zero, one; 53 | __pp_vec_float clampedValue; 54 | 55 | zero = _pp_vset_int(0); 56 | one = _pp_vset_int(1); 57 | clampedValue = _pp_vset_float(9.999999f); 58 | 59 | for (int i = 0; i < N; i += VECTOR_WIDTH) 60 | { 61 | __pp_vec_float x, result; 62 | __pp_vec_int y; 63 | int maskWidth; 64 | __pp_mask mask, maskEq0, maskNeq0, maskGtCV; 65 | 66 | maskWidth = i + VECTOR_WIDTH <= N ? 
VECTOR_WIDTH : N % VECTOR_WIDTH; 67 | mask = _pp_init_ones(maskWidth); 68 | 69 | maskEq0 = _pp_init_ones(0); 70 | maskNeq0 = _pp_init_ones(0); 71 | maskGtCV = _pp_init_ones(0); 72 | 73 | _pp_vload_float(x, values + i, mask); 74 | _pp_vload_int(y, exponents + i, mask); 75 | 76 | _pp_veq_int(maskEq0, y, zero, mask); 77 | _pp_vset_float(result, 1.f, maskEq0); 78 | 79 | maskNeq0 = _pp_mask_not(maskEq0); 80 | 81 | _pp_vmove_float(result, x, maskNeq0); 82 | 83 | _pp_vsub_int(y, y, one, maskNeq0); 84 | 85 | while (1) 86 | { 87 | int cnt; 88 | __pp_mask maskGt0; 89 | 90 | maskGt0 = _pp_init_ones(0); 91 | 92 | _pp_vgt_int(maskGt0, y, zero, mask); 93 | 94 | cnt = _pp_cntbits(maskGt0); 95 | 96 | if (!cnt) { 97 | break; 98 | } 99 | 100 | _pp_vmult_float(result, result, x, maskGt0); 101 | _pp_vsub_int(y, y, one, maskGt0); 102 | } 103 | 104 | _pp_vgt_float(maskGtCV, result, clampedValue, mask); 105 | _pp_vmove_float(result, clampedValue, maskGtCV); 106 | 107 | _pp_vstore_float(output + i, result, mask); 108 | } 109 | } 110 | 111 | // returns the sum of all elements in values 112 | // You can assume N is a multiple of VECTOR_WIDTH 113 | // You can assume VECTOR_WIDTH is a power of 2 114 | float arraySumVector(float *values, int N) 115 | { 116 | 117 | // 118 | // PP STUDENTS TODO: Implement your vectorized version of arraySumSerial here 119 | // 120 | __pp_vec_float sum; 121 | __pp_mask mask; 122 | 123 | sum = _pp_vset_float(0); 124 | mask = _pp_init_ones(); 125 | 126 | for (int i = 0; i < N; i += VECTOR_WIDTH) 127 | { 128 | __pp_vec_float vec; 129 | 130 | _pp_vload_float(vec, values + i, mask); 131 | _pp_vadd_float(sum, sum, vec, mask); 132 | } 133 | 134 | for (int i = VECTOR_WIDTH; i != 1; i /= 2) { 135 | _pp_hadd_float(sum, sum); 136 | _pp_interleave_float(sum, sum); 137 | } 138 | 139 | return sum.value[0]; 140 | } -------------------------------------------------------------------------------- /HW1/part2/Makefile: 
-------------------------------------------------------------------------------- 1 | TARGET := test_auto_vectorize 2 | 3 | OBJS := main.o test1.o test2.o test3.o 4 | 5 | CXX := clang++ 6 | 7 | ifeq (/usr/bin/clang++-11,$(wildcard /usr/bin/clang++-11*)) 8 | CXX=clang++-11 9 | endif 10 | 11 | CXXFLAGS := -I./common -O3 -std=c++17 -Wall 12 | 13 | ifeq ($(ASSEMBLE),1) 14 | CXXFLAGS += -S 15 | endif 16 | ifeq ($(VECTORIZE),1) 17 | CXXFLAGS += -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize 18 | SUFFIX := .vec 19 | else 20 | CXXFLAGS += -fno-vectorize 21 | SUFFIX := .novec 22 | endif 23 | ifeq ($(RESTRICT),1) 24 | SUFFIX := $(SUFFIX).restr 25 | endif 26 | ifeq ($(ALIGN),1) 27 | SUFFIX := $(SUFFIX).align 28 | endif 29 | ifeq ($(AVX2),1) 30 | CXXFLAGS += -mavx2 31 | SUFFIX := $(SUFFIX).avx2 32 | endif 33 | ifeq ($(FASTMATH),1) 34 | CXXFLAGS += -ffast-math 35 | SUFFIX := $(SUFFIX).fmath 36 | endif 37 | 38 | all: $(TARGET) 39 | 40 | %.o: %.cpp test.h 41 | ifeq ($(ASSEMBLE),1) 42 | if [ ! 
-d "./assembly" ]; then mkdir "./assembly"; fi 43 | $(CXX) $(CXXFLAGS) -c $< -o assembly/$(basename $<)$(SUFFIX).s 44 | else 45 | $(CXX) $(CXXFLAGS) -c $< 46 | endif 47 | 48 | $(TARGET): $(OBJS) 49 | ifneq ($(ASSEMBLE),1) 50 | $(CXX) $(CXXFLAGS) $(OBJS) -o $@ 51 | endif 52 | 53 | clean: 54 | rm -f *.o *.s $(TARGET) *~ 55 | 56 | cleanall: 57 | rm -rf *.o *.s $(TARGET) *~ assembly 58 | 59 | -------------------------------------------------------------------------------- /HW1/part2/fasttime.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2014 MIT License by 6.172 Staff 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to 6 | * deal in the Software without restriction, including without limitation the 7 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 | * sell copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 20 | * IN THE SOFTWARE. 
21 | **/ 22 | 23 | #ifndef INCLUDED_FASTTIME_DOT_H 24 | #define INCLUDED_FASTTIME_DOT_H 25 | 26 | #define _POSIX_C_SOURCE 200809L 27 | 28 | #include 29 | 30 | #ifdef __MACH__ 31 | #include // mach_absolute_time 32 | 33 | typedef uint64_t fasttime_t; 34 | 35 | 36 | // Return the current time. 37 | static inline fasttime_t gettime(void) { 38 | return mach_absolute_time(); 39 | } 40 | 41 | // Return the time different between the start and the end, as a float 42 | // in units of seconds. This function does not need to be fast. 43 | // Implementation notes: See 44 | // https://developer.apple.com/library/mac/qa/qa1398/_index.html 45 | static inline double tdiff(fasttime_t start, fasttime_t end) { 46 | static mach_timebase_info_data_t timebase; 47 | int r = mach_timebase_info(&timebase); 48 | assert(r == 0); 49 | fasttime_t elapsed = end-start; 50 | double ns = (double)elapsed * timebase.numer / timebase.denom; 51 | return ns*1e-9; 52 | } 53 | 54 | static inline unsigned int random_seed_from_clock(void) { 55 | fasttime_t now = gettime(); 56 | return (now & 0xFFFFFFFF) + (now>>32); 57 | } 58 | 59 | #else // LINUX 60 | 61 | // We need _POSIX_C_SOURCE to pick up 'struct timespec' and clock_gettime. 62 | // #define _POSIX_C_SOURCE 200809L 63 | 64 | #include 65 | 66 | typedef struct timespec fasttime_t; 67 | 68 | // Return the current time. 69 | static inline fasttime_t gettime(void) { 70 | struct timespec s; 71 | #ifdef NDEBUG 72 | clock_gettime(CLOCK_MONOTONIC, &s); 73 | #else 74 | int r = clock_gettime(CLOCK_MONOTONIC, &s); 75 | assert(r == 0); 76 | #endif 77 | return s; 78 | } 79 | 80 | // Return the time different between the start and the end, as a float 81 | // in units of seconds. This function does not need to be fast. 
82 | static inline double tdiff(fasttime_t start, fasttime_t end) { 83 | return end.tv_sec - start.tv_sec + 1e-9*(end.tv_nsec - start.tv_nsec); 84 | } 85 | 86 | static inline unsigned int random_seed_from_clock(void) { 87 | fasttime_t now = gettime(); 88 | return now.tv_sec + now.tv_nsec; 89 | } 90 | 91 | // Poison these symbols to help find portability problems. 92 | int clock_gettime(clockid_t, struct timespec *) __attribute__((deprecated)); 93 | time_t time(time_t *) __attribute__((deprecated)); 94 | 95 | #endif // LINUX 96 | 97 | #endif // INCLUDED_FASTTIME_DOT_H 98 | -------------------------------------------------------------------------------- /HW1/part2/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace std; 5 | 6 | void usage(const char* progname); 7 | void initValue(float* values1, float* values2, double* value3, float* output, unsigned int N); 8 | 9 | extern void test1(float* a, float* b, float* c, int N); 10 | extern void test2(float *__restrict a, float *__restrict b, float *__restrict c, int N); 11 | extern double test3(double* __restrict a, int N) ; 12 | 13 | int main(int argc, char * argv[]) { 14 | int N = 1024; 15 | int whichTestToRun = 1; 16 | 17 | // parse commandline options //////////////////////////////////////////// 18 | int opt; 19 | static struct option long_options[] = { 20 | {"size", 1, 0, 's'}, 21 | {"test", 1, 0, 't'}, 22 | {"help", 0, 0, '?'}, 23 | {0 ,0, 0, 0} 24 | }; 25 | 26 | while ((opt = getopt_long(argc, argv, "st:?", long_options, NULL)) != EOF) { 27 | 28 | switch (opt) { 29 | case 's': 30 | N = atoi(optarg); 31 | if (N <= 0) { 32 | cout << "Error: Workload size is set to" << N << " (<0).\n"; 33 | return -1; 34 | } 35 | break; 36 | case 't': 37 | whichTestToRun = atoi(optarg); 38 | if (whichTestToRun <= 0 || whichTestToRun >= 4) { 39 | cout << "Error: test" << whichTestToRun << "() is not available.\n"; 40 | return -1; 41 | } 42 | break; 
43 | case 'h': 44 | default: 45 | usage(argv[0]); 46 | return 1; 47 | } 48 | } 49 | 50 | float* values1 = new(std::align_val_t{ 32 }) float[N]; 51 | float* values2 = new(std::align_val_t{ 32 }) float[N]; 52 | double* values3 = new(std::align_val_t{ 32 }) double[N]; 53 | float* output = new(std::align_val_t{ 32 }) float[N]; 54 | initValue(values1, values2, values3, output, N); 55 | 56 | cout << "Running test" << whichTestToRun << "()...\n"; 57 | switch (whichTestToRun) { 58 | case 1: test1(values1, values2, output, N); break; 59 | case 2: test2(values1, values2, output, N); break; 60 | case 3: test3(values3, N); break; 61 | } 62 | 63 | delete [] values1; 64 | delete [] values2; 65 | delete [] values3; 66 | delete [] output; 67 | 68 | return 0; 69 | } 70 | 71 | void usage(const char* progname) { 72 | printf("Usage: %s [options]\n", progname); 73 | printf("Program Options:\n"); 74 | printf(" -s --size Use workload size N (Default = 1024)\n"); 75 | printf(" -t --test Just run the testN function (Default = 1)\n"); 76 | printf(" -h --help This message\n"); 77 | } 78 | 79 | void initValue(float* values1, float* values2, double* values3, float* output, unsigned int N) { 80 | for (unsigned int i=0; i(rand()) / RAND_MAX; 84 | values2[i] = -1.f + 4.f * static_cast(rand()) / RAND_MAX; 85 | values3[i] = -1.f + 4.f * static_cast(rand()) / RAND_MAX; 86 | output[i] = 0.f; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /HW1/part2/test.h: -------------------------------------------------------------------------------- 1 | // Run for multiple experiments to reduce measurement error on gettime(). 
2 | #define I 20000000 -------------------------------------------------------------------------------- /HW1/part2/test1.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "test.h" 3 | #include "fasttime.h" 4 | 5 | void test1(float* __restrict a, float* __restrict b, float* __restrict c, int N) { 6 | __builtin_assume(N == 1024); 7 | 8 | a = (float *)__builtin_assume_aligned(a, 32); 9 | b = (float *)__builtin_assume_aligned(b, 32); 10 | c = (float *)__builtin_assume_aligned(c, 32); 11 | 12 | fasttime_t time1 = gettime(); 13 | for (int i=0; i 2 | #include "test.h" 3 | #include "fasttime.h" 4 | 5 | void test2(float *__restrict a, float *__restrict b, float *__restrict c, int N) 6 | { 7 | __builtin_assume(N == 1024); 8 | a = (float *)__builtin_assume_aligned(a, 16); 9 | b = (float *)__builtin_assume_aligned(b, 16); 10 | 11 | fasttime_t time1 = gettime(); 12 | for (int i = 0; i < I; i++) 13 | { 14 | for (int j = 0; j < N; j++) 15 | { 16 | /* max() */ 17 | c[j] = a[j]; 18 | if (b[j] > a[j]) 19 | c[j] = b[j]; 20 | // if (b[j] > a[j]) c[j] = b[j]; 21 | // else c[j] = a[j]; 22 | } 23 | } 24 | 25 | fasttime_t time2 = gettime(); 26 | 27 | double elapsedf = tdiff(time1, time2); 28 | std::cout << "Elapsed execution time of the loop in test2():\n" 29 | << elapsedf << "sec (N: " << N << ", I: " << I << ")\n"; 30 | } 31 | -------------------------------------------------------------------------------- /HW1/part2/test2.cpp.patch: -------------------------------------------------------------------------------- 1 | --- test2.cpp 2 | +++ test2.cpp 3 | @@ -14,9 +14,8 @@ 4 | for (int j = 0; j < N; j++) 5 | { 6 | /* max() */ 7 | - c[j] = a[j]; 8 | - if (b[j] > a[j]) 9 | - c[j] = b[j]; 10 | + if (b[j] > a[j]) c[j] = b[j]; 11 | + else c[j] = a[j]; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /HW1/part2/test3.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include "test.h" 3 | #include "fasttime.h" 4 | 5 | double test3(double* __restrict a, int N) { 6 | __builtin_assume(N == 1024); 7 | a = (double *)__builtin_assume_aligned(a, 16); 8 | 9 | double b = 0; 10 | 11 | fasttime_t time1 = gettime(); 12 | for (int i=0; i report.txt 27 | cat /proc/cpuinfo | grep MHz >> report.txt 28 | bash -c "{ time (./pi.out 3 100000000; ./pi.out 4 100000000) >>report.txt ; } 2>>report.txt" -------------------------------------------------------------------------------- /HW2/part1/pi.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "shishua-avx2.h" 7 | 8 | #define U32_MAX 0xffffffff 9 | 10 | typedef long int v4di __attribute__ ((vector_size (32))); 11 | typedef union { 12 | v4di v; 13 | long int e[4]; 14 | } ve4di; 15 | 16 | typedef double v4df __attribute__ ((vector_size (32))); 17 | typedef union { 18 | v4df v; 19 | double e[4]; 20 | } ve4df; 21 | 22 | typedef unsigned long u64; 23 | typedef unsigned int u32; 24 | typedef long long int s64; 25 | 26 | s64 hit; 27 | pthread_mutex_t hit_mutex = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; 28 | 29 | static inline void rand_init(prng_state *prng) 30 | { 31 | u64 seed[4]; 32 | ssize_t ret; 33 | 34 | ret = getrandom(seed, sizeof(seed), 0); 35 | 36 | if (ret < 0) { 37 | fprintf(stderr, "getrandom failed\n"); 38 | exit(0); 39 | } 40 | 41 | prng_init(prng, seed); 42 | } 43 | 44 | double get_rand(prng_state *prng, double min, double max) 45 | { 46 | uint8_t buf[0x80] __attribute__ ((aligned (64))); 47 | 48 | prng_gen(prng, buf, sizeof(buf)); 49 | 50 | return min + ((double)(*((u32 *)buf)) / U32_MAX) * (max - min); 51 | } 52 | 53 | void add_hit(int value) 54 | { 55 | pthread_mutex_lock(&hit_mutex); 56 | 57 | hit += value; 58 | 59 | pthread_mutex_unlock(&hit_mutex); 60 | } 61 | 62 | s64 
_tf_estimate_pi_4(s64 toss_cnt, prng_state *prng)
{
    s64 _hit = 0;
    /* Vector constants: 1.0 doubles for the unit-circle test, and integer
     * ones used to turn a comparison mask into a per-lane 0/1 increment. */
    ve4df fone = {1, 1, 1, 1};
    ve4di ione = {1, 1, 1, 1};
    ve4di result = {0, 0, 0, 0};

    /* toss_cnt is assumed to be a multiple of 4; the caller
     * (tf_estimate_pi) strips the remainder before calling. */
    for (s64 i = 0; i < toss_cnt; i+=4) {
        ve4df distance;
        ve4di cmp;

        ve4df x = { get_rand(prng, -1, 1),
                    get_rand(prng, -1, 1),
                    get_rand(prng, -1, 1),
                    get_rand(prng, -1, 1) };
        ve4df y = { get_rand(prng, -1, 1),
                    get_rand(prng, -1, 1),
                    get_rand(prng, -1, 1),
                    get_rand(prng, -1, 1) };
        distance.v = x.v * x.v + y.v * y.v;

        /* The vector compare yields an all-ones lane mask where the toss
         * landed inside the unit circle; ANDing with ione leaves exactly
         * 0 or 1 per lane, accumulated into four independent counters. */
        cmp.v = distance.v <= fone.v;
        result.v = result.v + (ione.v & cmp.v);

        // printf("x: %f, %f, %f, %f\n", x.e[0], x.e[1], x.e[2], x.e[3]);
        // printf("y: %f, %f, %f, %f\n", y.e[0], y.e[1], y.e[2], y.e[3]);
        // printf("d: %f, %f, %f, %f\n", distance.e[0], distance.e[1], distance.e[2], distance.e[3]);
        // printf("c: %ld, %ld, %ld, %ld\n", cmp.e[0], cmp.e[1], cmp.e[2], cmp.e[3]);
        // printf("r: %ld, %ld, %ld, %ld\n", result.e[0], result.e[1], result.e[2], result.e[3]);
    }

    /* Fold the four per-lane counters into the scalar hit count. */
    for (int i = 0; i < 4; ++i) {
        _hit += result.e[i];
    }

    // printf("_hit: %lld\n", _hit);

    return _hit;
}

/* Thread entry point: performs the toss count passed (by value) through
 * the void* argument using a per-thread PRNG, then adds the hits to the
 * shared, mutex-protected global total. */
void *tf_estimate_pi(void *_toss_cnt)
{
    s64 toss_cnt = (s64)_toss_cnt;
    s64 _hit = 0;
    s64 remain;
    prng_state prng;

    rand_init(&prng);

    /* Bulk of the tosses go through the 4-wide vector path... */
    remain = toss_cnt % 4;

    _hit += _tf_estimate_pi_4(toss_cnt - remain, &prng);

    /* ...and the leftover 0-3 tosses are done scalar. */
    for (s64 i = 0; i < remain; i++) {
        double x = get_rand(&prng, -1, 1);
        double y = get_rand(&prng, -1, 1);
        double distance = x * x + y * y;
        if (distance <= 1)
            _hit++;
    }

    add_hit(_hit);

    return NULL;
}

/*
 * pi.out takes two command-line arguments, which indicate the number of
 * threads and the number of tosses, respectively.
The value of the first
 * and second arguments will not exceed the range of int and long long int,
 * respectively.
 */
int main(int argc, char **argv)
{
    int thread_cnt;
    s64 toss_cnt, remain, loading;
    double pi;
    pthread_t *tid;
    int i;

    if (argc != 3) {
        fprintf(stderr, "Usage: %s thread_num tosses_num\n", argv[0]);
        return 0;
    }

    thread_cnt = atoi(argv[1]);
    toss_cnt = atoll(argv[2]);

    /* Guard against a non-positive thread count: thread_cnt == 0 would
     * divide by zero below and tid[thread_cnt - 1] would be out of bounds. */
    if (thread_cnt < 1 || toss_cnt < 0) {
        fprintf(stderr, "thread_num must be >= 1 and tosses_num must be >= 0\n");
        return 1;
    }

    tid = (pthread_t *)malloc(sizeof(pthread_t) * thread_cnt);
    if (tid == NULL) {
        /* Check the allocation before writing through it. */
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    remain = toss_cnt;
    loading = toss_cnt / thread_cnt;

    /* First thread_cnt-1 workers each get an equal share; the last worker
     * gets whatever remains so the total is exact. */
    for (i = 0; i < thread_cnt - 1; i++) {
        pthread_create(&tid[i], NULL, tf_estimate_pi, (void *)loading);
        remain -= loading;
    }
    pthread_create(&tid[i], NULL, tf_estimate_pi, (void *)remain);

    for (int i = 0; i < thread_cnt; i++) {
        pthread_join(tid[i], NULL);
    }

    /* pi ~= 4 * hits / tosses */
    pi = hit * (4 / ((double)toss_cnt));

    printf("%f\n", pi);

    free(tid);

    return 0;
}
-------------------------------------------------------------------------------- /HW2/part1/report.txt: --------------------------------------------------------------------------------
Model name: AMD Ryzen 7 PRO 4750U with Radeon Graphics
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero wbnoinvd arat umip rdpid overflow_recov succor
3.141600
| 3.141837 9 | 10 | real 0m0.660s 11 | user 0m2.107s 12 | sys 0m0.032s 13 | -------------------------------------------------------------------------------- /HW2/part1/shishua-avx2.h: -------------------------------------------------------------------------------- 1 | // Reference: https://github.com/espadrine/shishua 2 | #ifndef SHISHUA_AVX2_H 3 | #define SHISHUA_AVX2_H 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | typedef struct prng_state { 10 | __m256i state[4]; 11 | __m256i output[4]; 12 | __m256i counter; 13 | } prng_state; 14 | 15 | // buf's size must be a multiple of 128 bytes. 16 | static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { 17 | __m256i o0 = s->output[0], o1 = s->output[1], o2 = s->output[2], o3 = s->output[3], 18 | s0 = s->state[0], s1 = s->state[1], s2 = s->state[2], s3 = s->state[3], 19 | t0, t1, t2, t3, u0, u1, u2, u3, counter = s->counter; 20 | // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit 21 | // additions to strong positions for enrichment. The low 32-bit part of a 22 | // 64-bit chunk never moves to the same 64-bit chunk as its high part. 23 | // They do not remain in the same chunk. Each part eventually reaches all 24 | // positions ringwise: A to B, B to C, …, H to A. 25 | // You may notice that they are simply 256-bit rotations (96 and 160). 26 | __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), 27 | shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); 28 | // The counter is not necessary to beat PractRand. 29 | // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, 30 | // or about 7 millenia at 10 GiB/s. 31 | // The increments are picked as odd numbers, 32 | // since only coprimes of the base cover the full cycle, 33 | // and all odd numbers are coprime of 2. 34 | // I use different odd numbers for each 64-bit chunk 35 | // for a tiny amount of variation stirring. 36 | // I used the smallest odd numbers to avoid having a magic number. 
37 | __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); 38 | 39 | // TODO: consider adding proper uneven write handling 40 | assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); 41 | 42 | for (size_t i = 0; i < size; i += 128) { 43 | if (buf != NULL) { 44 | _mm256_storeu_si256((__m256i*)&buf[i + 0], o0); 45 | _mm256_storeu_si256((__m256i*)&buf[i + 32], o1); 46 | _mm256_storeu_si256((__m256i*)&buf[i + 64], o2); 47 | _mm256_storeu_si256((__m256i*)&buf[i + 96], o3); 48 | } 49 | 50 | // I apply the counter to s1, 51 | // since it is the one whose shift loses most entropy. 52 | s1 = _mm256_add_epi64(s1, counter); 53 | s3 = _mm256_add_epi64(s3, counter); 54 | counter = _mm256_add_epi64(counter, increment); 55 | 56 | // SIMD does not support rotations. Shift is the next best thing to entangle 57 | // bits with other 64-bit positions. We must shift by an odd number so that 58 | // each bit reaches all 64-bit positions, not just half. We must lose bits 59 | // of information, so we minimize it: 1 and 3. We use different shift values 60 | // to increase divergence between the two sides. We use rightward shift 61 | // because the rightmost bits have the least diffusion in addition (the low 62 | // bit is just a XOR of the low bits). 63 | u0 = _mm256_srli_epi64(s0, 1); u1 = _mm256_srli_epi64(s1, 3); 64 | u2 = _mm256_srli_epi64(s2, 1); u3 = _mm256_srli_epi64(s3, 3); 65 | t0 = _mm256_permutevar8x32_epi32(s0, shu0); t1 = _mm256_permutevar8x32_epi32(s1, shu1); 66 | t2 = _mm256_permutevar8x32_epi32(s2, shu0); t3 = _mm256_permutevar8x32_epi32(s3, shu1); 67 | // Addition is the main source of diffusion. 68 | // Storing the output in the state keeps that diffusion permanently. 69 | s0 = _mm256_add_epi64(t0, u0); s1 = _mm256_add_epi64(t1, u1); 70 | s2 = _mm256_add_epi64(t2, u2); s3 = _mm256_add_epi64(t3, u3); 71 | 72 | // Two orthogonally grown pieces evolving independently, XORed. 
73 | o0 = _mm256_xor_si256(u0, t1); 74 | o1 = _mm256_xor_si256(u2, t3); 75 | o2 = _mm256_xor_si256(s0, s3); 76 | o3 = _mm256_xor_si256(s2, s1); 77 | } 78 | s->output[0] = o0; s->output[1] = o1; s->output[2] = o2; s->output[3] = o3; 79 | s->state [0] = s0; s->state [1] = s1; s->state [2] = s2; s->state [3] = s3; 80 | s->counter = counter; 81 | } 82 | 83 | // Nothing up my sleeve: those are the hex digits of Φ, 84 | // the least approximable irrational number. 85 | // $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc 86 | static uint64_t phi[16] = { 87 | 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, 88 | 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, 89 | 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, 90 | 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, 91 | }; 92 | 93 | void prng_init(prng_state *s, uint64_t seed[4]) { 94 | memset(s, 0, sizeof(prng_state)); 95 | # define STEPS 1 96 | # define ROUNDS 13 97 | uint8_t buf[128 * STEPS]; 98 | // Diffuse first two seed elements in s0, then the last two. Same for s1. 99 | // We must keep half of the state unchanged so users cannot set a bad state. 
100 | s->state[0] = _mm256_set_epi64x(phi[ 3], phi[ 2] ^ seed[1], phi[ 1], phi[ 0] ^ seed[0]); 101 | s->state[1] = _mm256_set_epi64x(phi[ 7], phi[ 6] ^ seed[3], phi[ 5], phi[ 4] ^ seed[2]); 102 | s->state[2] = _mm256_set_epi64x(phi[11], phi[10] ^ seed[3], phi[ 9], phi[ 8] ^ seed[2]); 103 | s->state[3] = _mm256_set_epi64x(phi[15], phi[14] ^ seed[1], phi[13], phi[12] ^ seed[0]); 104 | for (size_t i = 0; i < ROUNDS; i++) { 105 | prng_gen(s, buf, 128 * STEPS); 106 | s->state[0] = s->output[3]; s->state[1] = s->output[2]; 107 | s->state[2] = s->output[1]; s->state[3] = s->output[0]; 108 | } 109 | # undef STEPS 110 | # undef ROUNDS 111 | } 112 | #endif -------------------------------------------------------------------------------- /HW2/part2/Makefile: -------------------------------------------------------------------------------- 1 | 2 | CXX=g++ -m64 3 | CXXFLAGS=-I./common -Iobjs/ -O3 -std=c++17 -Wall 4 | 5 | APP_NAME=mandelbrot 6 | OBJDIR=objs 7 | COMMONDIR=./common 8 | 9 | PPM_CXX=$(COMMONDIR)/ppm.cpp 10 | PPM_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(PPM_CXX:.cpp=.o))) 11 | 12 | 13 | default: $(APP_NAME) 14 | 15 | .PHONY: dirs clean 16 | 17 | dirs: 18 | /bin/mkdir -p $(OBJDIR)/ 19 | 20 | clean: 21 | /bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME) 22 | 23 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/mandelbrotSerial.o $(OBJDIR)/mandelbrotThread.o $(PPM_OBJ) 24 | 25 | $(APP_NAME): dirs $(OBJS) 26 | $(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm -lpthread 27 | 28 | $(OBJDIR)/%.o: %.cpp 29 | $(CXX) $< $(CXXFLAGS) -c -o $@ 30 | 31 | $(OBJDIR)/%.o: $(COMMONDIR)/%.cpp 32 | $(CXX) $< $(CXXFLAGS) -c -o $@ 33 | 34 | $(OBJDIR)/main.o: $(COMMONDIR)/CycleTimer.h 35 | 36 | .PHONY: report 37 | report: clean $(APP_NAME) 38 | lscpu | grep -E "name|Flags" > report.txt 39 | cat /proc/cpuinfo | grep MHz >> report.txt 40 | bash -c "./mandelbrot -t 3 >> report.txt ; ./mandelbrot -t 4 >> report.txt" -------------------------------------------------------------------------------- 
/HW2/part2/common/CycleTimer.h: -------------------------------------------------------------------------------- 1 | #ifndef _SYRAH_CYCLE_TIMER_H_ 2 | #define _SYRAH_CYCLE_TIMER_H_ 3 | 4 | #if defined(__APPLE__) 5 | #if defined(__x86_64__) 6 | #include 7 | #else 8 | #include 9 | #include 10 | #endif // __x86_64__ or not 11 | 12 | #include // fprintf 13 | #include // exit 14 | 15 | #elif _WIN32 16 | #include 17 | #include 18 | #else 19 | #include 20 | #include 21 | #include 22 | #include 23 | #endif 24 | 25 | // This uses the cycle counter of the processor. Different 26 | // processors in the system will have different values for this. If 27 | // you process moves across processors, then the delta time you 28 | // measure will likely be incorrect. This is mostly for fine 29 | // grained measurements where the process is likely to be on the 30 | // same processor. For more global things you should use the 31 | // Time interface. 32 | 33 | // Also note that if you processors' speeds change (i.e. processors 34 | // scaling) or if you are in a heterogenous environment, you will 35 | // likely get spurious results. 36 | class CycleTimer 37 | { 38 | public: 39 | typedef unsigned long long SysClock; 40 | 41 | ////////// 42 | // Return the current CPU time, in terms of clock ticks. 43 | // Time zero is at some arbitrary point in the past. 44 | static SysClock currentTicks() 45 | { 46 | #if defined(__APPLE__) && !defined(__x86_64__) 47 | return mach_absolute_time(); 48 | #elif defined(_WIN32) 49 | LARGE_INTEGER qwTime; 50 | QueryPerformanceCounter(&qwTime); 51 | return qwTime.QuadPart; 52 | #elif defined(__x86_64__) 53 | unsigned int a, d; 54 | asm volatile("rdtsc" 55 | : "=a"(a), "=d"(d)); 56 | return static_cast(a) | 57 | (static_cast(d) << 32); 58 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser. 
59 | unsigned int val; 60 | asm volatile("mrc p15, 0, %0, c9, c13, 0" 61 | : "=r"(val)); 62 | return val; 63 | #else 64 | timespec spec; 65 | clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec); 66 | return CycleTimer::SysClock(static_cast(spec.tv_sec) * 1e9 + static_cast(spec.tv_nsec)); 67 | #endif 68 | } 69 | 70 | ////////// 71 | // Return the current CPU time, in terms of seconds. 72 | // This is slower than currentTicks(). Time zero is at 73 | // some arbitrary point in the past. 74 | static double currentSeconds() 75 | { 76 | return currentTicks() * secondsPerTick(); 77 | } 78 | 79 | ////////// 80 | // Return the conversion from seconds to ticks. 81 | static double ticksPerSecond() 82 | { 83 | return 1.0 / secondsPerTick(); 84 | } 85 | 86 | static const char *tickUnits() 87 | { 88 | #if defined(__APPLE__) && !defined(__x86_64__) 89 | return "ns"; 90 | #elif defined(__WIN32__) || defined(__x86_64__) 91 | return "cycles"; 92 | #else 93 | return "ns"; // clock_gettime 94 | #endif 95 | } 96 | 97 | ////////// 98 | // Return the conversion from ticks to seconds. 
99 | static double secondsPerTick() 100 | { 101 | static bool initialized = false; 102 | static double secondsPerTick_val; 103 | if (initialized) 104 | return secondsPerTick_val; 105 | #if defined(__APPLE__) 106 | #ifdef __x86_64__ 107 | int args[] = {CTL_HW, HW_CPU_FREQ}; 108 | unsigned int Hz; 109 | size_t len = sizeof(Hz); 110 | if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) 111 | { 112 | fprintf(stderr, "Failed to initialize secondsPerTick_val!\n"); 113 | exit(-1); 114 | } 115 | secondsPerTick_val = 1.0 / (double)Hz; 116 | #else 117 | mach_timebase_info_data_t time_info; 118 | mach_timebase_info(&time_info); 119 | 120 | // Scales to nanoseconds without 1e-9f 121 | secondsPerTick_val = (1e-9 * static_cast(time_info.numer)) / 122 | static_cast(time_info.denom); 123 | #endif // x86_64 or not 124 | #elif defined(_WIN32) 125 | LARGE_INTEGER qwTicksPerSec; 126 | QueryPerformanceFrequency(&qwTicksPerSec); 127 | secondsPerTick_val = 1.0 / static_cast(qwTicksPerSec.QuadPart); 128 | #else 129 | FILE *fp = fopen("/proc/cpuinfo", "r"); 130 | char input[1024]; 131 | if (!fp) 132 | { 133 | fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo."); 134 | exit(-1); 135 | } 136 | // In case we don't find it, e.g. 
on the N900 137 | secondsPerTick_val = 1e-9; 138 | while (!feof(fp) && fgets(input, 1024, fp)) 139 | { 140 | // NOTE(boulos): Because reading cpuinfo depends on dynamic 141 | // frequency scaling it's better to read the @ sign first 142 | float GHz, MHz; 143 | if (strstr(input, "model name")) 144 | { 145 | char *at_sign = strstr(input, "@"); 146 | if (at_sign) 147 | { 148 | char *after_at = at_sign + 1; 149 | char *GHz_str = strstr(after_at, "GHz"); 150 | char *MHz_str = strstr(after_at, "MHz"); 151 | if (GHz_str) 152 | { 153 | *GHz_str = '\0'; 154 | if (1 == sscanf(after_at, "%f", &GHz)) 155 | { 156 | //printf("GHz = %f\n", GHz); 157 | secondsPerTick_val = 1e-9f / GHz; 158 | break; 159 | } 160 | } 161 | else if (MHz_str) 162 | { 163 | *MHz_str = '\0'; 164 | if (1 == sscanf(after_at, "%f", &MHz)) 165 | { 166 | //printf("MHz = %f\n", MHz); 167 | secondsPerTick_val = 1e-6f / GHz; 168 | break; 169 | } 170 | } 171 | } 172 | } 173 | else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) 174 | { 175 | //printf("MHz = %f\n", MHz); 176 | secondsPerTick_val = 1e-6f / MHz; 177 | break; 178 | } 179 | } 180 | fclose(fp); 181 | #endif 182 | 183 | initialized = true; 184 | return secondsPerTick_val; 185 | } 186 | 187 | ////////// 188 | // Return the conversion from ticks to milliseconds. 
189 | static double msPerTick() 190 | { 191 | return secondsPerTick() * 1000.0; 192 | } 193 | 194 | private: 195 | CycleTimer(); 196 | }; 197 | 198 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_ 199 | -------------------------------------------------------------------------------- /HW2/part2/common/ppm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | void writePPMImage(int *data, int width, int height, const char *filename, int maxIterations) 7 | { 8 | FILE *fp = fopen(filename, "wb"); 9 | 10 | // write ppm header 11 | fprintf(fp, "P6\n"); 12 | fprintf(fp, "%d %d\n", width, height); 13 | fprintf(fp, "255\n"); 14 | 15 | for (int i = 0; i < width * height; ++i) 16 | { 17 | 18 | // Clamp iteration count for this pixel, then scale the value 19 | // to 0-1 range. Raise resulting value to a power (<1) to 20 | // increase brightness of low iteration count 21 | // pixels. a.k.a. Make things look cooler. 22 | 23 | float mapped = pow(std::min(static_cast(maxIterations), 24 | static_cast(data[i])) / 25 | 256.f, 26 | .5f); 27 | 28 | // convert back into 0-255 range, 8-bit channels 29 | unsigned char result = static_cast(255.f * mapped); 30 | for (int j = 0; j < 3; ++j) 31 | fputc(result, fp); 32 | } 33 | fclose(fp); 34 | printf("Wrote image file %s\n", filename); 35 | } 36 | -------------------------------------------------------------------------------- /HW2/part2/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "CycleTimer.h" 6 | 7 | extern void mandelbrotSerial( 8 | float x0, float y0, float x1, float y1, 9 | int width, int height, 10 | int startRow, int numRows, 11 | int maxIterations, 12 | int output[]); 13 | 14 | extern void mandelbrotThread( 15 | int numThreads, 16 | float x0, float y0, float x1, float y1, 17 | int width, int height, 18 | int maxIterations, 19 | int output[]); 20 | 
21 | extern void writePPMImage( 22 | int* data, 23 | int width, int height, 24 | const char *filename, 25 | int maxIterations); 26 | 27 | void 28 | scaleAndShift(float& x0, float& x1, float& y0, float& y1, 29 | float scale, 30 | float shiftX, float shiftY) 31 | { 32 | 33 | x0 *= scale; 34 | x1 *= scale; 35 | y0 *= scale; 36 | y1 *= scale; 37 | x0 += shiftX; 38 | x1 += shiftX; 39 | y0 += shiftY; 40 | y1 += shiftY; 41 | 42 | } 43 | 44 | void usage(const char* progname) { 45 | printf("Usage: %s [options]\n", progname); 46 | printf("Program Options:\n"); 47 | printf(" -t --threads Use N threads\n"); 48 | printf(" -v --view Use specified view settings\n"); 49 | printf(" -? --help This message\n"); 50 | } 51 | 52 | bool verifyResult (int *gold, int *result, int width, int height) { 53 | 54 | int i, j; 55 | 56 | for (i = 0; i < height; i++) { 57 | for (j = 0; j < width; j++) { 58 | if (gold[i * width + j] != result[i * width + j]) { 59 | printf ("Mismatch : [%d][%d], Expected : %d, Actual : %d\n", 60 | i, j, gold[i * width + j], result[i * width + j]); 61 | return 0; 62 | } 63 | } 64 | } 65 | 66 | return 1; 67 | } 68 | 69 | int main(int argc, char** argv) { 70 | 71 | const unsigned int width = 1600; 72 | const unsigned int height = 1200; 73 | const int maxIterations = 256; 74 | int numThreads = 2; 75 | 76 | float x0 = -2; 77 | float x1 = 1; 78 | float y0 = -1; 79 | float y1 = 1; 80 | 81 | // parse commandline options //////////////////////////////////////////// 82 | int opt; 83 | static struct option long_options[] = { 84 | {"threads", 1, 0, 't'}, 85 | {"view", 1, 0, 'v'}, 86 | {"help", 0, 0, '?'}, 87 | {0 ,0, 0, 0} 88 | }; 89 | 90 | while ((opt = getopt_long(argc, argv, "t:v:?", long_options, NULL)) != EOF) { 91 | 92 | switch (opt) { 93 | case 't': 94 | { 95 | numThreads = atoi(optarg); 96 | break; 97 | } 98 | case 'v': 99 | { 100 | int viewIndex = atoi(optarg); 101 | // change view settings 102 | if (viewIndex == 2) { 103 | float scaleValue = .015f; 104 | float shiftX = 
-.986f; 105 | float shiftY = .30f; 106 | scaleAndShift(x0, x1, y0, y1, scaleValue, shiftX, shiftY); 107 | } else if (viewIndex > 1) { 108 | fprintf(stderr, "Invalid view index\n"); 109 | return 1; 110 | } 111 | break; 112 | } 113 | case '?': 114 | default: 115 | usage(argv[0]); 116 | return 1; 117 | } 118 | } 119 | // end parsing of commandline options 120 | 121 | 122 | int* output_serial = new int[width*height]; 123 | int* output_thread = new int[width*height]; 124 | 125 | // 126 | // Run the serial implementation. Run the code three times and 127 | // take the minimum to get a good estimate. 128 | // 129 | 130 | double minSerial = 1e30; 131 | for (int i = 0; i < 5; ++i) { 132 | memset(output_serial, 0, width * height * sizeof(int)); 133 | double startTime = CycleTimer::currentSeconds(); 134 | mandelbrotSerial(x0, y0, x1, y1, width, height, 0, height, maxIterations, output_serial); 135 | double endTime = CycleTimer::currentSeconds(); 136 | minSerial = std::min(minSerial, endTime - startTime); 137 | } 138 | 139 | printf("[mandelbrot serial]:\t\t[%.3f] ms\n", minSerial * 1000); 140 | writePPMImage(output_serial, width, height, "mandelbrot-serial.ppm", maxIterations); 141 | 142 | // 143 | // Run the threaded version 144 | // 145 | 146 | double minThread = 1e30; 147 | for (int i = 0; i < 5; ++i) { 148 | memset(output_thread, 0, width * height * sizeof(int)); 149 | double startTime = CycleTimer::currentSeconds(); 150 | mandelbrotThread(numThreads, x0, y0, x1, y1, width, height, maxIterations, output_thread); 151 | double endTime = CycleTimer::currentSeconds(); 152 | minThread = std::min(minThread, endTime - startTime); 153 | } 154 | 155 | printf("[mandelbrot thread]:\t\t[%.3f] ms\n", minThread * 1000); 156 | writePPMImage(output_thread, width, height, "mandelbrot-thread.ppm", maxIterations); 157 | 158 | if (! 
verifyResult (output_serial, output_thread, width, height)) { 159 | printf ("Error : Output from threads does not match serial output\n"); 160 | 161 | delete[] output_serial; 162 | delete[] output_thread; 163 | 164 | return 1; 165 | } 166 | 167 | // compute speedup 168 | printf("\t\t\t\t(%.2fx speedup from %d threads)\n", minSerial/minThread, numThreads); 169 | 170 | delete[] output_serial; 171 | delete[] output_thread; 172 | 173 | return 0; 174 | } 175 | -------------------------------------------------------------------------------- /HW2/part2/mandelbrotSerial.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Note: This code was modified from example code 4 | originally provided by Intel. To comply with Intel's open source 5 | licensing agreement, their copyright is retained below. 6 | 7 | ----------------------------------------------------------------- 8 | 9 | Copyright (c) 2010-2011, Intel Corporation 10 | All rights reserved. 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted provided that the following conditions are 14 | met: 15 | 16 | * Redistributions of source code must retain the above copyright 17 | notice, this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of Intel Corporation nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 28 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 29 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 30 | PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// Escape-time iteration for one pixel: iterate z = z^2 + c starting from
// z = c, and return the iteration index at which |z|^2 first exceeds 4
// (the point escapes), or `count` if the budget is exhausted without
// escaping (the point is taken to be in the set).
static inline int mandel(float c_re, float c_im, int count)
{
    float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i)
    {
        // |z|^2 > 4 is equivalent to |z| > 2, the standard escape radius.
        if (z_re * z_re + z_im * z_im > 4.f)
            break;

        // z = z^2 + c, expanded into real/imaginary parts.
        float new_re = z_re * z_re - z_im * z_im;
        float new_im = 2.f * z_re * z_im;
        z_re = c_re + new_re;
        z_im = c_im + new_im;
    }

    return i;
}

//
// MandelbrotSerial --
//
// Compute an image visualizing the mandelbrot set. The resulting
// array contains the number of iterations required before the complex
// number corresponding to a pixel could be rejected from the set.
//
// * x0, y0, x1, y1 describe the complex coordinates mapping
// into the image viewport.
68 | // * width, height describe the size of the output image 69 | // * startRow, totalRows describe how much of the image to compute 70 | void mandelbrotSerial( 71 | float x0, float y0, float x1, float y1, 72 | int width, int height, 73 | int startRow, int totalRows, 74 | int maxIterations, 75 | int output[]) 76 | { 77 | float dx = (x1 - x0) / width; 78 | float dy = (y1 - y0) / height; 79 | 80 | int endRow = startRow + totalRows; 81 | 82 | for (int j = startRow; j < endRow; j++) 83 | { 84 | for (int i = 0; i < width; ++i) 85 | { 86 | float x = x0 + i * dx; 87 | float y = y0 + j * dy; 88 | 89 | int index = (j * width + i); 90 | output[index] = mandel(x, y, maxIterations); 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /HW2/part2/report.txt: -------------------------------------------------------------------------------- 1 | Model name: AMD Ryzen 7 PRO 4750U with Radeon Graphics 2 | Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero wbnoinvd arat umip rdpid overflow_recov succor 3 | cpu MHz : 1696.813 4 | cpu MHz : 1696.813 5 | cpu MHz : 1696.813 6 | cpu MHz : 1696.813 7 | [mandelbrot serial]: [543.537] ms 8 | Wrote image file mandelbrot-serial.ppm 9 | Finish Time Report: 10 | Thread 0: 0.054451 11 | Thread 1: 0.058942 12 | Thread 2: 0.059356 13 | Finish Time Report: 14 | Thread 0: 0.050994 15 | Thread 1: 0.056074 16 | Thread 2: 0.055579 17 | Finish Time Report: 18 | Thread 0: 0.055361 19 | Thread 1: 0.052566 20 | 
Thread 2: 0.054997 21 | Finish Time Report: 22 | Thread 0: 0.054042 23 | Thread 1: 0.052141 24 | Thread 2: 0.055408 25 | Finish Time Report: 26 | Thread 0: 0.053885 27 | Thread 1: 0.050878 28 | Thread 2: 0.055124 29 | [mandelbrot thread]: [55.478] ms 30 | Wrote image file mandelbrot-thread.ppm 31 | (9.80x speedup from 3 threads) 32 | [mandelbrot serial]: [556.734] ms 33 | Wrote image file mandelbrot-serial.ppm 34 | Finish Time Report: 35 | Thread 0: 0.044371 36 | Thread 1: 0.046961 37 | Thread 2: 0.043488 38 | Thread 3: 0.044690 39 | Finish Time Report: 40 | Thread 0: 0.042621 41 | Thread 1: 0.047911 42 | Thread 2: 0.043151 43 | Thread 3: 0.042625 44 | Finish Time Report: 45 | Thread 0: 0.040758 46 | Thread 1: 0.059524 47 | Thread 2: 0.046138 48 | Thread 3: 0.041580 49 | Finish Time Report: 50 | Thread 0: 0.042828 51 | Thread 1: 0.064692 52 | Thread 2: 0.043270 53 | Thread 3: 0.040743 54 | Finish Time Report: 55 | Thread 0: 0.038417 56 | Thread 1: 0.043711 57 | Thread 2: 0.046472 58 | Thread 3: 0.038204 59 | [mandelbrot thread]: [47.160] ms 60 | Wrote image file mandelbrot-thread.ppm 61 | (11.81x speedup from 4 threads) 62 | -------------------------------------------------------------------------------- /HW2/submit/part1/Makefile: -------------------------------------------------------------------------------- 1 | ../../part1/Makefile -------------------------------------------------------------------------------- /HW2/submit/part1/pi.c: -------------------------------------------------------------------------------- 1 | ../../part1/pi.c -------------------------------------------------------------------------------- /HW2/submit/part1/shishua-avx2.h: -------------------------------------------------------------------------------- 1 | ../../part1/shishua-avx2.h -------------------------------------------------------------------------------- /HW2/submit/part2/mandelbrotThread.cpp: -------------------------------------------------------------------------------- 1 | 
../../part2/mandelbrotThread.cpp -------------------------------------------------------------------------------- /HW2/submit/url.txt: -------------------------------------------------------------------------------- 1 | https://hackmd.io/@LJP/SyBbA0_rs -------------------------------------------------------------------------------- /HW3/part1/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=cg 3 | BENCHMARKU=CG 4 | PROGRAMNAME=cg 5 | DATASIZE=MEDIUMN 6 | 7 | default: ${PROGRAMNAME} grade 8 | 9 | include make.common 10 | 11 | OBJS = cg_impl.o \ 12 | ${COMMON}/${RAND}.o \ 13 | ${COMMON}/c_timers.o \ 14 | ${COMMON}/wtime.o 15 | 16 | ${PROGRAMNAME}: config ${PROGRAMNAME}.o ${OBJS} 17 | ${CLINK} ${CLINKFLAGS} -Wl,--allow-multiple-definition -o ${PROGRAMNAME} ${PROGRAMNAME}.o ${OBJS} ${C_LIB} 18 | 19 | grade: config grade.o ${OBJS} 20 | ${CLINK} ${CLINKFLAGS} -Wl,--allow-multiple-definition -o cg_grader grade.o ${OBJS} ref_cg.a def_cg.a ${C_LIB} 21 | 22 | .c.o: 23 | ${CCOMPILE} $< -D${DATASIZE} 24 | 25 | cg.o: cg.c globals.h 26 | cg_impl.o: cg_impl.c globals.h 27 | 28 | clean: 29 | - rm -f *.o *~ 30 | rm -f ${COMMON}/*.o 31 | rm -f ${PROGRAMNAME} cg_grader 32 | rm -f gmon.out 33 | 34 | profiling: CFLAGS += -pg -Wall 35 | profiling: CLINKFLAGS += -pg -Wall 36 | profiling: clean ${PROGRAMNAME} 37 | ./${PROGRAMNAME} 38 | gprof ${PROGRAMNAME} gmon.out -b > profiling_result 39 | sudo perf record -e cpu-cycles ./${PROGRAMNAME} 40 | # sudo perf report -F+period,srcline 41 | 42 | report: clean ${PROGRAMNAME} 43 | lscpu | grep -E "name|Flags" > report.txt 44 | cat /proc/cpuinfo | grep MHz >> report.txt 45 | ./cg_grader >> report.txt -------------------------------------------------------------------------------- /HW3/part1/README: -------------------------------------------------------------------------------- 1 | Files: 2 | cg.c : main function. 
3 | cg_impl.c: the implementation of conjugate gradient method. 4 | globals.h : some data definitions. 5 | common : functions for verification and time calculation. 6 | bin : executable output directory. 7 | Makefile, make.common : make systems. 8 | 9 | Build up: 10 | make DATASIZE=[LARGE|MEDIUMN|SMALL] 11 | (MEDIUMN by default) 12 | Please make clean first if you want to change DATASIZE. 13 | 14 | Check correctness: 15 | Main function contains the verification procedure. It shows VERIFICATION SUCCESSFUL/FAILED on the screen to indicate the correctness of the program. 16 | -------------------------------------------------------------------------------- /HW3/part1/cg.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "globals.h" 6 | #include "randdp.h" 7 | #include "timers.h" 8 | #include "cg_impl.h" 9 | 10 | void init(double *zeta); 11 | void iterate(double *zeta, int *it); 12 | 13 | int main(int argc, char *argv[]) 14 | { 15 | int i, j, k, it; 16 | 17 | double zeta; 18 | 19 | double t, t_total; 20 | 21 | //char Class; 22 | logical verified; 23 | double zeta_verify_value, epsilon, err; 24 | 25 | char *t_names[T_last]; 26 | 27 | for (i = 0; i < T_last; i++) 28 | { 29 | timer_clear(i); 30 | } 31 | 32 | timer_start(T_init); 33 | 34 | zeta_verify_value = VALID_RESULT; 35 | 36 | printf("\nCG start...\n\n"); 37 | printf(" Size: %11d\n", NA); 38 | printf(" Iterations: %5d\n", NITER); 39 | printf("\n"); 40 | 41 | init(&zeta); 42 | 43 | zeta = 0.0; 44 | 45 | //--------------------------------------------------------------------- 46 | //----> 47 | // Do one iteration untimed to init all code and data page tables 48 | //----> (then reinit, start timing, to niter its) 49 | //--------------------------------------------------------------------- 50 | for (it = 1; it <= 1; it++) 51 | { 52 | iterate(&zeta, &it); 53 | } // end of do one iteration untimed 54 | 55 | 
//--------------------------------------------------------------------- 56 | // set starting vector to (1, 1, .... 1) 57 | //--------------------------------------------------------------------- 58 | for (i = 0; i < NA + 1; i++) 59 | { 60 | x[i] = 1.0; 61 | } 62 | 63 | zeta = 0.0; 64 | 65 | timer_stop(T_init); 66 | 67 | printf(" Initialization time = %15.3f seconds\n", timer_read(T_init)); 68 | t_total += timer_read(T_init); 69 | 70 | timer_start(T_bench); 71 | 72 | //--------------------------------------------------------------------- 73 | //----> 74 | // Main Iteration for inverse power method 75 | //----> 76 | //--------------------------------------------------------------------- 77 | for (it = 1; it <= NITER; it++) 78 | { 79 | iterate(&zeta, &it); 80 | } // end of main iter inv pow meth 81 | 82 | timer_stop(T_bench); 83 | 84 | //--------------------------------------------------------------------- 85 | // End of timed section 86 | //--------------------------------------------------------------------- 87 | 88 | t = timer_read(T_bench); 89 | t_total += t; 90 | 91 | printf("\nComplete...\n"); 92 | 93 | epsilon = 1.0e-10; 94 | err = fabs(zeta - zeta_verify_value) / zeta_verify_value; 95 | if (err <= epsilon) 96 | { 97 | verified = true; 98 | printf(" VERIFICATION SUCCESSFUL\n"); 99 | printf(" Zeta is %20.13E\n", zeta); 100 | printf(" Error is %20.13E\n", err); 101 | } 102 | else 103 | { 104 | verified = false; 105 | printf(" VERIFICATION FAILED\n"); 106 | printf(" Zeta %20.13E\n", zeta); 107 | printf(" The correct zeta is %20.13E\n", zeta_verify_value); 108 | } 109 | 110 | printf("\n\nExecution time : %lf seconds\n\n", t); 111 | 112 | printf("Total Time: %lf seconds\n\n", t_total); 113 | 114 | return 0; 115 | } 116 | -------------------------------------------------------------------------------- /HW3/part1/cg_impl.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | 
#include "globals.h" 7 | #include "randdp.h" 8 | #include "timers.h" 9 | 10 | //--------------------------------------------------------------------- 11 | /* common / main_int_mem / */ 12 | int colidx[NZ]; 13 | int rowstr[NA + 1]; 14 | int iv[NA]; 15 | int arow[NA]; 16 | int acol[NAZ]; 17 | 18 | /* common / main_flt_mem / */ 19 | double aelt[NAZ]; 20 | double a[NZ]; 21 | double x[NA + 2]; 22 | double z[NA + 2]; 23 | double p[NA + 2]; 24 | double q[NA + 2]; 25 | double r[NA + 2]; 26 | 27 | /* common / partit_size / */ 28 | int naa; 29 | int nzz; 30 | int firstrow; 31 | int lastrow; 32 | int firstcol; 33 | int lastcol; 34 | 35 | /* common /urando/ */ 36 | double amult; 37 | double tran; 38 | 39 | /* common /timers/ */ 40 | logical timeron; 41 | //--------------------------------------------------------------------- 42 | 43 | //--------------------------------------------------------------------- 44 | void conj_grad(int colidx[], 45 | int rowstr[], 46 | double x[], 47 | double z[], 48 | double a[], 49 | double p[], 50 | double q[], 51 | double r[], 52 | double *rnorm); 53 | void makea(int n, 54 | int nz, 55 | double a[], 56 | int colidx[], 57 | int rowstr[], 58 | int firstrow, 59 | int lastrow, 60 | int firstcol, 61 | int lastcol, 62 | int arow[], 63 | int acol[][NONZER + 1], 64 | double aelt[][NONZER + 1], 65 | int iv[]); 66 | void sparse(double a[], 67 | int colidx[], 68 | int rowstr[], 69 | int n, 70 | int nz, 71 | int nozer, 72 | int arow[], 73 | int acol[][NONZER + 1], 74 | double aelt[][NONZER + 1], 75 | int firstrow, 76 | int lastrow, 77 | int nzloc[], 78 | double rcond, 79 | double shift); 80 | void sprnvc(int n, int nz, int nn1, double v[], int iv[]); 81 | int icnvrt(double x, int ipwr2); 82 | void vecset(int n, double v[], int iv[], int *nzv, int i, double val); 83 | void init(double *zeta); 84 | void iterate(double *zeta, int *it); -------------------------------------------------------------------------------- /HW3/part1/common/c_timers.c: 
-------------------------------------------------------------------------------- 1 | #include "wtime.h" 2 | #include 3 | 4 | /* Prototype */ 5 | void wtime( double * ); 6 | 7 | 8 | /*****************************************************************/ 9 | /****** E L A P S E D _ T I M E ******/ 10 | /*****************************************************************/ 11 | static double elapsed_time( void ) 12 | { 13 | double t; 14 | 15 | wtime( &t ); 16 | return( t ); 17 | } 18 | 19 | 20 | static double start[64], elapsed[64]; 21 | 22 | /*****************************************************************/ 23 | /****** T I M E R _ C L E A R ******/ 24 | /*****************************************************************/ 25 | void timer_clear( int n ) 26 | { 27 | elapsed[n] = 0.0; 28 | } 29 | 30 | 31 | /*****************************************************************/ 32 | /****** T I M E R _ S T A R T ******/ 33 | /*****************************************************************/ 34 | void timer_start( int n ) 35 | { 36 | start[n] = elapsed_time(); 37 | } 38 | 39 | 40 | /*****************************************************************/ 41 | /****** T I M E R _ S T O P ******/ 42 | /*****************************************************************/ 43 | void timer_stop( int n ) 44 | { 45 | double t, now; 46 | 47 | now = elapsed_time(); 48 | t = now - start[n]; 49 | elapsed[n] += t; 50 | 51 | } 52 | 53 | 54 | /*****************************************************************/ 55 | /****** T I M E R _ R E A D ******/ 56 | /*****************************************************************/ 57 | double timer_read( int n ) 58 | { 59 | return( elapsed[n] ); 60 | } 61 | 62 | -------------------------------------------------------------------------------- /HW3/part1/common/randdp.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | double randlc( double *x, double a ) 5 | { 6 | 
//-------------------------------------------------------------------- 7 | // 8 | // This routine returns a uniform pseudorandom double precision number in the 9 | // range (0, 1) by using the linear congruential generator 10 | // 11 | // x_{k+1} = a x_k (mod 2^46) 12 | // 13 | // where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers 14 | // before repeating. The argument A is the same as 'a' in the above formula, 15 | // and X is the same as x_0. A and X must be odd double precision integers 16 | // in the range (1, 2^46). The returned value RANDLC is normalized to be 17 | // between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain 18 | // the new seed x_1, so that subsequent calls to RANDLC using the same 19 | // arguments will generate a continuous sequence. 20 | // 21 | // This routine should produce the same results on any computer with at least 22 | // 48 mantissa bits in double precision floating point data. On 64 bit 23 | // systems, double precision should be disabled. 24 | // 25 | // David H. Bailey October 26, 1990 26 | // 27 | //-------------------------------------------------------------------- 28 | 29 | // r23 = pow(0.5, 23.0); 30 | //// pow(0.5, 23.0) = 1.1920928955078125e-07 31 | // r46 = r23 * r23; 32 | // t23 = pow(2.0, 23.0); 33 | //// pow(2.0, 23.0) = 8.388608e+06 34 | // t46 = t23 * t23; 35 | 36 | const double r23 = 1.1920928955078125e-07; 37 | const double r46 = r23 * r23; 38 | const double t23 = 8.388608e+06; 39 | const double t46 = t23 * t23; 40 | 41 | double t1, t2, t3, t4, a1, a2, x1, x2, z; 42 | double r; 43 | 44 | //-------------------------------------------------------------------- 45 | // Break A into two parts such that A = 2^23 * A1 + A2. 
46 | //-------------------------------------------------------------------- 47 | t1 = r23 * a; 48 | a1 = (int) t1; 49 | a2 = a - t23 * a1; 50 | 51 | //-------------------------------------------------------------------- 52 | // Break X into two parts such that X = 2^23 * X1 + X2, compute 53 | // Z = A1 * X2 + A2 * X1 (mod 2^23), and then 54 | // X = 2^23 * Z + A2 * X2 (mod 2^46). 55 | //-------------------------------------------------------------------- 56 | t1 = r23 * (*x); 57 | x1 = (int) t1; 58 | x2 = *x - t23 * x1; 59 | t1 = a1 * x2 + a2 * x1; 60 | t2 = (int) (r23 * t1); 61 | z = t1 - t23 * t2; 62 | t3 = t23 * z + a2 * x2; 63 | t4 = (int) (r46 * t3); 64 | *x = t3 - t46 * t4; 65 | r = r46 * (*x); 66 | 67 | return r; 68 | } 69 | 70 | 71 | void vranlc( int n, double *x, double a, double y[] ) 72 | { 73 | //-------------------------------------------------------------------- 74 | // 75 | // This routine generates N uniform pseudorandom double precision numbers in 76 | // the range (0, 1) by using the linear congruential generator 77 | // 78 | // x_{k+1} = a x_k (mod 2^46) 79 | // 80 | // where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers 81 | // before repeating. The argument A is the same as 'a' in the above formula, 82 | // and X is the same as x_0. A and X must be odd double precision integers 83 | // in the range (1, 2^46). The N results are placed in Y and are normalized 84 | // to be between 0 and 1. X is updated to contain the new seed, so that 85 | // subsequent calls to VRANLC using the same arguments will generate a 86 | // continuous sequence. If N is zero, only initialization is performed, and 87 | // the variables X, A and Y are ignored. 88 | // 89 | // This routine is the standard version designed for scalar or RISC systems. 90 | // However, it should produce the same results on any single processor 91 | // computer with at least 48 mantissa bits in double precision floating point 92 | // data. 
On 64 bit systems, double precision should be disabled. 93 | // 94 | //-------------------------------------------------------------------- 95 | 96 | // r23 = pow(0.5, 23.0); 97 | //// pow(0.5, 23.0) = 1.1920928955078125e-07 98 | // r46 = r23 * r23; 99 | // t23 = pow(2.0, 23.0); 100 | //// pow(2.0, 23.0) = 8.388608e+06 101 | // t46 = t23 * t23; 102 | 103 | const double r23 = 1.1920928955078125e-07; 104 | const double r46 = r23 * r23; 105 | const double t23 = 8.388608e+06; 106 | const double t46 = t23 * t23; 107 | 108 | double t1, t2, t3, t4, a1, a2, x1, x2, z; 109 | 110 | int i; 111 | 112 | //-------------------------------------------------------------------- 113 | // Break A into two parts such that A = 2^23 * A1 + A2. 114 | //-------------------------------------------------------------------- 115 | t1 = r23 * a; 116 | a1 = (int) t1; 117 | a2 = a - t23 * a1; 118 | 119 | //-------------------------------------------------------------------- 120 | // Generate N results. This loop is not vectorizable. 121 | //-------------------------------------------------------------------- 122 | for ( i = 0; i < n; i++ ) { 123 | //-------------------------------------------------------------------- 124 | // Break X into two parts such that X = 2^23 * X1 + X2, compute 125 | // Z = A1 * X2 + A2 * X1 (mod 2^23), and then 126 | // X = 2^23 * Z + A2 * X2 (mod 2^46). 
127 | //-------------------------------------------------------------------- 128 | t1 = r23 * (*x); 129 | x1 = (int) t1; 130 | x2 = *x - t23 * x1; 131 | t1 = a1 * x2 + a2 * x1; 132 | t2 = (int) (r23 * t1); 133 | z = t1 - t23 * t2; 134 | t3 = t23 * z + a2 * x2; 135 | t4 = (int) (r46 * t3) ; 136 | *x = t3 - t46 * t4; 137 | y[i] = r46 * (*x); 138 | } 139 | 140 | return; 141 | } 142 | 143 | -------------------------------------------------------------------------------- /HW3/part1/common/randdp.h: -------------------------------------------------------------------------------- 1 | #ifndef __RANDDP_H__ 2 | #define __RANDDP_H__ 3 | 4 | double randlc( double *x, double a ); 5 | void vranlc( int n, double *x, double a, double y[] ); 6 | 7 | #endif 8 | 9 | -------------------------------------------------------------------------------- /HW3/part1/common/timers.h: -------------------------------------------------------------------------------- 1 | #ifndef __TIMERS_H__ 2 | #define __TIMERS_H__ 3 | 4 | void timer_clear( int n ); 5 | void timer_start( int n ); 6 | void timer_stop( int n ); 7 | double timer_read( int n ); 8 | 9 | #endif 10 | 11 | -------------------------------------------------------------------------------- /HW3/part1/common/type.h: -------------------------------------------------------------------------------- 1 | #ifndef __TYPE_H__ 2 | #define __TYPE_H__ 3 | 4 | typedef enum { false, true } logical; 5 | typedef struct { 6 | double real; 7 | double imag; 8 | } dcomplex; 9 | 10 | 11 | #define min(x,y) ((x) < (y) ? (x) : (y)) 12 | #define max(x,y) ((x) > (y) ? 
(x) : (y)) 13 | 14 | #endif //__TYPE_H__ 15 | -------------------------------------------------------------------------------- /HW3/part1/common/wtime.c: -------------------------------------------------------------------------------- 1 | #include "wtime.h" 2 | #include 3 | #ifndef DOS 4 | #include 5 | #endif 6 | 7 | void wtime(double *t) 8 | { 9 | static int sec = -1; 10 | struct timeval tv; 11 | gettimeofday(&tv, (void *)0); 12 | if (sec < 0) sec = tv.tv_sec; 13 | *t = (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec; 14 | } 15 | 16 | 17 | -------------------------------------------------------------------------------- /HW3/part1/common/wtime.h: -------------------------------------------------------------------------------- 1 | /* C/Fortran interface is different on different machines. 2 | * You may need to tweak this. 3 | */ 4 | 5 | 6 | #if defined(IBM) 7 | #define wtime wtime 8 | #elif defined(CRAY) 9 | #define wtime WTIME 10 | #else 11 | #define wtime wtime_ 12 | #endif 13 | -------------------------------------------------------------------------------- /HW3/part1/common/wtime_sgi64.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | /* The following works on SGI Power Challenge systems */ 10 | 11 | typedef unsigned long iotimer_t; 12 | 13 | unsigned int cycleval; 14 | volatile iotimer_t *iotimer_addr, base_counter; 15 | double resolution; 16 | 17 | /* address_t is an integer type big enough to hold an address */ 18 | typedef unsigned long address_t; 19 | 20 | 21 | 22 | void timer_init() 23 | { 24 | 25 | int fd; 26 | char *virt_addr; 27 | address_t phys_addr, page_offset, pagemask, pagebase_addr; 28 | 29 | pagemask = getpagesize() - 1; 30 | errno = 0; 31 | phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval); 32 | if (errno != 0) { 33 | perror("SGI_QUERY_CYCLECNTR"); 34 | exit(1); 35 | } 36 | /* rel_addr = page offset of physical address */ 
37 | page_offset = phys_addr & pagemask; 38 | pagebase_addr = phys_addr - page_offset; 39 | fd = open("/dev/mmem", O_RDONLY); 40 | 41 | virt_addr = mmap(0, pagemask, PROT_READ, MAP_PRIVATE, fd, pagebase_addr); 42 | virt_addr = virt_addr + page_offset; 43 | iotimer_addr = (iotimer_t *)virt_addr; 44 | /* cycleval in picoseconds to this gives resolution in seconds */ 45 | resolution = 1.0e-12*cycleval; 46 | base_counter = *iotimer_addr; 47 | } 48 | 49 | void wtime_(double *time) 50 | { 51 | static int initialized = 0; 52 | volatile iotimer_t counter_value; 53 | if (!initialized) { 54 | timer_init(); 55 | initialized = 1; 56 | } 57 | counter_value = *iotimer_addr - base_counter; 58 | *time = (double)counter_value * resolution; 59 | } 60 | 61 | 62 | void wtime(double *time) 63 | { 64 | static int initialized = 0; 65 | volatile iotimer_t counter_value; 66 | if (!initialized) { 67 | timer_init(); 68 | initialized = 1; 69 | } 70 | counter_value = *iotimer_addr - base_counter; 71 | *time = (double)counter_value * resolution; 72 | } 73 | 74 | 75 | -------------------------------------------------------------------------------- /HW3/part1/def_cg.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW3/part1/def_cg.a -------------------------------------------------------------------------------- /HW3/part1/globals.h: -------------------------------------------------------------------------------- 1 | #include "type.h" 2 | 3 | //small datasize 4 | #ifdef SMALL 5 | #define NA 7000 6 | #define NONZER 8 7 | #define SHIFT 12 8 | #define NITER 15 9 | #define RCOND 1.0e-1 10 | #define VALID_RESULT 10.362595087124 11 | #endif 12 | 13 | //midiumn datasize 14 | #ifdef MEDIUMN 15 | #define NA 14000 16 | #define NONZER 11 17 | #define SHIFT 20 18 | #define NITER 15 19 | #define RCOND 1.0e-1 20 | #define VALID_RESULT 17.130235054029 21 | #endif 22 | 23 
| //large datasize 24 | #ifdef LARGE 25 | #define NA 75000 26 | #define NONZER 13 27 | #define SHIFT 60 28 | #define NITER 75 29 | #define RCOND 1.0e-1 30 | #define VALID_RESULT 22.712745482631 31 | #endif 32 | 33 | #define NZ (NA*(NONZER+1)*(NONZER+1)) 34 | #define NAZ (NA*(NONZER+1)) 35 | 36 | #define T_init 0 37 | #define T_bench 1 38 | #define T_conj_grad 2 39 | #define T_last 3 40 | 41 | -------------------------------------------------------------------------------- /HW3/part1/grade.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "globals.h" 7 | #include "randdp.h" 8 | #include "timers.h" 9 | #include "cg_impl.h" 10 | 11 | void init(double *zeta); 12 | void iterate(double *zeta, int *it); 13 | void reference_init(double *zeta); 14 | void reference_iterate(double *zeta, int *it); 15 | void default_init(double *zeta); 16 | void default_iterate(double *zeta, int *it); 17 | 18 | void print_scores(double stu_time, double ref_time, logical verified) 19 | { 20 | double max_score = 30; 21 | double max_perf_score = 0.8 * max_score; 22 | double correctness_score = 0.2 * max_score; 23 | correctness_score = (verified == true) ? correctness_score : 0; 24 | 25 | double ratio = (ref_time/stu_time); 26 | 27 | double slope = max_perf_score/(0.7 - 0.3); 28 | double offset = 0.3 * slope; 29 | 30 | double perf_score = (verified == true) ? 
ratio*slope - offset : 0; 31 | 32 | if (perf_score < 0) perf_score = 0; 33 | if (perf_score > max_perf_score) perf_score = max_perf_score; 34 | 35 | printf("correctness : %lf\n", correctness_score); 36 | printf("performance : %lf\n", perf_score); 37 | printf("total : %lf\n", correctness_score + perf_score); 38 | 39 | return; 40 | } 41 | 42 | int main(int argc, char *argv[]) 43 | { 44 | int num_threads = omp_get_max_threads(); 45 | int i, j, k, it; 46 | double zeta; 47 | double t, t_total = 0, reference_total = 0, default_total = 0; 48 | logical verified; 49 | double zeta_verify_value, epsilon, err; 50 | char *t_names[T_last]; 51 | 52 | omp_set_num_threads(num_threads); 53 | zeta_verify_value = VALID_RESULT; 54 | 55 | printf("\nCG start...\n\n"); 56 | printf(" Size: %11d\n", NA); 57 | printf(" Iterations: %5d\n", NITER); 58 | printf(" Running with %d threads\n", num_threads); 59 | printf("\n"); 60 | 61 | for (i = 0; i < T_last; i++) 62 | { 63 | timer_clear(i); 64 | } 65 | timer_start(T_init); 66 | init(&zeta); 67 | zeta = 0.0; 68 | for (it = 1; it <= 1; it++) 69 | { 70 | iterate(&zeta, &it); 71 | } 72 | for (i = 0; i < NA + 1; i++) 73 | { 74 | x[i] = 1.0; 75 | } 76 | zeta = 0.0; 77 | timer_stop(T_init); 78 | t_total += timer_read(T_init); 79 | 80 | timer_start(T_bench); 81 | for (it = 1; it <= NITER; it++) 82 | { 83 | iterate(&zeta, &it); 84 | } 85 | timer_stop(T_bench); 86 | t = timer_read(T_bench); 87 | t_total += t; 88 | 89 | epsilon = 1.0e-10; 90 | err = fabs(zeta - zeta_verify_value) / zeta_verify_value; 91 | if (err <= epsilon) 92 | { 93 | verified = true; 94 | printf(" VERIFICATION SUCCESSFUL\n"); 95 | printf(" Zeta is %20.13E\n", zeta); 96 | printf(" Error is %20.13E\n", err); 97 | } 98 | else 99 | { 100 | verified = false; 101 | printf(" VERIFICATION FAILED\n"); 102 | printf(" Zeta %20.13E\n", zeta); 103 | printf(" The correct zeta is %20.13E\n", zeta_verify_value); 104 | } 105 | 106 | for (i = 0; i < T_last; i++) 107 | { 108 | timer_clear(i); 109 | } 110 | 
timer_start(T_init); 111 | reference_init(&zeta); 112 | zeta = 0.0; 113 | for (it = 1; it <= 1; it++) 114 | { 115 | reference_iterate(&zeta, &it); 116 | } 117 | for (i = 0; i < NA + 1; i++) 118 | { 119 | x[i] = 1.0; 120 | } 121 | zeta = 0.0; 122 | timer_stop(T_init); 123 | reference_total += timer_read(T_init); 124 | 125 | timer_start(T_bench); 126 | for (it = 1; it <= NITER; it++) 127 | { 128 | reference_iterate(&zeta, &it); 129 | } 130 | timer_stop(T_bench); 131 | t = timer_read(T_bench); 132 | reference_total += t; 133 | 134 | for (i = 0; i < T_last; i++) 135 | { 136 | timer_clear(i); 137 | } 138 | timer_start(T_init); 139 | default_init(&zeta); 140 | zeta = 0.0; 141 | for (it = 1; it <= 1; it++) 142 | { 143 | default_iterate(&zeta, &it); 144 | } 145 | for (i = 0; i < NA + 1; i++) 146 | { 147 | x[i] = 1.0; 148 | } 149 | zeta = 0.0; 150 | timer_stop(T_init); 151 | default_total += timer_read(T_init); 152 | 153 | timer_start(T_bench); 154 | for (it = 1; it <= NITER; it++) 155 | { 156 | default_iterate(&zeta, &it); 157 | } 158 | timer_stop(T_bench); 159 | t = timer_read(T_bench); 160 | default_total += t; 161 | 162 | printf("\nreference time : %lfs\n", reference_total); 163 | printf("default time : %lfs\n", default_total); 164 | printf("student time : %lfs\n\n", t_total); 165 | 166 | if (default_total - 0.1 < t_total) 167 | { 168 | printf("Your implementation should be faster than default - 0.1s!\n\n"); 169 | verified = false; 170 | } 171 | 172 | print_scores(t_total, reference_total, verified); 173 | 174 | return 0; 175 | } 176 | -------------------------------------------------------------------------------- /HW3/part1/make.common: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------------------------- 2 | # Compiler configurations 3 | #--------------------------------------------------------------------------- 4 | CC = gcc 5 | CLINK = $(CC) 6 | C_LIB = -lm 7 | C_INC = 
-Icommon 8 | CFLAGS = -g -O3 -mcmodel=medium -fopenmp 9 | CLINKFLAGS = -O3 -mcmodel=medium -fopenmp 10 | UCC = gcc 11 | BINDIR = bin 12 | RAND = randdp 13 | WTIME = wtime.c 14 | 15 | CCOMPILE = $(CC) -c $(C_INC) $(CFLAGS) 16 | CCOMPILE_pp = $(CC_pp) -c $(C_INC_pp) $(CFLAGS_pp) 17 | 18 | # Class "U" is used internally by the setparams program to mean 19 | # "unknown". This means that if you don't specify CLASS= 20 | # on the command line, you'll get an error. It would be nice 21 | # to be able to avoid this, but we'd have to get information 22 | # from the setparams back to the make program, which isn't easy. 23 | CLASS=U 24 | 25 | config: 26 | COMMON=common 27 | ${COMMON}/${RAND}.o: ${COMMON}/${RAND}.c 28 | cd ${COMMON}; ${CCOMPILE} ${RAND}.c 29 | 30 | #${COMMON}/print_results.o: ${COMMON}/print_results.c 31 | # cd ${COMMON}; ${CCOMPILE} print_results.c 32 | 33 | #${COMMON}/c_print_results.o: ${COMMON}/c_print_results.c 34 | # cd ${COMMON}; ${CCOMPILE} c_print_results.c 35 | 36 | ${COMMON}/timers.o: ${COMMON}/timers.c 37 | cd ${COMMON}; ${CCOMPILE} timers.c 38 | 39 | ${COMMON}/c_timers.o: ${COMMON}/c_timers.c 40 | cd ${COMMON}; ${CCOMPILE} c_timers.c 41 | 42 | ${COMMON}/wtime.o: ${COMMON}/${WTIME} 43 | cd ${COMMON}; ${CCOMPILE} ${MACHINE} -o wtime.o ${WTIME} 44 | # For most machines or CRAY or IBM 45 | # cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/wtime.c 46 | # For a precise timer on an SGI Power Challenge, try: 47 | # cd ${COMMON}; ${CCOMPILE} -o wtime.o ${COMMON}/wtime_sgi64.c 48 | 49 | ${COMMON}/c_wtime.o: ${COMMON}/${WTIME} 50 | cd ${COMMON}; ${CCOMPILE} -o c_wtime.o ${WTIME} 51 | 52 | # So that "make benchmark-name" works 53 | ${BENCHMARK}: default 54 | ${BENCHMARKU}: default 55 | 56 | 57 | -------------------------------------------------------------------------------- /HW3/part1/ref_cg.a: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW3/part1/ref_cg.a -------------------------------------------------------------------------------- /HW3/part1/report.txt: -------------------------------------------------------------------------------- 1 | Model name: AMD Ryzen 7 PRO 4750U with Radeon Graphics 2 | Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero wbnoinvd arat umip rdpid overflow_recov succor 3 | cpu MHz : 1696.813 4 | cpu MHz : 1696.813 5 | cpu MHz : 1696.813 6 | cpu MHz : 1696.813 7 | 8 | CG start... 
9 | 10 | Size: 14000 11 | Iterations: 15 12 | Running with 4 threads 13 | 14 | 15 | iteration ||r|| zeta 16 | 1 2.60650812147631E-13 19.9997581277040 17 | 18 | iteration ||r|| zeta 19 | 1 2.60650812147631E-13 19.9997581277040 20 | 2 2.57531877367169E-15 17.1140495745506 21 | 3 2.59348789075185E-15 17.1296668946143 22 | 4 2.56262926848262E-15 17.1302113581193 23 | 5 2.51106135247005E-15 17.1302338856353 24 | 6 2.55819375820883E-15 17.1302349879482 25 | 7 2.54564770410681E-15 17.1302350498916 26 | 8 2.44940683285382E-15 17.1302350537510 27 | 9 2.48852359037289E-15 17.1302350540101 28 | 10 2.47715076108563E-15 17.1302350540284 29 | 11 2.49284410170029E-15 17.1302350540298 30 | 12 2.44437060612294E-15 17.1302350540299 31 | 13 2.47093619226119E-15 17.1302350540299 32 | 14 2.43816304501123E-15 17.1302350540299 33 | 15 2.42966732234484E-15 17.1302350540299 34 | VERIFICATION SUCCESSFUL 35 | Zeta is 1.7130235054030E+01 36 | Error is 5.1226400332279E-14 37 | 38 | reference time : 1.825870s 39 | default time : 3.216315s 40 | student time : 2.003389s 41 | 42 | correctness : 6.000000 43 | performance : 24.000000 44 | total : 30.000000 45 | -------------------------------------------------------------------------------- /HW3/part2/breadth_first_search/Makefile: -------------------------------------------------------------------------------- 1 | all: default grade 2 | 3 | default: main.cpp bfs.cpp 4 | g++ -I../ -std=c++17 -fopenmp -O3 -g -o bfs main.cpp bfs.cpp ../common/graph.cpp ref_bfs.a 5 | grade: grade.cpp bfs.cpp 6 | g++ -I../ -std=c++17 -fopenmp -O3 -g -o bfs_grader grade.cpp bfs.cpp ../common/graph.cpp ref_bfs.a 7 | clean: 8 | rm -rf bfs_grader bfs *~ *.*~ 9 | report: clean all 10 | ./bfs_grader ../graphs > report.txt 11 | -------------------------------------------------------------------------------- /HW3/part2/breadth_first_search/bfs.h: -------------------------------------------------------------------------------- 1 | #ifndef __BFS_H__ 2 | #define __BFS_H__ 3 | 4 | 
//#define DEBUG 5 | 6 | #include "common/graph.h" 7 | 8 | struct solution 9 | { 10 | int *distances; 11 | }; 12 | 13 | struct vertex_set { 14 | // # of vertices in the set 15 | int count; 16 | // max size of buffer vertices 17 | int max_vertices; 18 | // array of vertex ids in set 19 | int *vertices; 20 | }; 21 | 22 | 23 | void bfs_top_down(Graph graph, solution* sol); 24 | void bfs_bottom_up(Graph graph, solution* sol); 25 | void bfs_hybrid(Graph graph, solution* sol); 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /HW3/part2/breadth_first_search/ref_bfs.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW3/part2/breadth_first_search/ref_bfs.a -------------------------------------------------------------------------------- /HW3/part2/breadth_first_search/report.txt: -------------------------------------------------------------------------------- 1 | Max system threads = 4 2 | Running with 4 threads 3 | 4 | Graph: grid1000x1000.graph 5 | 6 | Top down bfs 7 | ref_time: 0.0258286s 8 | stu_time: 0.0239949s 9 | 10 | Bottom up bfs 11 | ref_time: 2.97809s 12 | stu_time: 2.3487s 13 | 14 | Hybrid bfs 15 | ref_time: 1.06599s 16 | stu_time: 0.555012s 17 | 18 | Graph: soc-livejournal1_68m.graph 19 | 20 | Top down bfs 21 | ref_time: 0.428485s 22 | stu_time: 0.364067s 23 | 24 | Bottom up bfs 25 | ref_time: 0.312608s 26 | stu_time: 0.211181s 27 | 28 | Hybrid bfs 29 | ref_time: 0.154956s 30 | stu_time: 0.136973s 31 | 32 | Graph: com-orkut_117m.graph 33 | 34 | Top down bfs 35 | ref_time: 0.621974s 36 | stu_time: 0.438463s 37 | 38 | Bottom up bfs 39 | ref_time: 0.273366s 40 | stu_time: 0.230265s 41 | 42 | Hybrid bfs 43 | ref_time: 0.0907601s 44 | stu_time: 0.0687539s 45 | 46 | Graph: random_500m.graph 47 | 48 | Top down bfs 49 | ref_time: 10.3302s 50 | stu_time: 10.1746s 51 | 52 | Bottom 
up bfs 53 | ref_time: 19.7902s 54 | stu_time: 13.3703s 55 | 56 | Hybrid bfs 57 | ref_time: 4.04886s 58 | stu_time: 3.4187s 59 | 60 | Graph: rmat_200m.graph 61 | 62 | Top down bfs 63 | ref_time: 3.91783s 64 | stu_time: 3.78097s 65 | 66 | Bottom up bfs 67 | ref_time: 3.3444s 68 | stu_time: 2.08853s 69 | 70 | Hybrid bfs 71 | ref_time: 1.70545s 72 | stu_time: 1.33205s 73 | 74 | 75 | -------------------------------------------------------------------------- 76 | SCORES : | Top-Down | Bott-Up | Hybrid | 77 | -------------------------------------------------------------------------- 78 | grid1000x1000.graph | 2.00 / 2 | 3.00 / 3 | 3.00 / 3 | 79 | -------------------------------------------------------------------------- 80 | soc-livejournal1_68m.graph | 2.00 / 2 | 3.00 / 3 | 3.00 / 3 | 81 | -------------------------------------------------------------------------- 82 | com-orkut_117m.graph | 2.00 / 2 | 3.00 / 3 | 3.00 / 3 | 83 | -------------------------------------------------------------------------- 84 | random_500m.graph | 6.00 / 6 | 7.00 / 7 | 7.00 / 7 | 85 | -------------------------------------------------------------------------- 86 | rmat_200m.graph | 6.00 / 6 | 7.00 / 7 | 7.00 / 7 | 87 | -------------------------------------------------------------------------- 88 | TOTAL | 64.00 / 64 | 89 | -------------------------------------------------------------------------- 90 | -------------------------------------------------------------------------------- /HW3/part2/common/CycleTimer.h: -------------------------------------------------------------------------------- 1 | #ifndef _SYRAH_CYCLE_TIMER_H_ 2 | #define _SYRAH_CYCLE_TIMER_H_ 3 | 4 | #if defined(__APPLE__) 5 | #if defined(__x86_64__) 6 | #include 7 | #else 8 | #include 9 | #include 10 | #endif // __x86_64__ or not 11 | 12 | #include // fprintf 13 | #include // exit 14 | 15 | #elif _WIN32 16 | # include 17 | # include 18 | #else 19 | # include 20 | # include 21 | # include 22 | # include 23 | #endif 24 | 25 | 26 
| // This uses the cycle counter of the processor. Different 27 | // processors in the system will have different values for this. If 28 | // you process moves across processors, then the delta time you 29 | // measure will likely be incorrect. This is mostly for fine 30 | // grained measurements where the process is likely to be on the 31 | // same processor. For more global things you should use the 32 | // Time interface. 33 | 34 | // Also note that if you processors' speeds change (i.e. processors 35 | // scaling) or if you are in a heterogenous environment, you will 36 | // likely get spurious results. 37 | class CycleTimer { 38 | public: 39 | typedef unsigned long long SysClock; 40 | 41 | ////////// 42 | // Return the current CPU time, in terms of clock ticks. 43 | // Time zero is at some arbitrary point in the past. 44 | static SysClock currentTicks() { 45 | #if defined(__APPLE__) && !defined(__x86_64__) 46 | return mach_absolute_time(); 47 | #elif defined(_WIN32) 48 | LARGE_INTEGER qwTime; 49 | QueryPerformanceCounter(&qwTime); 50 | return qwTime.QuadPart; 51 | #elif defined(__x86_64__) 52 | unsigned int a, d; 53 | asm volatile("rdtsc" : "=a" (a), "=d" (d)); 54 | return static_cast(a) | 55 | (static_cast(d) << 32); 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser. 57 | unsigned int val; 58 | asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val)); 59 | return val; 60 | #else 61 | timespec spec; 62 | clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec); 63 | return CycleTimer::SysClock(static_cast(spec.tv_sec) * 1e9 + static_cast(spec.tv_nsec)); 64 | #endif 65 | } 66 | 67 | ////////// 68 | // Return the current CPU time, in terms of seconds. 69 | // This is slower than currentTicks(). Time zero is at 70 | // some arbitrary point in the past. 71 | static double currentSeconds() { 72 | return currentTicks() * secondsPerTick(); 73 | } 74 | 75 | ////////// 76 | // Return the conversion from seconds to ticks. 
77 | static double ticksPerSecond() { 78 | return 1.0/secondsPerTick(); 79 | } 80 | 81 | static const char* tickUnits() { 82 | #if defined(__APPLE__) && !defined(__x86_64__) 83 | return "ns"; 84 | #elif defined(__WIN32__) || defined(__x86_64__) 85 | return "cycles"; 86 | #else 87 | return "ns"; // clock_gettime 88 | #endif 89 | } 90 | 91 | ////////// 92 | // Return the conversion from ticks to seconds. 93 | static double secondsPerTick() { 94 | static bool initialized = false; 95 | static double secondsPerTick_val; 96 | if (initialized) return secondsPerTick_val; 97 | #if defined(__APPLE__) 98 | #ifdef __x86_64__ 99 | int args[] = {CTL_HW, HW_CPU_FREQ}; 100 | unsigned int Hz; 101 | size_t len = sizeof(Hz); 102 | if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) { 103 | fprintf(stderr, "Failed to initialize secondsPerTick_val!\n"); 104 | exit(-1); 105 | } 106 | secondsPerTick_val = 1.0 / (double) Hz; 107 | #else 108 | mach_timebase_info_data_t time_info; 109 | mach_timebase_info(&time_info); 110 | 111 | // Scales to nanoseconds without 1e-9f 112 | secondsPerTick_val = (1e-9*static_cast(time_info.numer))/ 113 | static_cast(time_info.denom); 114 | #endif // x86_64 or not 115 | #elif defined(_WIN32) 116 | LARGE_INTEGER qwTicksPerSec; 117 | QueryPerformanceFrequency(&qwTicksPerSec); 118 | secondsPerTick_val = 1.0/static_cast(qwTicksPerSec.QuadPart); 119 | #else 120 | FILE *fp = fopen("/proc/cpuinfo","r"); 121 | char input[1024]; 122 | if (!fp) { 123 | fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo."); 124 | exit(-1); 125 | } 126 | // In case we don't find it, e.g. 
on the N900 127 | secondsPerTick_val = 1e-9; 128 | while (!feof(fp) && fgets(input, 1024, fp)) { 129 | // NOTE(boulos): Because reading cpuinfo depends on dynamic 130 | // frequency scaling it's better to read the @ sign first 131 | float GHz, MHz; 132 | if (strstr(input, "model name")) { 133 | char* at_sign = strstr(input, "@"); 134 | if (at_sign) { 135 | char* after_at = at_sign + 1; 136 | char* GHz_str = strstr(after_at, "GHz"); 137 | char* MHz_str = strstr(after_at, "MHz"); 138 | if (GHz_str) { 139 | *GHz_str = '\0'; 140 | if (1 == sscanf(after_at, "%f", &GHz)) { 141 | //printf("GHz = %f\n", GHz); 142 | secondsPerTick_val = 1e-9f / GHz; 143 | break; 144 | } 145 | } else if (MHz_str) { 146 | *MHz_str = '\0'; 147 | if (1 == sscanf(after_at, "%f", &MHz)) { 148 | //printf("MHz = %f\n", MHz); 149 | secondsPerTick_val = 1e-6f / MHz; /* fix: was dividing by GHz, which is never set in this branch */ 150 | break; 151 | } 152 | } 153 | } 154 | } else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) { 155 | //printf("MHz = %f\n", MHz); 156 | secondsPerTick_val = 1e-6f / MHz; 157 | break; 158 | } 159 | } 160 | fclose(fp); 161 | #endif 162 | 163 | initialized = true; 164 | return secondsPerTick_val; 165 | } 166 | 167 | ////////// 168 | // Return the conversion from ticks to milliseconds. 169 | static double msPerTick() { 170 | return secondsPerTick() * 1000.0; 171 | } 172 | 173 | private: 174 | CycleTimer(); 175 | }; 176 | 177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_ 178 | -------------------------------------------------------------------------------- /HW3/part2/common/contracts.h: -------------------------------------------------------------------------------- 1 | /* Debugging with contracts; simulating cc0 -d 2 | * Enable with gcc -DDEBUG ... 3 | * 4 | * 15-122 Principles of Imperative Computation 5 | * Frank Pfenning 6 | */ 7 | 8 | #include 9 | 10 | /* Unlike typical header files, "contracts.h" may be 11 | * included multiple times, with and without DEBUG defined. 
12 | * For this to succeed we first undefine the macros in 13 | * question in order to avoid a redefinition warning. 14 | */ 15 | 16 | #undef ASSERT 17 | #undef REQUIRES 18 | #undef ENSURES 19 | 20 | #ifdef DEBUG 21 | 22 | #define ASSERT(COND) assert(COND) 23 | #define REQUIRES(COND) assert(COND) 24 | #define ENSURES(COND) assert(COND) 25 | 26 | #else 27 | 28 | #define ASSERT(COND) ((void)0) 29 | #define REQUIRES(COND) ((void)0) 30 | #define ENSURES(COND) ((void)0) 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /HW3/part2/common/grade.h: -------------------------------------------------------------------------------- 1 | #ifndef __GRADE_H__ 2 | #define __GRADE_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | 17 | #include "graph.h" 18 | #include "graph_internal.h" 19 | #include "contracts.h" 20 | 21 | // Epsilon for approximate float comparisons 22 | #define EPSILON 0.00000000001 23 | 24 | // Output column size 25 | #define COL_SIZE 15 26 | 27 | // Point value for apps that are not run. 28 | #define POINTS_NA -1 29 | 30 | // Point value for apps that yielded incorrect results. 
31 | #define POINTS_INCORRECT -2 32 | 33 | /* 34 | * Printing functions 35 | */ 36 | 37 | static void sep(std::ostream& out, char separator = '-', int length = 78) 38 | { 39 | for (int i = 0; i < length; i++) 40 | out << separator; 41 | out << std::endl; 42 | } 43 | 44 | static void printTimingApp(std::ostream& timing, const char* appName) 45 | { 46 | std::cout << std::endl; 47 | std::cout << "Timing results for " << appName << ":" << std::endl; 48 | sep(std::cout, '=', 75); 49 | 50 | timing << std::endl; 51 | timing << "Timing results for " << appName << ":" << std::endl; 52 | sep(timing, '=', 75); 53 | } 54 | 55 | /* 56 | * Correctness checkers 57 | */ 58 | 59 | template 60 | bool compareArrays(Graph graph, T* ref, T* stu) 61 | { 62 | for (int i = 0; i < graph->num_nodes; i++) { 63 | if (ref[i] != stu[i]) { 64 | std::cerr << "*** Results disagree at " << i << " expected " 65 | << ref[i] << " found " << stu[i] << std::endl; 66 | return false; 67 | } 68 | } 69 | return true; 70 | } 71 | 72 | template 73 | bool compareApprox(Graph graph, T* ref, T* stu) 74 | { 75 | for (int i = 0; i < graph->num_nodes; i++) { 76 | if (fabs(ref[i] - stu[i]) > EPSILON) { 77 | std::cerr << "*** Results disagree at " << i << " expected " 78 | << ref[i] << " found " << stu[i] << std::endl; 79 | return false; 80 | } 81 | } 82 | return true; 83 | } 84 | 85 | template 86 | bool compareArraysAndDisplay(Graph graph, T* ref, T*stu) 87 | { 88 | printf("\n----------------------------------\n"); 89 | printf("Visualization of student results"); 90 | printf("\n----------------------------------\n\n"); 91 | 92 | int grid_dim = (int)sqrt(graph->num_nodes); 93 | for (int j=0; jnum_nodes); 104 | for (int j=0; j(graph, ref, stu); 112 | } 113 | 114 | template 115 | bool compareArraysAndRadiiEst(Graph graph, T* ref, T* stu) 116 | { 117 | bool isCorrect = true; 118 | for (int i = 0; i < graph->num_nodes; i++) { 119 | if (ref[i] != stu[i]) { 120 | std::cerr << "*** Results disagree at " << i << " expected " 
121 | << ref[i] << " found " << stu[i] << std::endl; 122 | isCorrect = false; 123 | } 124 | } 125 | int stuMaxVal = -1; 126 | int refMaxVal = -1; 127 | #pragma omp parallel for schedule(dynamic, 512) reduction(max: stuMaxVal) 128 | for (int i = 0; i < graph->num_nodes; i++) { 129 | if (stu[i] > stuMaxVal) 130 | stuMaxVal = stu[i]; 131 | } 132 | #pragma omp parallel for schedule(dynamic, 512) reduction(max: refMaxVal) 133 | for (int i = 0; i < graph->num_nodes; i++) { 134 | if (ref[i] > refMaxVal) 135 | refMaxVal = ref[i]; 136 | } 137 | 138 | if (refMaxVal != stuMaxVal) { 139 | std::cerr << "*** Radius estimates differ. Expected: " << refMaxVal << " Got: " << stuMaxVal << std::endl; 140 | isCorrect = false; 141 | } 142 | return isCorrect; 143 | } 144 | 145 | #endif /* __GRADE_H__ */ 146 | -------------------------------------------------------------------------------- /HW3/part2/common/graph.h: -------------------------------------------------------------------------------- 1 | #ifndef __GRAPH_H__ 2 | #define __GRAPH_H__ 3 | 4 | using Vertex = int; 5 | 6 | struct graph 7 | { 8 | // Number of edges in the graph 9 | int num_edges; 10 | // Number of vertices in the graph 11 | int num_nodes; 12 | 13 | // The node reached by vertex i's first outgoing edge is given by 14 | // outgoing_edges[outgoing_starts[i]]. To iterate over all 15 | // outgoing edges, please see the top-down bfs implementation. 
16 | int* outgoing_starts; 17 | Vertex* outgoing_edges; 18 | 19 | int* incoming_starts; 20 | Vertex* incoming_edges; 21 | }; 22 | 23 | using Graph = graph*; 24 | 25 | /* Getters */ 26 | static inline int num_nodes(const Graph); 27 | static inline int num_edges(const Graph); 28 | 29 | static inline const Vertex* outgoing_begin(const Graph, Vertex); 30 | static inline const Vertex* outgoing_end(const Graph, Vertex); 31 | static inline int outgoing_size(const Graph, Vertex); 32 | 33 | static inline const Vertex* incoming_begin(const Graph, Vertex); 34 | static inline const Vertex* incoming_end(const Graph, Vertex); 35 | static inline int incoming_size(const Graph, Vertex); 36 | 37 | 38 | /* IO */ 39 | Graph load_graph(const char* filename); 40 | Graph load_graph_binary(const char* filename); 41 | void store_graph_binary(const char* filename, Graph); 42 | 43 | void print_graph(const graph*); 44 | 45 | 46 | /* Deallocation */ 47 | void free_graph(Graph); 48 | 49 | 50 | /* Included here to enable inlining. Don't look. 
*/ 51 | #include "graph_internal.h" 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /HW3/part2/common/graph_internal.h: -------------------------------------------------------------------------------- 1 | #ifndef __GRAPH_INTERNAL_H__ 2 | #define __GRAPH_INTERNAL_H__ 3 | 4 | #include 5 | #include "contracts.h" 6 | 7 | static inline int num_nodes(const Graph graph) 8 | { 9 | REQUIRES(graph != NULL); 10 | return graph->num_nodes; 11 | } 12 | 13 | static inline int num_edges(const Graph graph) 14 | { 15 | REQUIRES(graph != NULL); 16 | return graph->num_edges; 17 | } 18 | 19 | static inline const Vertex* outgoing_begin(const Graph g, Vertex v) 20 | { 21 | REQUIRES(g != NULL); 22 | REQUIRES(0 <= v && v < num_nodes(g)); 23 | return g->outgoing_edges + g->outgoing_starts[v]; 24 | } 25 | 26 | static inline const Vertex* outgoing_end(const Graph g, Vertex v) 27 | { 28 | REQUIRES(g != NULL); 29 | REQUIRES(0 <= v && v < num_nodes(g)); 30 | int offset = (v == g->num_nodes - 1) ? g->num_edges : g->outgoing_starts[v + 1]; 31 | return g->outgoing_edges + offset; 32 | } 33 | 34 | static inline int outgoing_size(const Graph g, Vertex v) 35 | { 36 | REQUIRES(g != NULL); 37 | REQUIRES(0 <= v && v < num_nodes(g)); 38 | if (v == g->num_nodes - 1) { 39 | return g->num_edges - g->outgoing_starts[v]; 40 | } else { 41 | return g->outgoing_starts[v + 1] - g->outgoing_starts[v]; 42 | } 43 | } 44 | 45 | static inline const Vertex* incoming_begin(const Graph g, Vertex v) 46 | { 47 | REQUIRES(g != NULL); 48 | REQUIRES(0 <= v && v < num_nodes(g)); 49 | return g->incoming_edges + g->incoming_starts[v]; 50 | } 51 | 52 | static inline const Vertex* incoming_end(const Graph g, Vertex v) 53 | { 54 | REQUIRES(g != NULL); 55 | REQUIRES(0 <= v && v < num_nodes(g)); 56 | int offset = (v == g->num_nodes - 1) ? 
g->num_edges : g->incoming_starts[v + 1]; 57 | return g->incoming_edges + offset; 58 | } 59 | 60 | static inline int incoming_size(const Graph g, Vertex v) 61 | { 62 | REQUIRES(g != NULL); 63 | REQUIRES(0 <= v && v < num_nodes(g)); 64 | if (v == g->num_nodes - 1) { 65 | return g->num_edges - g->incoming_starts[v]; 66 | } else { 67 | return g->incoming_starts[v + 1] - g->incoming_starts[v]; 68 | } 69 | } 70 | 71 | #endif // __GRAPH_INTERNAL_H__ 72 | -------------------------------------------------------------------------------- /HW3/part2/doc/bfs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW3/part2/doc/bfs.pdf -------------------------------------------------------------------------------- /HW3/part2/graphs/README.md: -------------------------------------------------------------------------------- 1 | http://sslab.cs.nctu.edu.tw/~acliu/all_graphs.tgz 2 | 3 | * Be careful, this is a 3GB download 4 | -------------------------------------------------------------------------------- /HW3/part2/page_rank/Makefile: -------------------------------------------------------------------------------- 1 | all: default grade 2 | 3 | default: page_rank.cpp main.cpp 4 | g++ -I../ -std=c++17 -fopenmp -O3 -o pr main.cpp page_rank.cpp ../common/graph.cpp ref_pr.a 5 | grade: page_rank.cpp grade.cpp 6 | g++ -I../ -std=c++17 -fopenmp -O3 -o pr_grader grade.cpp page_rank.cpp ../common/graph.cpp ref_pr.a 7 | clean: 8 | rm -rf pr pr_grader *~ *.*~ 9 | report: clean all 10 | ./pr_grader ../graphs > report.txt -------------------------------------------------------------------------------- /HW3/part2/page_rank/grade.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #include 
"../common/CycleTimer.h" 13 | #include "../common/graph.h" 14 | #include "../common/grade.h" 15 | #include "page_rank.h" 16 | 17 | #define USE_BINARY_GRAPH 1 18 | 19 | #define PageRankDampening 0.3f 20 | #define PageRankConvergence 1e-7d 21 | 22 | void reference_pageRank(Graph g, double* solution, double damping, 23 | double convergence); 24 | 25 | void usage(const char* binary_name) { 26 | std::cout << "Usage: " << binary_name << " [options] graphdir" << std::endl; 27 | std::cout << std::endl; 28 | std::cout << "Options:" << std::endl; 29 | std::cout << " -n INT number of threads" << std::endl; 30 | std::cout << " -r INT number of runs" << std::endl; 31 | std::cout << " -h this commandline help message" << std::endl; 32 | } 33 | 34 | graph* load_graph(std::string graph_filename) { 35 | graph* g; 36 | if (USE_BINARY_GRAPH) { 37 | g = load_graph_binary(graph_filename.c_str()); 38 | } else { 39 | g = load_graph(graph_filename); 40 | printf("storing binary form of graph!\n"); 41 | store_graph_binary(graph_filename.append(".bin").c_str(), g); 42 | free_graph(g); 43 | exit(1); 44 | } 45 | return g; 46 | } 47 | 48 | double run_on_graph(graph* g, int num_threads, int num_runs, std::string graph_name) { 49 | 50 | double* sol_stu = new double[g->num_nodes]; 51 | double* sol_ref = new double[g->num_nodes]; 52 | 53 | omp_set_num_threads(num_threads); 54 | 55 | double start, time; 56 | 57 | //Run implementation 58 | double stu_time = std::numeric_limits::max(); 59 | for (int r = 0; r < num_runs; r++) { 60 | start = CycleTimer::currentSeconds(); 61 | pageRank(g, sol_stu, PageRankDampening, PageRankConvergence); 62 | //reference_pageRank(g, sol_stu, PageRankDampening, PageRankConvergence); 63 | time = CycleTimer::currentSeconds() - start; 64 | stu_time = std::min(stu_time, time); 65 | } 66 | 67 | //Run reference implementation 68 | double ref_time = std::numeric_limits::max(); 69 | for (int r = 0; r < num_runs; r++) { 70 | start = CycleTimer::currentSeconds(); 71 | 
reference_pageRank(g, sol_ref, PageRankDampening, PageRankConvergence); 72 | time = CycleTimer::currentSeconds() - start; 73 | ref_time = std::min(ref_time, time); 74 | } 75 | 76 | bool correct = compareApprox(g, sol_ref, sol_stu); 77 | 78 | delete [] sol_stu; // fix: allocated with new[], so delete[] is required (delete(p) on a new[] array is UB) 79 | delete [] sol_ref; 80 | 81 | if (!correct) { 82 | std::cout << "Page rank incorrect" << std::endl; 83 | } else { 84 | std::cout << "ref_time: " << ref_time << "s" << std::endl; 85 | std::cout << "stu_time: " << stu_time << "s" << std::endl; 86 | } 87 | 88 | double max_score = 4; 89 | double max_perf_score = 0.8 * max_score; 90 | double correctness_score = 0.2 * max_score; 91 | correctness_score = (correct) ? correctness_score : 0; 92 | 93 | double ratio = (ref_time/stu_time); 94 | 95 | double slope = max_perf_score/(0.7 - 0.3); 96 | double offset = 0.3 * slope; 97 | 98 | double perf_score = (correct) ? ratio*slope - offset : 0; 99 | 100 | if (perf_score < 0) perf_score = 0; 101 | if (perf_score > max_perf_score) perf_score = max_perf_score; 102 | 103 | return (correctness_score + perf_score); 104 | } 105 | 106 | void print_separator_line() { 107 | for (int i = 0; i < 43; i++) { 108 | std::cout<<"-"; 109 | } 110 | std::cout< grade_graphs, std::vector scores) { 114 | 115 | std::cout.precision(5); 116 | std::cout.setf(std::ios::fixed, std:: ios::floatfield); 117 | std::cout< grade_graphs = { "soc-livejournal1_68m.graph", 191 | "com-orkut_117m.graph", 192 | "rmat_200m.graph", 193 | "random_500m.graph"}; 194 | 195 | std::vector scores(grade_graphs.size()); 196 | 197 | int i = 0; 198 | for (auto& graph_name: grade_graphs) { 199 | graph* g = load_graph(graph_dir + '/' + graph_name); 200 | std::cout << "\nGraph: " << graph_name << std::endl; 201 | scores[i] = run_on_graph(g, num_threads, num_runs, graph_name); 202 | free_graph(g); 203 | i++; 204 | } 205 | 206 | print_scores(grade_graphs, scores); 207 | 208 | return 0; 209 | } 210 | -------------------------------------------------------------------------------- 
/HW3/part2/page_rank/page_rank.cpp: -------------------------------------------------------------------------------- 1 | #include "page_rank.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include "../common/CycleTimer.h" 11 | #include "../common/graph.h" 12 | 13 | // pageRank -- 14 | // 15 | // g: graph to process (see common/graph.h) 16 | // solution: array of per-vertex vertex scores (length of array is num_nodes(g)) 17 | // damping: page-rank algorithm's damping parameter 18 | // convergence: page-rank algorithm's convergence threshold 19 | // 20 | void pageRank(Graph g, double *solution, double damping, double convergence) 21 | { 22 | 23 | // initialize vertex weights to uniform probability. Double 24 | // precision scores are used to avoid underflow for large graphs 25 | 26 | /* 27 | For PP students: Implement the page rank algorithm here. You 28 | are expected to parallelize the algorithm using openMP. Your 29 | solution may need to allocate (and free) temporary arrays. 
30 | 31 | Basic page rank pseudocode is provided below to get you started: 32 | 33 | // initialization: see example code above 34 | score_old[vi] = 1/numNodes; 35 | 36 | while (!converged) { 37 | 38 | // compute score_new[vi] for all nodes vi: 39 | score_new[vi] = sum over all nodes vj reachable from incoming edges 40 | { score_old[vj] / number of edges leaving vj } 41 | score_new[vi] = (damping * score_new[vi]) + (1.0-damping) / numNodes; 42 | 43 | score_new[vi] += sum over all nodes v in graph with no outgoing edges 44 | { damping * score_old[v] / numNodes } 45 | 46 | // compute how much per-node scores have changed 47 | // quit once algorithm has converged 48 | 49 | global_diff = sum over all nodes vi { abs(score_new[vi] - score_old[vi]) }; 50 | converged = (global_diff < convergence) 51 | } 52 | 53 | */ 54 | int num_nodes = g->num_nodes; 55 | int num_edges = g->num_edges; 56 | int *outgoing_starts = g->outgoing_starts; 57 | Vertex *outgoing_edges = g->outgoing_edges; 58 | int *incoming_starts = g->incoming_starts; 59 | Vertex *incoming_edges = g->incoming_edges; 60 | 61 | double global_diff; 62 | double *solution_old = new double[num_nodes]; 63 | double equal_prob = 1.0 / num_nodes; 64 | std::vector partial_vs[omp_get_max_threads()]; 65 | std::vector other_vs; // all nodes v in graph with no outgoing edges 66 | double partial_scores[omp_get_max_threads()]; 67 | double no_out_score; 68 | 69 | #pragma omp parallel for 70 | for (Vertex v = 0; v < num_nodes; ++v) 71 | { 72 | int start_edge = outgoing_starts[v]; 73 | int end_edge = (v == num_nodes - 1) 74 | ? 
num_edges 75 | : outgoing_starts[v + 1]; 76 | 77 | if (start_edge == end_edge) 78 | { 79 | partial_vs[omp_get_thread_num()].push_back(v); 80 | } 81 | } 82 | 83 | for (int i = 0; i < omp_get_max_threads(); ++i) 84 | { 85 | for (Vertex v : partial_vs[i]) 86 | { 87 | other_vs.push_back(v); 88 | } 89 | } 90 | 91 | #pragma omp parallel for 92 | for (Vertex i = 0; i < num_nodes; ++i) 93 | { 94 | solution_old[i] = equal_prob; 95 | } 96 | 97 | while (1) 98 | { 99 | // sum over all nodes incoming_v reachable from incoming edges 100 | // score[v] = sum(score_old[incoming_v] / 101 | // number of edges leaving incoming_v) for each incoming_v 102 | 103 | #pragma omp parallel for 104 | for (Vertex v = 0; v < num_nodes; ++v) 105 | { 106 | solution[v] = 0; 107 | int start_edge = incoming_starts[v]; 108 | int end_edge = (v == num_nodes - 1) 109 | ? num_edges 110 | : incoming_starts[v + 1]; 111 | 112 | for (int edgeidx = start_edge; edgeidx < end_edge; ++edgeidx) 113 | { 114 | Vertex incoming_v = incoming_edges[edgeidx]; 115 | 116 | int out_start_edge = outgoing_starts[incoming_v]; 117 | int out_end_edge = (incoming_v == num_nodes - 1) 118 | ? 
num_edges 119 | : outgoing_starts[incoming_v + 1]; 120 | solution[v] += (solution_old[incoming_v] / (out_end_edge - out_start_edge)); 121 | } 122 | } 123 | 124 | // damping & sum over all nodes other_v in graph with no outgoing edges 125 | // score[v] = (damping * score[v]) + (1.0-damping) / num_nodes; 126 | // score[v] += sum(damping * score_old[other_v] / 127 | // num_nodes) for each other_v 128 | // ---> 129 | // score[v] = (damping * score[v]) + 130 | // ((damping * sum(score_old[other_v]) for each other_v) + (1.0-damping)) / num_nodes 131 | 132 | for (int i = 0; i < omp_get_max_threads(); ++i) 133 | { 134 | partial_scores[i] = 0; 135 | } 136 | 137 | #pragma omp parallel for 138 | for (int i = 0; i < other_vs.size(); ++i) 139 | { 140 | Vertex other_v = other_vs[i]; 141 | partial_scores[omp_get_thread_num()] += solution_old[other_v]; 142 | } 143 | 144 | no_out_score = 0; 145 | for (int i = 0; i < omp_get_max_threads(); ++i) 146 | { 147 | no_out_score += partial_scores[i]; 148 | } 149 | 150 | no_out_score = ((damping * no_out_score) + (1.0 - damping)) / num_nodes; 151 | 152 | #pragma omp parallel for 153 | for (Vertex v = 0; v < num_nodes; ++v) 154 | { 155 | solution[v] = damping * solution[v] + no_out_score; 156 | } 157 | 158 | // compute how much per-node scores have changed 159 | // quit once algorithm has converged 160 | 161 | for (int i = 0; i < omp_get_max_threads(); ++i) 162 | { 163 | partial_scores[i] = 0; 164 | } 165 | 166 | #pragma omp parallel for 167 | for (Vertex v = 0; v < num_nodes; ++v) 168 | { 169 | double diff = solution[v] - solution_old[v]; 170 | partial_scores[omp_get_thread_num()] += diff >= 0 ? 
diff : -diff; 171 | } 172 | 173 | global_diff = 0; 174 | for (int i = 0; i < omp_get_max_threads(); ++i) 175 | { 176 | global_diff += partial_scores[i]; 177 | } 178 | 179 | if (global_diff < convergence) 180 | break; 181 | 182 | #pragma omp parallel for 183 | for (int i = 0; i < num_nodes; ++i) 184 | { 185 | solution_old[i] = solution[i]; 186 | } 187 | } 188 | 189 | delete [] solution_old; 190 | } 191 | -------------------------------------------------------------------------------- /HW3/part2/page_rank/page_rank.h: -------------------------------------------------------------------------------- 1 | #ifndef __PAGE_RANK_H__ 2 | #define __PAGE_RANK_H__ 3 | 4 | #include "common/graph.h" 5 | 6 | void pageRank(Graph g, double* solution, double damping, double convergence); 7 | 8 | #endif /* __PAGE_RANK_H__ */ 9 | -------------------------------------------------------------------------------- /HW3/part2/page_rank/ref_pr.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW3/part2/page_rank/ref_pr.a -------------------------------------------------------------------------------- /HW3/part2/page_rank/report.txt: -------------------------------------------------------------------------------- 1 | Max system threads = 4 2 | Running with 4 threads 3 | 4 | Graph: soc-livejournal1_68m.graph 5 | ref_time: 6.61132s 6 | stu_time: 7.6062s 7 | 8 | Graph: com-orkut_117m.graph 9 | ref_time: 5.02033s 10 | stu_time: 5.51803s 11 | 12 | Graph: rmat_200m.graph 13 | ref_time: 33.6868s 14 | stu_time: 34.6728s 15 | 16 | Graph: random_500m.graph 17 | ref_time: 180.839s 18 | stu_time: 156.458s 19 | 20 | 21 | ------------------------------------------- 22 | SCORES : 23 | ------------------------------------------- 24 | soc-livejournal1_68m.graph | 4.00000 / 4 | 25 | ------------------------------------------- 26 | com-orkut_117m.graph | 4.00000 / 4 | 27 | 
------------------------------------------- 28 | rmat_200m.graph | 4.00000 / 4 | 29 | ------------------------------------------- 30 | random_500m.graph | 4.00000 / 4 | 31 | ------------------------------------------- 32 | TOTAL | 16.00000 / 16 | 33 | ------------------------------------------- 34 | -------------------------------------------------------------------------------- /HW3/part2/tools/Makefile: -------------------------------------------------------------------------------- 1 | BINARYNAME=graphTools 2 | 3 | main: 4 | g++ -std=c++11 -g -O3 -o ${BINARYNAME} graphTools.cpp ../common/graph.cpp 5 | clean: 6 | rm -rf pr *~ *.*~ ${BINARYNAME} 7 | -------------------------------------------------------------------------------- /HW3/part2/tools/plaintext.graph: -------------------------------------------------------------------------------- 1 | AdjacencyGraph 2 | # num vertices 3 | 5 4 | # num edges 5 | 8 6 | # edge starts 7 | 0 4 6 7 8 8 | # all the outgoing edges (target vertex) 9 | 1 2 3 4 10 | 2 3 11 | 0 12 | 0 13 | -------------------------------------------------------------------------------- /HW3/submit/bfs.cpp: -------------------------------------------------------------------------------- 1 | ../part2/breadth_first_search/bfs.cpp -------------------------------------------------------------------------------- /HW3/submit/cg_impl.c: -------------------------------------------------------------------------------- 1 | ../part1/cg_impl.c -------------------------------------------------------------------------------- /HW3/submit/page_rank.cpp: -------------------------------------------------------------------------------- 1 | ../part2/page_rank/page_rank.cpp -------------------------------------------------------------------------------- /HW4/part1/Makefile: -------------------------------------------------------------------------------- 1 | TARGET := mpi_hello pi_block_linear pi_block_tree pi_nonblock_linear pi_gather pi_reduce 2 | 3 | MPI_HELLO_C_FILES 
= hello.cc 4 | PI_BLOCK_LINEAR_SRC_FILES = pi_block_linear.cc 5 | PI_BLOCK_TREE_SRC_FILES = pi_block_tree.cc 6 | PI_NONBLOCK_LINEAR_SRC_FILES = pi_nonblock_linear.cc 7 | PI_GATHER_SRC_FILES = pi_gather.cc 8 | PI_REDUCE_SRC_FILES = pi_reduce.cc 9 | 10 | all: $(TARGET) 11 | # Copy to all hosts 12 | parallel-scp -A -r -h ../setting/hosts.txt ~/HW4 ~ 13 | 14 | mpi_hello: $(MPI_HELLO_C_FILES) 15 | # Compile 16 | mpicxx $< -o $@ 17 | 18 | pi_block_linear: $(PI_BLOCK_LINEAR_SRC_FILES) 19 | mpicxx $< -o $@ 20 | 21 | pi_block_tree: $(PI_BLOCK_TREE_SRC_FILES) 22 | mpicxx $< -o $@ 23 | 24 | pi_nonblock_linear: $(PI_NONBLOCK_LINEAR_SRC_FILES) 25 | mpicxx $< -o $@ 26 | 27 | pi_gather: $(PI_GATHER_SRC_FILES) 28 | mpicxx $< -o $@ 29 | 30 | pi_reduce: $(PI_REDUCE_SRC_FILES) 31 | mpicxx $< -o $@ 32 | 33 | .PHONY: clean 34 | clean: 35 | rm -f *.o $(TARGET) 36 | 37 | .PHONY: report 38 | report: $(TARGET) 39 | python3 ./test.py 40 | -------------------------------------------------------------------------------- /HW4/part1/hello.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(int argc, char **argv) 5 | { 6 | // Initialize the MPI environment. The two arguments to MPI Init are not 7 | // currently used by MPI implementations, but are there in case future 8 | // implementations might need the arguments. 
9 | MPI_Init(NULL, NULL); 10 | 11 | // TODO: Get the number of processes 12 | int world_size; 13 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 14 | 15 | // TODO: Get the rank of the process 16 | int world_rank; 17 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 18 | 19 | // Get the name of the processor 20 | char processor_name[MPI_MAX_PROCESSOR_NAME]; 21 | int name_len; 22 | MPI_Get_processor_name(processor_name, &name_len); 23 | 24 | // Print off a hello world message 25 | printf("Hello world from processor %s, rank %d out of %d processors\n", 26 | processor_name, world_rank, world_size); 27 | 28 | // Finalize the MPI environment. No more MPI calls can be made after this 29 | MPI_Finalize(); 30 | return 0; 31 | } -------------------------------------------------------------------------------- /HW4/part1/hosts_mpi.txt: -------------------------------------------------------------------------------- 1 | pp2 slots=8 2 | pp3 slots=4 -------------------------------------------------------------------------------- /HW4/part1/hosts_part1.txt: -------------------------------------------------------------------------------- 1 | pp7 2 | pp2 3 | pp3 4 | pp4 5 | pp5 -------------------------------------------------------------------------------- /HW4/part1/pi_block_linear.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #pragma GCC optimize ("O3") 9 | 10 | long long int hit; 11 | 12 | double rand_get(double min, double max, unsigned int *seed) 13 | { 14 | int r = rand_r(seed); 15 | return min + ((double)(r) / RAND_MAX) * (max - min); 16 | } 17 | 18 | void tf_estimate_pi(long long int tosses_cnt, int rank) 19 | { 20 | unsigned int seed = getpid() ^ time(NULL) ^ (0xaed01498 << rank); 21 | 22 | for (long long int toss = 0; toss < tosses_cnt; toss ++) { 23 | double x, y, distance_squared; 24 | x = rand_get(-1, 1, &seed); 25 | y = rand_get(-1, 1, &seed); 26 | 
distance_squared = x * x + y * y; 27 | if (distance_squared <= 1) 28 | hit++; 29 | } 30 | } 31 | 32 | int main(int argc, char **argv) 33 | { 34 | // --- DON'T TOUCH --- 35 | MPI_Init(&argc, &argv); 36 | double start_time = MPI_Wtime(); 37 | double pi_result; 38 | long long int tosses = atoi(argv[1]); 39 | int world_rank, world_size; 40 | // --- 41 | 42 | // TODO: init MPI 43 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 44 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 45 | 46 | if (world_rank > 0) 47 | { 48 | // TODO: handle workers 49 | tosses /= world_size; 50 | tf_estimate_pi(tosses, world_rank); 51 | 52 | MPI_Send(&hit, 53 | 1, 54 | MPI_LONG_LONG, 55 | 0, 56 | 0, 57 | MPI_COMM_WORLD); 58 | } 59 | else if (world_rank == 0) 60 | { 61 | // TODO: master 62 | long long int part_tosses; 63 | 64 | part_tosses = (tosses / world_size) + (tosses % world_size); 65 | tf_estimate_pi(part_tosses, world_rank); 66 | 67 | for (int i = 1; i < world_size; ++i) { 68 | long long int holder; 69 | 70 | MPI_Recv(&holder, 71 | 1, 72 | MPI_LONG_LONG, 73 | i, 74 | 0, 75 | MPI_COMM_WORLD, 76 | MPI_STATUS_IGNORE); 77 | 78 | hit += holder; 79 | } 80 | } 81 | 82 | if (world_rank == 0) 83 | { 84 | // TODO: process PI result 85 | pi_result = 4 * hit / ((double)tosses); 86 | 87 | // --- DON'T TOUCH --- 88 | double end_time = MPI_Wtime(); 89 | printf("%lf\n", pi_result); 90 | printf("MPI running time: %lf Seconds\n", end_time - start_time); 91 | // --- 92 | } 93 | 94 | MPI_Finalize(); 95 | return 0; 96 | } 97 | -------------------------------------------------------------------------------- /HW4/part1/pi_block_tree.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #pragma GCC optimize ("O3") 9 | 10 | long long int hit; 11 | 12 | double rand_get(double min, double max, unsigned int *seed) 13 | { 14 | int r = rand_r(seed); 15 | return min + ((double)(r) / RAND_MAX) * (max - 
min); 16 | } 17 | 18 | void tf_estimate_pi(long long int tosses_cnt, int rank) 19 | { 20 | unsigned int seed = getpid() ^ time(NULL) ^ (0xaed01498 << rank); 21 | 22 | for (long long int toss = 0; toss < tosses_cnt; toss ++) { 23 | double x, y, distance_squared; 24 | x = rand_get(-1, 1, &seed); 25 | y = rand_get(-1, 1, &seed); 26 | distance_squared = x * x + y * y; 27 | if (distance_squared <= 1) 28 | hit++; 29 | } 30 | } 31 | 32 | int main(int argc, char **argv) 33 | { 34 | // --- DON'T TOUCH --- 35 | MPI_Init(&argc, &argv); 36 | double start_time = MPI_Wtime(); 37 | double pi_result; 38 | long long int tosses = atoi(argv[1]); 39 | int world_rank, world_size; 40 | // --- 41 | 42 | long long int part_tosses; 43 | int is_buddy_master; 44 | int buddy_rank; 45 | int buddy_layer = 0; 46 | 47 | // TODO: MPI init 48 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 49 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 50 | 51 | // TODO: binary tree redunction 52 | part_tosses = tosses / world_size; 53 | if (world_rank == 0) { 54 | part_tosses += (tosses % world_size); 55 | } 56 | 57 | tf_estimate_pi(part_tosses, world_rank); 58 | 59 | world_size >>= 1; 60 | 61 | while (world_size) { 62 | buddy_rank = world_rank ^ (1 << buddy_layer); 63 | is_buddy_master = !(world_rank & (1 << buddy_layer)); 64 | 65 | if (is_buddy_master) { 66 | // Recv 67 | long long int holder; 68 | 69 | MPI_Recv(&holder, 70 | 1, 71 | MPI_LONG_LONG, 72 | buddy_rank, 73 | 0, 74 | MPI_COMM_WORLD, 75 | MPI_STATUS_IGNORE); 76 | 77 | hit += holder; 78 | } else { 79 | // Send 80 | MPI_Send(&hit, 81 | 1, 82 | MPI_LONG_LONG, 83 | buddy_rank, 84 | 0, 85 | MPI_COMM_WORLD); 86 | break; 87 | } 88 | 89 | ++buddy_layer; 90 | world_size >>= 1; 91 | } 92 | 93 | if (world_rank == 0) 94 | { 95 | // TODO: PI result 96 | pi_result = 4 * hit / ((double)tosses); 97 | 98 | // --- DON'T TOUCH --- 99 | double end_time = MPI_Wtime(); 100 | printf("%lf\n", pi_result); 101 | printf("MPI running time: %lf Seconds\n", end_time - start_time); 
102 | // --- 103 | } 104 | 105 | MPI_Finalize(); 106 | return 0; 107 | } 108 | -------------------------------------------------------------------------------- /HW4/part1/pi_gather.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #pragma GCC optimize ("O3") 9 | 10 | long long int hit; 11 | 12 | double rand_get(double min, double max, unsigned int *seed) 13 | { 14 | int r = rand_r(seed); 15 | return min + ((double)(r) / RAND_MAX) * (max - min); 16 | } 17 | 18 | void tf_estimate_pi(long long int tosses_cnt, int rank) 19 | { 20 | unsigned int seed = getpid() ^ time(NULL) ^ (0xaed01498 << rank); 21 | 22 | for (long long int toss = 0; toss < tosses_cnt; toss ++) { 23 | double x, y, distance_squared; 24 | x = rand_get(-1, 1, &seed); 25 | y = rand_get(-1, 1, &seed); 26 | distance_squared = x * x + y * y; 27 | if (distance_squared <= 1) 28 | hit++; 29 | } 30 | } 31 | 32 | int main(int argc, char **argv) 33 | { 34 | // --- DON'T TOUCH --- 35 | MPI_Init(&argc, &argv); 36 | double start_time = MPI_Wtime(); 37 | double pi_result; 38 | long long int tosses = atoi(argv[1]); 39 | int world_rank, world_size; 40 | // --- 41 | 42 | // TODO: MPI init 43 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 44 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 45 | 46 | // TODO: use MPI_Gather 47 | long long int part_tosses; 48 | long long int hits[world_size]; 49 | 50 | part_tosses = tosses / world_size; 51 | if (world_rank == 0) { 52 | part_tosses += (tosses % world_size); 53 | } 54 | 55 | tf_estimate_pi(part_tosses, world_rank); 56 | 57 | MPI_Gather(&hit, 1, MPI_LONG_LONG, hits, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD); 58 | 59 | if (world_rank == 0) { 60 | for (int i = 1; i < world_size; ++i) { 61 | hit += hits[i]; 62 | } 63 | 64 | // TODO: PI result 65 | pi_result = 4 * hit / ((double)tosses); 66 | 67 | // --- DON'T TOUCH --- 68 | double end_time = MPI_Wtime(); 69 | 
printf("%lf\n", pi_result); 70 | printf("MPI running time: %lf Seconds\n", end_time - start_time); 71 | // --- 72 | } 73 | 74 | MPI_Finalize(); 75 | return 0; 76 | } 77 | -------------------------------------------------------------------------------- /HW4/part1/pi_nonblock_linear.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #pragma GCC optimize ("O3") 9 | 10 | long long int hit; 11 | 12 | double rand_get(double min, double max, unsigned int *seed) 13 | { 14 | int r = rand_r(seed); 15 | return min + ((double)(r) / RAND_MAX) * (max - min); 16 | } 17 | 18 | void tf_estimate_pi(long long int tosses_cnt, int rank) 19 | { 20 | unsigned int seed = getpid() ^ time(NULL) ^ (0xaed01498 << rank); 21 | 22 | for (long long int toss = 0; toss < tosses_cnt; toss ++) { 23 | double x, y, distance_squared; 24 | x = rand_get(-1, 1, &seed); 25 | y = rand_get(-1, 1, &seed); 26 | distance_squared = x * x + y * y; 27 | if (distance_squared <= 1) 28 | hit++; 29 | } 30 | } 31 | 32 | int main(int argc, char **argv) 33 | { 34 | // --- DON'T TOUCH --- 35 | MPI_Init(&argc, &argv); 36 | double start_time = MPI_Wtime(); 37 | double pi_result; 38 | long long int tosses = atoi(argv[1]); 39 | int world_rank, world_size; 40 | // --- 41 | 42 | // TODO: MPI init 43 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 44 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 45 | 46 | if (world_rank > 0) 47 | { 48 | // TODO: MPI workers 49 | tosses /= world_size; 50 | tf_estimate_pi(tosses, world_rank); 51 | 52 | MPI_Send(&hit, 53 | 1, 54 | MPI_LONG_LONG, 55 | 0, 56 | 0, 57 | MPI_COMM_WORLD); 58 | } 59 | else if (world_rank == 0) 60 | { 61 | // TODO: non-blocking MPI communication. 62 | // Use MPI_Irecv, MPI_Wait or MPI_Waitall. 
63 | MPI_Request requests[world_size]; 64 | long long int holders[world_size]; 65 | long long int part_tosses; 66 | 67 | part_tosses = (tosses / world_size) + (tosses % world_size); 68 | tf_estimate_pi(part_tosses, world_rank); 69 | 70 | for (int i = 1; i < world_size; ++i) { 71 | MPI_Irecv(&holders[i], 72 | 1, 73 | MPI_LONG_LONG, 74 | i, 75 | 0, 76 | MPI_COMM_WORLD, 77 | &requests[i]); 78 | } 79 | 80 | MPI_Waitall(world_size - 1, &requests[1], MPI_STATUSES_IGNORE); 81 | 82 | for (int i = 1; i < world_size; ++i) { 83 | hit += holders[i]; 84 | } 85 | } 86 | 87 | if (world_rank == 0) 88 | { 89 | // TODO: PI result 90 | pi_result = 4 * hit / ((double)tosses); 91 | 92 | // --- DON'T TOUCH --- 93 | double end_time = MPI_Wtime(); 94 | printf("%lf\n", pi_result); 95 | printf("MPI running time: %lf Seconds\n", end_time - start_time); 96 | // --- 97 | } 98 | 99 | MPI_Finalize(); 100 | return 0; 101 | } 102 | -------------------------------------------------------------------------------- /HW4/part1/pi_reduce.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #pragma GCC optimize ("O3") 9 | 10 | long long int hit; 11 | 12 | double rand_get(double min, double max, unsigned int *seed) 13 | { 14 | int r = rand_r(seed); 15 | return min + ((double)(r) / RAND_MAX) * (max - min); 16 | } 17 | 18 | void tf_estimate_pi(long long int tosses_cnt, int rank) 19 | { 20 | unsigned int seed = getpid() ^ time(NULL) ^ (0xaed01498 << rank); 21 | 22 | for (long long int toss = 0; toss < tosses_cnt; toss ++) { 23 | double x, y, distance_squared; 24 | x = rand_get(-1, 1, &seed); 25 | y = rand_get(-1, 1, &seed); 26 | distance_squared = x * x + y * y; 27 | if (distance_squared <= 1) 28 | hit++; 29 | } 30 | } 31 | 32 | int main(int argc, char **argv) 33 | { 34 | // --- DON'T TOUCH --- 35 | MPI_Init(&argc, &argv); 36 | double start_time = MPI_Wtime(); 37 | double pi_result; 38 | long 
long int tosses = atoi(argv[1]); 39 | int world_rank, world_size; 40 | // --- 41 | 42 | // TODO: MPI init 43 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 44 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 45 | 46 | // TODO: use MPI_Reduce 47 | long long int part_tosses; 48 | long long int total_hit; 49 | 50 | part_tosses = tosses / world_size; 51 | if (world_rank == 0) { 52 | part_tosses += (tosses % world_size); 53 | } 54 | 55 | tf_estimate_pi(part_tosses, world_rank); 56 | 57 | MPI_Reduce(&hit, &total_hit, 1, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD); 58 | 59 | if (world_rank == 0) 60 | { 61 | // TODO: PI result 62 | pi_result = 4 * total_hit / ((double)tosses); 63 | 64 | // --- DON'T TOUCH --- 65 | double end_time = MPI_Wtime(); 66 | printf("%lf\n", pi_result); 67 | printf("MPI running time: %lf Seconds\n", end_time - start_time); 68 | // --- 69 | } 70 | 71 | MPI_Finalize(); 72 | return 0; 73 | } 74 | -------------------------------------------------------------------------------- /HW4/part1/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import subprocess 3 | 4 | def test_mpi_hello(np=8): 5 | cmd = ['mpirun', '-np', str(np), '--hostfile', 'hosts_mpi.txt', 'mpi_hello'] 6 | 7 | with open('report.txt', "a") as report: 8 | report.write('mpirun -np {} --hostfile hosts_mpi.txt mpi_hello\n'.format(np)) 9 | report.flush() 10 | subprocess.run(cmd, stdout=report) 11 | report.write('\n') 12 | report.flush() 13 | 14 | def test_pi(np=8, pi_block_linear='pi_block_linear'): 15 | cmd = ['mpirun', '-np', str(np), '--hostfile', 'hosts_part1.txt', pi_block_linear, '1000000000'] 16 | 17 | with open('report.txt', "a") as report: 18 | report.write('mpirun -np {} --hostfile hosts_part1.txt {} 1000000000\n'.format(np, pi_block_linear)) 19 | report.flush() 20 | subprocess.run(cmd, stdout=report) 21 | report.write('\n') 22 | report.flush() 23 | 24 | with open('report.txt', "w") as report: 25 | report.write('') 26 | 
27 | test_mpi_hello(8) 28 | test_mpi_hello(10) 29 | 30 | test_pi(2, 'pi_block_linear') 31 | test_pi(2, '/HW4/ref/pi_block_linear') 32 | test_pi(4, 'pi_block_linear') 33 | test_pi(4, '/HW4/ref/pi_block_linear') 34 | test_pi(8, 'pi_block_linear') 35 | test_pi(8, '/HW4/ref/pi_block_linear') 36 | test_pi(12, 'pi_block_linear') 37 | test_pi(12, '/HW4/ref/pi_block_linear') 38 | test_pi(16, 'pi_block_linear') 39 | test_pi(16, '/HW4/ref/pi_block_linear') 40 | 41 | test_pi(2, 'pi_block_tree') 42 | test_pi(2, '/HW4/ref/pi_block_tree') 43 | test_pi(4, 'pi_block_tree') 44 | test_pi(4, '/HW4/ref/pi_block_tree') 45 | test_pi(8, 'pi_block_tree') 46 | test_pi(8, '/HW4/ref/pi_block_tree') 47 | test_pi(16, 'pi_block_tree') 48 | test_pi(16, '/HW4/ref/pi_block_tree') 49 | 50 | test_pi(2, 'pi_nonblock_linear') 51 | test_pi(2, '/HW4/ref/pi_nonblock_linear') 52 | test_pi(4, 'pi_nonblock_linear') 53 | test_pi(4, '/HW4/ref/pi_nonblock_linear') 54 | test_pi(8, 'pi_nonblock_linear') 55 | test_pi(8, '/HW4/ref/pi_nonblock_linear') 56 | test_pi(12, 'pi_nonblock_linear') 57 | test_pi(12, '/HW4/ref/pi_nonblock_linear') 58 | test_pi(16, 'pi_nonblock_linear') 59 | test_pi(16, '/HW4/ref/pi_nonblock_linear') 60 | 61 | test_pi(2, 'pi_gather') 62 | test_pi(2, '/HW4/ref/pi_gather') 63 | test_pi(4, 'pi_gather') 64 | test_pi(4, '/HW4/ref/pi_gather') 65 | test_pi(8, 'pi_gather') 66 | test_pi(8, '/HW4/ref/pi_gather') 67 | test_pi(12, 'pi_gather') 68 | test_pi(12, '/HW4/ref/pi_gather') 69 | test_pi(16, 'pi_gather') 70 | test_pi(16, '/HW4/ref/pi_gather') 71 | 72 | test_pi(2, 'pi_reduce') 73 | test_pi(2, '/HW4/ref/pi_reduce') 74 | test_pi(4, 'pi_reduce') 75 | test_pi(4, '/HW4/ref/pi_reduce') 76 | test_pi(8, 'pi_reduce') 77 | test_pi(8, '/HW4/ref/pi_reduce') 78 | test_pi(12, 'pi_reduce') 79 | test_pi(12, '/HW4/ref/pi_reduce') 80 | test_pi(16, 'pi_reduce') 81 | test_pi(16, '/HW4/ref/pi_reduce') 82 | -------------------------------------------------------------------------------- /HW4/part2/Makefile: 
-------------------------------------------------------------------------------- 1 | TARGET := matmul 2 | 3 | CC_FILES = $(wildcard *.cc) 4 | O_FILES = $(CC_FILES:%.cc=%.o) 5 | 6 | all: $(TARGET) 7 | 8 | $(TARGET): $(O_FILES) 9 | mpicxx $^ -o $@ 10 | 11 | %.o: %.cc 12 | mpicxx -O3 -c $< -o $@ -Wall 13 | 14 | .PHONY: clean 15 | clean: 16 | rm -f *.o $(TARGET) 17 | 18 | .PHONY: sync 19 | sync: all 20 | # Copy to all hosts 21 | parallel-scp -A -r -h ../setting/hosts.txt ~/HW4 ~ 22 | 23 | .PHONY: report 24 | report: sync 25 | python3 ./test.py -------------------------------------------------------------------------------- /HW4/part2/hosts_part2_4slots.txt: -------------------------------------------------------------------------------- 1 | pp2 slots=1 2 | pp3 slots=1 3 | pp5 slots=1 4 | pp7 slots=1 -------------------------------------------------------------------------------- /HW4/part2/hosts_part2_7slots.txt: -------------------------------------------------------------------------------- 1 | pp2 slots=1 2 | pp3 slots=1 3 | pp4 slots=1 4 | pp5 slots=1 5 | pp6 slots=1 6 | pp7 slots=1 7 | pp8 slots=1 -------------------------------------------------------------------------------- /HW4/part2/main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // #define DEBUG 5 | 6 | // ********************************************* 7 | // ** ATTENTION: YOU CANNOT MODIFY THIS FILE. 
** 8 | // ********************************************* 9 | 10 | // Read size of matrix_a and matrix_b (n, m, l) and whole data of matrixes from stdin 11 | // 12 | // n_ptr: pointer to n 13 | // m_ptr: pointer to m 14 | // l_ptr: pointer to l 15 | // a_mat_ptr: pointer to matrix a (a should be a continuous memory space for placing n * m elements of int) 16 | // b_mat_ptr: pointer to matrix b (b should be a continuous memory space for placing m * l elements of int) 17 | void construct_matrices(int *n_ptr, int *m_ptr, int *l_ptr, 18 | int **a_mat_ptr, int **b_mat_ptr); 19 | 20 | // Just matrix multiplication (your should output the result in this function) 21 | // 22 | // n: row number of matrix a 23 | // m: col number of matrix a / row number of matrix b 24 | // l: col number of matrix b 25 | // a_mat: a continuous memory placing n * m elements of int 26 | // b_mat: a continuous memory placing m * l elements of int 27 | void matrix_multiply(const int n, const int m, const int l, 28 | const int *a_mat, const int *b_mat); 29 | 30 | // Remember to release your allocated memory 31 | void destruct_matrices(int *a_mat, int *b_mat); 32 | 33 | int main () { 34 | int n, m, l; 35 | int *a_mat, *b_mat; 36 | 37 | #ifdef DEBUG 38 | int world_rank, world_size; 39 | #endif 40 | 41 | MPI_Init(NULL, NULL); 42 | 43 | #ifdef DEBUG 44 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 45 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 46 | #endif 47 | 48 | double start_time = MPI_Wtime(); 49 | 50 | construct_matrices(&n, &m, &l, &a_mat, &b_mat); 51 | matrix_multiply(n, m, l, a_mat, b_mat); 52 | destruct_matrices(a_mat, b_mat); 53 | 54 | double end_time = MPI_Wtime(); 55 | MPI_Finalize(); 56 | 57 | #ifndef DEBUG 58 | printf("MPI running time: %lf Seconds\n", end_time - start_time); 59 | #else 60 | printf("[%d] MPI running time: %lf Seconds\n", world_rank, end_time - start_time); 61 | #endif 62 | 63 | return 0; 64 | } 65 | 
-------------------------------------------------------------------------------- /HW4/part2/report.txt: -------------------------------------------------------------------------------- 1 | mpirun -np 1 --hostfile hosts_part2_4slots.txt matmul < ./testdata/data0_1 2 | MPI running time: 0.000088 Seconds 3 | 4 | mpirun -np 1 --hostfile hosts_part2_4slots.txt matmul < ./testdata/data0_2 5 | MPI running time: 0.000116 Seconds 6 | 7 | mpirun -np 1 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data1_1 8 | MPI running time: 0.150457 Seconds 9 | 10 | mpirun -np 1 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data2_1 11 | MPI running time: 3.448961 Seconds 12 | 13 | mpirun -np 1 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data2_8 14 | MPI running time: 3.539633 Seconds 15 | 16 | mpirun -np 1 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data2_10 17 | MPI running time: 3.038098 Seconds 18 | 19 | mpirun -np 4 --hostfile hosts_part2_4slots.txt matmul < ./testdata/data0_1 20 | MPI running time: 0.032685 Seconds 21 | MPI running time: 0.032888 Seconds 22 | MPI running time: 0.032859 Seconds 23 | MPI running time: 0.032206 Seconds 24 | 25 | mpirun -np 4 --hostfile hosts_part2_4slots.txt matmul < ./testdata/data0_2 26 | MPI running time: 0.032295 Seconds 27 | MPI running time: 0.032178 Seconds 28 | MPI running time: 0.032376 Seconds 29 | MPI running time: 0.032043 Seconds 30 | 31 | mpirun -np 4 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data1_1 32 | MPI running time: 0.175286 Seconds 33 | MPI running time: 0.175577 Seconds 34 | MPI running time: 0.175579 Seconds 35 | MPI running time: 0.175084 Seconds 36 | 37 | mpirun -np 4 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data2_1 38 | MPI running time: 2.925186 Seconds 39 | MPI running time: 2.927498 Seconds 40 | MPI running time: 2.925823 Seconds 41 | MPI running time: 2.925206 Seconds 42 | 43 
| mpirun -np 4 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data2_8 44 | MPI running time: 2.916394 Seconds 45 | MPI running time: 2.916054 Seconds 46 | MPI running time: 2.916057 Seconds 47 | MPI running time: 2.917237 Seconds 48 | 49 | mpirun -np 4 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data2_10 50 | MPI running time: 2.470314 Seconds 51 | MPI running time: 2.469457 Seconds 52 | MPI running time: 2.469103 Seconds 53 | MPI running time: 2.469243 Seconds 54 | 55 | mpirun -np 7 --hostfile hosts_part2_7slots.txt matmul < ./testdata/data0_1 56 | MPI running time: 0.037167 Seconds 57 | MPI running time: 0.037349 Seconds 58 | MPI running time: 0.037437 Seconds 59 | MPI running time: 0.036933 Seconds 60 | MPI running time: 0.038714 Seconds 61 | MPI running time: 0.038760 Seconds 62 | MPI running time: 0.036833 Seconds 63 | 64 | mpirun -np 7 --hostfile hosts_part2_7slots.txt matmul < ./testdata/data0_2 65 | MPI running time: 0.035229 Seconds 66 | MPI running time: 0.035061 Seconds 67 | MPI running time: 0.034967 Seconds 68 | MPI running time: 0.034978 Seconds 69 | MPI running time: 0.034990 Seconds 70 | MPI running time: 0.034789 Seconds 71 | MPI running time: 0.034828 Seconds 72 | 73 | mpirun -np 7 --hostfile hosts_part2_7slots.txt matmul < /home/.grade/HW4/data-set/data1_1 74 | MPI running time: 0.278344 Seconds 75 | MPI running time: 0.278426 Seconds 76 | MPI running time: 0.277971 Seconds 77 | MPI running time: 0.279104 Seconds 78 | MPI running time: 0.278362 Seconds 79 | MPI running time: 0.278067 Seconds 80 | MPI running time: 0.278558 Seconds 81 | 82 | mpirun -np 7 --hostfile hosts_part2_7slots.txt matmul < /home/.grade/HW4/data-set/data2_1 83 | MPI running time: 3.072552 Seconds 84 | MPI running time: 3.072336 Seconds 85 | MPI running time: 3.072476 Seconds 86 | MPI running time: 3.072225 Seconds 87 | MPI running time: 3.072801 Seconds 88 | MPI running time: 3.072261 Seconds 89 | MPI running time: 3.073843 
Seconds 90 | 91 | mpirun -np 7 --hostfile hosts_part2_7slots.txt matmul < /home/.grade/HW4/data-set/data2_8 92 | MPI running time: 3.054006 Seconds 93 | MPI running time: 3.054290 Seconds 94 | MPI running time: 3.054202 Seconds 95 | MPI running time: 3.054022 Seconds 96 | MPI running time: 3.053785 Seconds 97 | MPI running time: 3.053801 Seconds 98 | MPI running time: 3.055214 Seconds 99 | 100 | mpirun -np 7 --hostfile hosts_part2_7slots.txt matmul < /home/.grade/HW4/data-set/data2_10 101 | MPI running time: 2.695241 Seconds 102 | MPI running time: 2.695270 Seconds 103 | MPI running time: 2.695297 Seconds 104 | MPI running time: 2.694974 Seconds 105 | MPI running time: 2.695059 Seconds 106 | MPI running time: 2.695294 Seconds 107 | MPI running time: 2.697507 Seconds 108 | 109 | -------------------------------------------------------------------------------- /HW4/part2/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import subprocess 3 | 4 | def test(np=8, hostfile='hosts_part2_4slots.txt', infile='./testdata/data1_1', ansfile='./testdata/ans1_1'): 5 | cmd = ['mpirun', '-np', str(np), '--hostfile', hostfile, 'matmul'] 6 | 7 | with open(infile, "r") as f: 8 | with subprocess.Popen(cmd, stdin=f, stdout=subprocess.PIPE) as proc: 9 | output = proc.stdout.read() 10 | 11 | with open(ansfile, "rb") as f: 12 | ans = f.read() 13 | 14 | output = output.split(b'MPI running time', 1) 15 | myans = output[0] 16 | runtime = b'MPI running time' + output[1] 17 | 18 | with open('report.txt', "a") as report: 19 | report.write('mpirun -np {} --hostfile {} matmul < {}\n'.format(np, hostfile, infile)) 20 | 21 | if ans != myans: 22 | report.write('[WA]\n') 23 | else: 24 | report.write(runtime.decode()) 25 | 26 | report.write('\n') 27 | 28 | with open('report.txt', "w") as report: 29 | report.write('') 30 | 31 | test(1, 'hosts_part2_4slots.txt', './testdata/data0_1', './testdata/ans0_1') 32 | test(1, 
'hosts_part2_4slots.txt', './testdata/data0_2', './testdata/ans0_2') 33 | test(1, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data1_1', '/home/.grade/HW4/data-set/ans1_1') 34 | test(1, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data2_1', '/home/.grade/HW4/data-set/ans2_1') 35 | test(1, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data2_8', '/home/.grade/HW4/data-set/ans2_8') 36 | test(1, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data2_10', '/home/.grade/HW4/data-set/ans2_10') 37 | 38 | test(4, 'hosts_part2_4slots.txt', './testdata/data0_1', './testdata/ans0_1') 39 | test(4, 'hosts_part2_4slots.txt', './testdata/data0_2', './testdata/ans0_2') 40 | test(4, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data1_1', '/home/.grade/HW4/data-set/ans1_1') 41 | test(4, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data2_1', '/home/.grade/HW4/data-set/ans2_1') 42 | test(4, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data2_8', '/home/.grade/HW4/data-set/ans2_8') 43 | test(4, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data2_10', '/home/.grade/HW4/data-set/ans2_10') 44 | 45 | test(7, 'hosts_part2_7slots.txt', './testdata/data0_1', './testdata/ans0_1') 46 | test(7, 'hosts_part2_7slots.txt', './testdata/data0_2', './testdata/ans0_2') 47 | test(7, 'hosts_part2_7slots.txt', '/home/.grade/HW4/data-set/data1_1', '/home/.grade/HW4/data-set/ans1_1') 48 | test(7, 'hosts_part2_7slots.txt', '/home/.grade/HW4/data-set/data2_1', '/home/.grade/HW4/data-set/ans2_1') 49 | test(7, 'hosts_part2_7slots.txt', '/home/.grade/HW4/data-set/data2_8', '/home/.grade/HW4/data-set/ans2_8') 50 | test(7, 'hosts_part2_7slots.txt', '/home/.grade/HW4/data-set/data2_10', '/home/.grade/HW4/data-set/ans2_10') 51 | -------------------------------------------------------------------------------- /HW4/part2/testdata/ans0_1: -------------------------------------------------------------------------------- 1 | 1 8 5 2 | 0 9 6 3 | 4 23 14 4 | 
-------------------------------------------------------------------------------- /HW4/part2/testdata/ans0_2: -------------------------------------------------------------------------------- 1 | 32 40 76 52 32 48 32 68 2 | 36 49 101 70 48 41 27 64 3 | 33 42 93 60 39 33 21 42 4 | 37 54 114 81 59 35 23 63 5 | 10 14 28 20 14 12 8 20 6 | 44 43 108 55 24 45 27 21 7 | 59 64 150 85 45 63 39 51 8 | 58 58 130 70 30 76 48 62 9 | 57 72 150 99 63 69 45 93 10 | 16 17 42 23 12 15 9 9 11 | -------------------------------------------------------------------------------- /HW4/part2/testdata/data0_1: -------------------------------------------------------------------------------- 1 | 3 2 3 2 | 1 2 3 | 0 3 4 | 4 5 5 | 1 2 1 6 | 0 3 2 -------------------------------------------------------------------------------- /HW4/part2/testdata/data0_2: -------------------------------------------------------------------------------- 1 | 10 3 8 2 | 8 4 0 3 | 6 7 1 4 | 3 6 3 5 | 5 9 1 6 | 2 2 0 7 | 0 4 9 8 | 3 7 9 9 | 6 4 8 10 | 9 9 3 11 | 0 2 3 12 | 3 3 5 3 1 6 4 7 13 | 2 4 9 7 6 0 0 3 14 | 4 3 8 3 0 5 3 1 15 | -------------------------------------------------------------------------------- /HW4/setting/config: -------------------------------------------------------------------------------- 1 | Host pp2 2 | HostName 192.168.202.2 3 | User 310555003 4 | 5 | Host pp3 6 | HostName 192.168.202.3 7 | User 310555003 8 | 9 | Host pp4 10 | HostName 192.168.202.4 11 | User 310555003 12 | 13 | Host pp5 14 | HostName 192.168.202.5 15 | User 310555003 16 | 17 | Host pp6 18 | HostName 192.168.202.6 19 | User 310555003 20 | 21 | Host pp7 22 | HostName 192.168.202.7 23 | User 310555003 24 | 25 | Host pp8 26 | HostName 192.168.202.8 27 | User 310555003 28 | 29 | Host pp10 30 | HostName 192.168.202.10 31 | User 310555003 -------------------------------------------------------------------------------- /HW4/setting/hosts.txt: -------------------------------------------------------------------------------- 1 | 
192.168.202.2 2 | 192.168.202.3 3 | 192.168.202.4 4 | 192.168.202.5 5 | 192.168.202.6 6 | 192.168.202.7 7 | 192.168.202.8 8 | 192.168.202.10 -------------------------------------------------------------------------------- /HW4/submit/part1/hello.cc: -------------------------------------------------------------------------------- 1 | ../../part1/hello.cc -------------------------------------------------------------------------------- /HW4/submit/part1/pi_block_linear.cc: -------------------------------------------------------------------------------- 1 | ../../part1/pi_block_linear.cc -------------------------------------------------------------------------------- /HW4/submit/part1/pi_block_tree.cc: -------------------------------------------------------------------------------- 1 | ../../part1/pi_block_tree.cc -------------------------------------------------------------------------------- /HW4/submit/part1/pi_gather.cc: -------------------------------------------------------------------------------- 1 | ../../part1/pi_gather.cc -------------------------------------------------------------------------------- /HW4/submit/part1/pi_nonblock_linear.cc: -------------------------------------------------------------------------------- 1 | ../../part1/pi_nonblock_linear.cc -------------------------------------------------------------------------------- /HW4/submit/part1/pi_reduce.cc: -------------------------------------------------------------------------------- 1 | ../../part1/pi_reduce.cc -------------------------------------------------------------------------------- /HW4/submit/part2/Makefile: -------------------------------------------------------------------------------- 1 | ../../part2/Makefile -------------------------------------------------------------------------------- /HW4/submit/part2/main.cc: -------------------------------------------------------------------------------- 1 | ../../part2/main.cc 
-------------------------------------------------------------------------------- /HW4/submit/part2/matrix.cc: -------------------------------------------------------------------------------- 1 | ../../part2/matrix.cc -------------------------------------------------------------------------------- /HW4/submit/url.txt: -------------------------------------------------------------------------------- 1 | https://hackmd.io/@LJP/S1kpIg5Io -------------------------------------------------------------------------------- /HW5/Makefile: -------------------------------------------------------------------------------- 1 | NVCC = nvcc 2 | CXX=g++ 3 | CXXFLAGS=-I./common -Iobjs/ -O3 -std=c++17 -Wall -g -fPIC -lm 4 | 5 | APP_NAME=mandelbrot 6 | OBJDIR=objs 7 | COMMONDIR=./common 8 | 9 | CUDA_LINK_FLAGS = -rdc=true -gencode=arch=compute_61,code=sm_61 -Xcompiler '-fPIC' 10 | CUDA_COMPILE_FLAGS = --device-c -gencode=arch=compute_61,code=sm_61 -Xcompiler '-fPIC' -g -O3 11 | 12 | PPM_CXX=$(COMMONDIR)/ppm.cpp 13 | PPM_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(PPM_CXX:.cpp=.o))) 14 | 15 | 16 | default: $(APP_NAME) 17 | 18 | .PHONY: dirs clean 19 | 20 | dirs: 21 | /bin/mkdir -p $(OBJDIR)/ 22 | 23 | clean: 24 | /bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME) 25 | 26 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/kernel1.o $(OBJDIR)/kernel2.o $(OBJDIR)/kernel3.o $(OBJDIR)/kernel4.o \ 27 | $(OBJDIR)/mandelbrotSerial.o $(OBJDIR)/mandelbrotThread.o $(PPM_OBJ) 28 | 29 | $(APP_NAME): dirs $(OBJS) 30 | $(NVCC) ${CUDA_LINK_FLAGS} -o $@ $(OBJS) mandelbrotThreadRef.a 31 | 32 | $(OBJDIR)/%.o: %.cpp 33 | $(CXX) $< $(CXXFLAGS) -c -o $@ 34 | 35 | $(OBJDIR)/%.o: $(COMMONDIR)/%.cpp 36 | $(CXX) $< $(CXXFLAGS) -c -o $@ 37 | 38 | $(OBJDIR)/main.o: $(COMMONDIR)/CycleTimer.h kernel.h 39 | 40 | $(OBJDIR)/kernel1.o : kernel1.cu kernel.h 41 | ${NVCC} ${CUDA_COMPILE_FLAGS} -c $< -o $@ 42 | 43 | $(OBJDIR)/kernel2.o : kernel2.cu kernel.h 44 | ${NVCC} ${CUDA_COMPILE_FLAGS} -c $< -o $@ 45 | 46 | $(OBJDIR)/kernel3.o : 
kernel3.cu kernel.h 47 | ${NVCC} ${CUDA_COMPILE_FLAGS} -c $< -o $@ 48 | 49 | $(OBJDIR)/kernel4.o : kernel4.cu kernel.h 50 | ${NVCC} ${CUDA_COMPILE_FLAGS} -c $< -o $@ 51 | -------------------------------------------------------------------------------- /HW5/common/CycleTimer.h: -------------------------------------------------------------------------------- 1 | #ifndef _SYRAH_CYCLE_TIMER_H_ 2 | #define _SYRAH_CYCLE_TIMER_H_ 3 | 4 | #if defined(__APPLE__) 5 | #if defined(__x86_64__) 6 | #include 7 | #else 8 | #include 9 | #include 10 | #endif // __x86_64__ or not 11 | 12 | #include // fprintf 13 | #include // exit 14 | 15 | #elif _WIN32 16 | # include 17 | # include 18 | #else 19 | # include 20 | # include 21 | # include 22 | # include 23 | #endif 24 | 25 | 26 | // This uses the cycle counter of the processor. Different 27 | // processors in the system will have different values for this. If 28 | // you process moves across processors, then the delta time you 29 | // measure will likely be incorrect. This is mostly for fine 30 | // grained measurements where the process is likely to be on the 31 | // same processor. For more global things you should use the 32 | // Time interface. 33 | 34 | // Also note that if you processors' speeds change (i.e. processors 35 | // scaling) or if you are in a heterogenous environment, you will 36 | // likely get spurious results. 37 | class CycleTimer { 38 | public: 39 | typedef unsigned long long SysClock; 40 | 41 | ////////// 42 | // Return the current CPU time, in terms of clock ticks. 43 | // Time zero is at some arbitrary point in the past. 
44 | static SysClock currentTicks() { 45 | #if defined(__APPLE__) && !defined(__x86_64__) 46 | return mach_absolute_time(); 47 | #elif defined(_WIN32) 48 | LARGE_INTEGER qwTime; 49 | QueryPerformanceCounter(&qwTime); 50 | return qwTime.QuadPart; 51 | #elif defined(__x86_64__) 52 | unsigned int a, d; 53 | asm volatile("rdtsc" : "=a" (a), "=d" (d)); 54 | return static_cast(a) | 55 | (static_cast(d) << 32); 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser. 57 | unsigned int val; 58 | asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val)); 59 | return val; 60 | #else 61 | timespec spec; 62 | clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec); 63 | return CycleTimer::SysClock(static_cast(spec.tv_sec) * 1e9 + static_cast(spec.tv_nsec)); 64 | #endif 65 | } 66 | 67 | ////////// 68 | // Return the current CPU time, in terms of seconds. 69 | // This is slower than currentTicks(). Time zero is at 70 | // some arbitrary point in the past. 71 | static double currentSeconds() { 72 | return currentTicks() * secondsPerTick(); 73 | } 74 | 75 | ////////// 76 | // Return the conversion from seconds to ticks. 77 | static double ticksPerSecond() { 78 | return 1.0/secondsPerTick(); 79 | } 80 | 81 | static const char* tickUnits() { 82 | #if defined(__APPLE__) && !defined(__x86_64__) 83 | return "ns"; 84 | #elif defined(__WIN32__) || defined(__x86_64__) 85 | return "cycles"; 86 | #else 87 | return "ns"; // clock_gettime 88 | #endif 89 | } 90 | 91 | ////////// 92 | // Return the conversion from ticks to seconds. 
// Conversion factor from ticks to seconds.  The probe below runs once
// (sysctl on macOS, QueryPerformanceFrequency on Windows, /proc/cpuinfo
// parsing elsewhere) and the result is cached in a function-local static.
static double secondsPerTick() {
  static bool initialized = false;
  static double secondsPerTick_val;
  if (initialized) return secondsPerTick_val;
#if defined(__APPLE__)
#ifdef __x86_64__
  int args[] = {CTL_HW, HW_CPU_FREQ};
  unsigned int Hz;
  size_t len = sizeof(Hz);
  if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
    fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
    exit(-1);
  }
  secondsPerTick_val = 1.0 / (double) Hz;
#else
  mach_timebase_info_data_t time_info;
  mach_timebase_info(&time_info);

  // Scales to nanoseconds without 1e-9f
  secondsPerTick_val = (1e-9*static_cast<double>(time_info.numer))/
    static_cast<double>(time_info.denom);
#endif // x86_64 or not
#elif defined(_WIN32)
  LARGE_INTEGER qwTicksPerSec;
  QueryPerformanceFrequency(&qwTicksPerSec);
  secondsPerTick_val = 1.0/static_cast<double>(qwTicksPerSec.QuadPart);
#else
  FILE *fp = fopen("/proc/cpuinfo","r");
  char input[1024];
  if (!fp) {
    fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
    exit(-1);
  }
  // In case we don't find it, e.g. on the N900
  secondsPerTick_val = 1e-9;
  while (!feof(fp) && fgets(input, 1024, fp)) {
    // NOTE(boulos): Because reading cpuinfo depends on dynamic
    // frequency scaling it's better to read the @ sign first
    float GHz, MHz;
    if (strstr(input, "model name")) {
      char* at_sign = strstr(input, "@");
      if (at_sign) {
        char* after_at = at_sign + 1;
        char* GHz_str = strstr(after_at, "GHz");
        char* MHz_str = strstr(after_at, "MHz");
        if (GHz_str) {
          *GHz_str = '\0';
          if (1 == sscanf(after_at, "%f", &GHz)) {
            //printf("GHz = %f\n", GHz);
            secondsPerTick_val = 1e-9f / GHz;
            break;
          }
        } else if (MHz_str) {
          *MHz_str = '\0';
          if (1 == sscanf(after_at, "%f", &MHz)) {
            //printf("MHz = %f\n", MHz);
            // BUG FIX: this branch originally computed "1e-6f / GHz",
            // dividing by the *uninitialized* GHz local; the value just
            // parsed from the "... @ N MHz" string is MHz.
            secondsPerTick_val = 1e-6f / MHz;
            break;
          }
        }
      }
    } else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) {
      //printf("MHz = %f\n", MHz);
      secondsPerTick_val = 1e-6f / MHz;
      break;
    }
  }
  fclose(fp);
#endif

  initialized = true;
  return secondsPerTick_val;
}

//////////
// Return the conversion from ticks to milliseconds.
169 | static double msPerTick() { 170 | return secondsPerTick() * 1000.0; 171 | } 172 | 173 | private: 174 | CycleTimer(); 175 | }; 176 | 177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_ 178 | -------------------------------------------------------------------------------- /HW5/common/ppm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | 8 | void 9 | writePPMImage(int* data, int width, int height, const char *filename, int maxIterations) 10 | { 11 | FILE *fp = fopen(filename, "wb"); 12 | 13 | // write ppm header 14 | fprintf(fp, "P6\n"); 15 | fprintf(fp, "%d %d\n", width, height); 16 | fprintf(fp, "255\n"); 17 | 18 | for (int i = 0; i < width*height; ++i) { 19 | 20 | // Clamp iteration count for this pixel, then scale the value 21 | // to 0-1 range. Raise resulting value to a power (<1) to 22 | // increase brightness of low iteration count 23 | // pixels. a.k.a. Make things look cooler. 24 | 25 | float mapped = pow( std::min(static_cast(maxIterations), 26 | static_cast(data[i])) / 256.f, .5f); 27 | 28 | // convert back into 0-255 range, 8-bit channels 29 | unsigned char result = static_cast(255.f * mapped); 30 | for (int j = 0; j < 3; ++j) 31 | fputc(result, fp); 32 | } 33 | fclose(fp); 34 | printf("Wrote image file %s\n", filename); 35 | } 36 | -------------------------------------------------------------------------------- /HW5/kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL_H_ 2 | #define KERNEL_H_ 3 | 4 | #define USE_KERNEL 4 5 | 6 | //extern "C" 7 | void hostFE(float uX, float uY, float lX, float lY, int *image, int resX, int resY, int maxIterations); 8 | 9 | #endif /* KERNEL_H_ */ 10 | -------------------------------------------------------------------------------- /HW5/kernel1.cu: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #include 3 | 
#include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #include "kernel.h" 14 | 15 | using namespace std; 16 | 17 | #if USE_KERNEL == 1 18 | 19 | // 1600 * 1200 20 | #define GRID_X 100 21 | #define GRID_Y 75 22 | #define BLOCK_X 16 23 | #define BLOCK_Y 16 24 | 25 | static int cudaInited; 26 | 27 | void cudaInit() 28 | { 29 | cudaError_t cudaStatus; 30 | 31 | if (cudaInited) 32 | return; 33 | 34 | cudaStatus = cudaSetDevice(0); 35 | if (cudaStatus != cudaSuccess) { 36 | cerr << "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?" << endl; 37 | exit(EXIT_FAILURE); 38 | } 39 | 40 | cudaInited = 1; 41 | } 42 | 43 | __device__ int mandel(float c_re, float c_im, int count) 44 | { 45 | float z_re = c_re, z_im = c_im; 46 | int i; 47 | 48 | for (i = 0; i < count; ++i) { 49 | if (z_re * z_re + z_im * z_im > 4.f) 50 | break; 51 | 52 | float new_re = z_re * z_re - z_im * z_im; 53 | float new_im = 2.f * z_re * z_im; 54 | z_re = c_re + new_re; 55 | z_im = c_im + new_im; 56 | } 57 | 58 | return i; 59 | } 60 | 61 | __global__ void mandelKernel(int *output, float x0, float y0, float dx, float dy, int maxIterations) 62 | { 63 | int i = blockIdx.x * blockDim.x + threadIdx.x; 64 | int j = blockIdx.y * blockDim.y + threadIdx.y; 65 | float x = x0 + i * dx; 66 | float y = y0 + j * dy; 67 | int index = j * gridDim.x * blockDim.x + i; 68 | output[index] = mandel(x, y, maxIterations); 69 | } 70 | 71 | // Host front-end function that allocates the memory and launches the GPU kernel 72 | void hostFE(float upperX, float upperY, float lowerX, float lowerY, int *img, int resX, int resY, int maxIterations) 73 | { 74 | cudaError_t cudaStatus; 75 | int *cudaResult, *result; 76 | float dx, dy; 77 | 78 | cudaInit(); 79 | 80 | cudaStatus = cudaMalloc((void **)&cudaResult, sizeof(int) * resX * resY); 81 | if (cudaStatus != cudaSuccess) { 82 | cerr << "cudaMalloc failed!" 
<< endl; 83 | exit(EXIT_FAILURE); 84 | } 85 | 86 | // HW required 87 | result = (int *)malloc(sizeof(int) * resX * resY); 88 | 89 | dx = (upperX - lowerX) / resX; 90 | dy = (upperY - lowerY) / resY; 91 | 92 | dim3 dimGrid(GRID_X, GRID_Y); 93 | dim3 dimBlock(BLOCK_X, BLOCK_Y); 94 | 95 | mandelKernel<<>>(cudaResult, lowerX, lowerY, dx, dy, maxIterations); 96 | 97 | cudaStatus = cudaDeviceSynchronize(); 98 | if (cudaStatus != cudaSuccess) { 99 | cerr << "cudaDeviceSynchronize returned error code " << cudaStatus << " after launching addKernel!" << endl; 100 | exit(EXIT_FAILURE); 101 | } 102 | 103 | cudaStatus = cudaMemcpy(result, cudaResult, sizeof(int) * resX * resY, cudaMemcpyDeviceToHost); 104 | if (cudaStatus != cudaSuccess) { 105 | cerr << "cudaMemcpy failed!" << endl; 106 | exit(EXIT_FAILURE); 107 | } 108 | 109 | cudaFree(cudaResult); 110 | 111 | // Copy result to output 112 | memcpy(img, result, sizeof(int) * resX * resY); 113 | 114 | free(result); 115 | } 116 | 117 | #endif -------------------------------------------------------------------------------- /HW5/kernel2.cu: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #include 3 | #include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #include "kernel.h" 14 | 15 | using namespace std; 16 | 17 | #if USE_KERNEL == 2 18 | 19 | // 1600 * 1200 20 | #define GRID_X 100 21 | #define GRID_Y 75 22 | #define BLOCK_X 16 23 | #define BLOCK_Y 16 24 | 25 | static int cudaInited; 26 | 27 | void cudaInit() 28 | { 29 | cudaError_t cudaStatus; 30 | 31 | if (cudaInited) 32 | return; 33 | 34 | cudaStatus = cudaSetDevice(0); 35 | if (cudaStatus != cudaSuccess) { 36 | cerr << "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?" 
<< endl; 37 | exit(EXIT_FAILURE); 38 | } 39 | 40 | cudaInited = 1; 41 | } 42 | 43 | __device__ int mandel(float c_re, float c_im, int count) 44 | { 45 | float z_re = c_re, z_im = c_im; 46 | int i; 47 | 48 | for (i = 0; i < count; ++i) { 49 | if (z_re * z_re + z_im * z_im > 4.f) 50 | break; 51 | 52 | float new_re = z_re * z_re - z_im * z_im; 53 | float new_im = 2.f * z_re * z_im; 54 | z_re = c_re + new_re; 55 | z_im = c_im + new_im; 56 | } 57 | 58 | return i; 59 | } 60 | 61 | __global__ void mandelKernel(int *output, float x0, float y0, float dx, float dy, int maxIterations) 62 | { 63 | int i = blockIdx.x * blockDim.x + threadIdx.x; 64 | int j = blockIdx.y * blockDim.y + threadIdx.y; 65 | float x = x0 + i * dx; 66 | float y = y0 + j * dy; 67 | int index = j * gridDim.x * blockDim.x + i; 68 | output[index] = mandel(x, y, maxIterations); 69 | } 70 | 71 | // Host front-end function that allocates the memory and launches the GPU kernel 72 | void hostFE(float upperX, float upperY, float lowerX, float lowerY, int *img, int resX, int resY, int maxIterations) 73 | { 74 | cudaError_t cudaStatus; 75 | int *cudaResult, *result; 76 | size_t pitch; 77 | float dx, dy; 78 | 79 | cudaInit(); 80 | 81 | cudaStatus = cudaMallocPitch((void **)&cudaResult, &pitch, sizeof(int) * resX, resY); 82 | if (cudaStatus != cudaSuccess) { 83 | cerr << "cudaMallocPitch failed!" << endl; 84 | exit(EXIT_FAILURE); 85 | } 86 | 87 | // HW required 88 | cudaStatus = cudaHostAlloc((void **)&result, sizeof(int) * resX * resY, cudaHostAllocDefault); 89 | if (cudaStatus != cudaSuccess) { 90 | cerr << "cudaHostAlloc failed!" 
<< endl; 91 | exit(EXIT_FAILURE); 92 | } 93 | 94 | dx = (upperX - lowerX) / resX; 95 | dy = (upperY - lowerY) / resY; 96 | 97 | dim3 dimGrid(GRID_X, GRID_Y); 98 | dim3 dimBlock(BLOCK_X, BLOCK_Y); 99 | 100 | mandelKernel<<>>(cudaResult, lowerX, lowerY, dx, dy, maxIterations); 101 | 102 | cudaStatus = cudaDeviceSynchronize(); 103 | if (cudaStatus != cudaSuccess) { 104 | cerr << "cudaDeviceSynchronize returned error code " << cudaStatus << " after launching addKernel!" << endl; 105 | exit(EXIT_FAILURE); 106 | } 107 | 108 | cudaStatus = cudaMemcpy(result, cudaResult, sizeof(int) * resX * resY, cudaMemcpyDeviceToHost); 109 | if (cudaStatus != cudaSuccess) { 110 | cerr << "cudaMemcpy failed!" << endl; 111 | exit(EXIT_FAILURE); 112 | } 113 | 114 | cudaFree(cudaResult); 115 | 116 | // Copy result to output 117 | memcpy(img, result, sizeof(int) * resX * resY); 118 | 119 | cudaFreeHost(result); 120 | } 121 | 122 | #endif -------------------------------------------------------------------------------- /HW5/kernel3.cu: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #include 3 | #include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #include "kernel.h" 14 | 15 | using namespace std; 16 | 17 | #if USE_KERNEL == 3 18 | 19 | // Group size: 4 20 | // 1600 * 1200 -> 1600 * 300 21 | #define GRID_X 100 22 | #define GRID_Y 5 23 | #define BLOCK_X 16 24 | #define BLOCK_Y 60 25 | 26 | static int cudaInited; 27 | 28 | void cudaInit() 29 | { 30 | cudaError_t cudaStatus; 31 | 32 | if (cudaInited) 33 | return; 34 | 35 | cudaStatus = cudaSetDevice(0); 36 | if (cudaStatus != cudaSuccess) { 37 | cerr << "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?" 
<< endl; 38 | exit(EXIT_FAILURE); 39 | } 40 | 41 | cudaInited = 1; 42 | } 43 | 44 | __device__ int mandel(float c_re, float c_im, int count) 45 | { 46 | float z_re = c_re, z_im = c_im; 47 | int i; 48 | 49 | for (i = 0; i < count; ++i) { 50 | if (z_re * z_re + z_im * z_im > 4.f) 51 | break; 52 | 53 | float new_re = z_re * z_re - z_im * z_im; 54 | float new_im = 2.f * z_re * z_im; 55 | z_re = c_re + new_re; 56 | z_im = c_im + new_im; 57 | } 58 | 59 | return i; 60 | } 61 | 62 | __global__ void mandelKernel(int *output, float x0, float y0, float dx, float dy, int maxIterations) 63 | { 64 | int i = blockIdx.x * blockDim.x + threadIdx.x; 65 | int j = (blockIdx.y * blockDim.y + threadIdx.y) * 4; 66 | float x = x0 + i * dx; 67 | 68 | #pragma unroll 4 69 | for (int loop = 0; loop < 4; ++loop) { 70 | float y = y0 + (j + loop) * dy; 71 | int index = (j + loop) * gridDim.x * blockDim.x + i; 72 | output[index] = mandel(x, y, maxIterations); 73 | } 74 | } 75 | 76 | // Host front-end function that allocates the memory and launches the GPU kernel 77 | void hostFE(float upperX, float upperY, float lowerX, float lowerY, int *img, int resX, int resY, int maxIterations) 78 | { 79 | cudaError_t cudaStatus; 80 | int *cudaResult, *result; 81 | size_t pitch; 82 | float dx, dy; 83 | 84 | cudaInit(); 85 | 86 | cudaStatus = cudaMallocPitch((void **)&cudaResult, &pitch, sizeof(int) * resX, resY); 87 | if (cudaStatus != cudaSuccess) { 88 | cerr << "cudaMallocPitch failed!" << endl; 89 | exit(EXIT_FAILURE); 90 | } 91 | 92 | // HW required 93 | cudaStatus = cudaHostAlloc((void **)&result, sizeof(int) * resX * resY, cudaHostAllocDefault); 94 | if (cudaStatus != cudaSuccess) { 95 | cerr << "cudaHostAlloc failed!" 
<< endl; 96 | exit(EXIT_FAILURE); 97 | } 98 | 99 | dx = (upperX - lowerX) / resX; 100 | dy = (upperY - lowerY) / resY; 101 | 102 | dim3 dimGrid(GRID_X, GRID_Y); 103 | dim3 dimBlock(BLOCK_X, BLOCK_Y); 104 | 105 | mandelKernel<<>>(cudaResult, lowerX, lowerY, dx, dy, maxIterations); 106 | 107 | cudaStatus = cudaDeviceSynchronize(); 108 | if (cudaStatus != cudaSuccess) { 109 | cerr << "cudaDeviceSynchronize returned error code " << cudaStatus << " after launching addKernel!" << endl; 110 | exit(EXIT_FAILURE); 111 | } 112 | 113 | cudaStatus = cudaMemcpy(result, cudaResult, sizeof(int) * resX * resY, cudaMemcpyDeviceToHost); 114 | if (cudaStatus != cudaSuccess) { 115 | cerr << "cudaMemcpy failed!" << endl; 116 | exit(EXIT_FAILURE); 117 | } 118 | 119 | cudaFree(cudaResult); 120 | 121 | // Copy result to output 122 | memcpy(img, result, sizeof(int) * resX * resY); 123 | 124 | cudaFreeHost(result); 125 | } 126 | 127 | #endif -------------------------------------------------------------------------------- /HW5/kernel4.cu: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #include 3 | #include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #include "kernel.h" 14 | 15 | using namespace std; 16 | 17 | #if USE_KERNEL == 4 18 | 19 | // 1600 * 1200 20 | #define GRID_X 100 21 | #define GRID_Y 75 22 | #define BLOCK_X 16 23 | #define BLOCK_Y 16 24 | 25 | static int cudaInited; 26 | 27 | void cudaInit() 28 | { 29 | cudaError_t cudaStatus; 30 | 31 | if (cudaInited) 32 | return; 33 | 34 | cudaStatus = cudaSetDevice(0); 35 | if (cudaStatus != cudaSuccess) { 36 | cerr << "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?" 
<< endl; 37 | exit(EXIT_FAILURE); 38 | } 39 | 40 | cudaInited = 1; 41 | } 42 | 43 | __device__ int mandel(float c_re, float c_im, int count) 44 | { 45 | float z_re = c_re, z_im = c_im; 46 | int i; 47 | 48 | for (i = 0; i < count; ++i) { 49 | if (z_re * z_re + z_im * z_im > 4.f) 50 | break; 51 | 52 | float new_re = z_re * z_re - z_im * z_im; 53 | float new_im = 2.f * z_re * z_im; 54 | z_re = c_re + new_re; 55 | z_im = c_im + new_im; 56 | } 57 | 58 | return i; 59 | } 60 | 61 | __global__ void mandelKernel(int *output, float x0, float y0, float dx, float dy, int maxIterations) 62 | { 63 | int i = blockIdx.x * blockDim.x + threadIdx.x; 64 | int j = blockIdx.y * blockDim.y + threadIdx.y; 65 | float x = x0 + i * dx; 66 | float y = y0 + j * dy; 67 | int index = j * gridDim.x * blockDim.x + i; 68 | output[index] = mandel(x, y, maxIterations); 69 | } 70 | 71 | // Host front-end function that allocates the memory and launches the GPU kernel 72 | void hostFE(float upperX, float upperY, float lowerX, float lowerY, int *img, int resX, int resY, int maxIterations) 73 | { 74 | cudaError_t cudaStatus; 75 | int *cudaResult; 76 | float dx, dy; 77 | 78 | cudaInit(); 79 | 80 | cudaStatus = cudaMalloc((void **)&cudaResult, sizeof(int) * resX * resY); 81 | if (cudaStatus != cudaSuccess) { 82 | cerr << "cudaMalloc failed!" << endl; 83 | exit(EXIT_FAILURE); 84 | } 85 | 86 | dx = (upperX - lowerX) / resX; 87 | dy = (upperY - lowerY) / resY; 88 | 89 | dim3 dimGrid(GRID_X, GRID_Y); 90 | dim3 dimBlock(BLOCK_X, BLOCK_Y); 91 | 92 | mandelKernel<<>>(cudaResult, lowerX, lowerY, dx, dy, maxIterations); 93 | 94 | cudaStatus = cudaDeviceSynchronize(); 95 | if (cudaStatus != cudaSuccess) { 96 | cerr << "cudaDeviceSynchronize returned error code " << cudaStatus << " after launching addKernel!" 
<< endl; 97 | exit(EXIT_FAILURE); 98 | } 99 | 100 | cudaStatus = cudaMemcpy(img, cudaResult, sizeof(int) * resX * resY, cudaMemcpyDeviceToHost); 101 | if (cudaStatus != cudaSuccess) { 102 | cerr << "cudaMemcpy failed!" << endl; 103 | exit(EXIT_FAILURE); 104 | } 105 | 106 | cudaFree(cudaResult); 107 | } 108 | 109 | #endif -------------------------------------------------------------------------------- /HW5/mandelbrotSerial.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | static inline int mandel(float c_re, float c_im, int count) 4 | { 5 | float z_re = c_re, z_im = c_im; 6 | int i; 7 | for (i = 0; i < count; ++i) 8 | { 9 | 10 | if (z_re * z_re + z_im * z_im > 4.f) 11 | break; 12 | 13 | float new_re = z_re * z_re - z_im * z_im; 14 | float new_im = 2.f * z_re * z_im; 15 | z_re = c_re + new_re; 16 | z_im = c_im + new_im; 17 | } 18 | 19 | return i; 20 | } 21 | 22 | // 23 | // MandelbrotSerial -- 24 | // 25 | // Compute an image visualizing the mandelbrot set. The resulting 26 | // array contains the number of iterations required before the complex 27 | // number corresponding to a pixel could be rejected from the set. 28 | // 29 | // * x0, y0, x1, y1 describe the complex coordinates mapping 30 | // into the image viewport. 
31 | // * width, height describe the size of the output image 32 | // * startRow, totalRows describe how much of the image to compute 33 | void mandelbrotSerial( 34 | float x0, float y0, float x1, float y1, 35 | int width, int height, 36 | int startRow, int totalRows, 37 | int maxIterations, 38 | int output[]) 39 | { 40 | float dx = (x1 - x0) / width; 41 | float dy = (y1 - y0) / height; 42 | 43 | int endRow = startRow + totalRows; 44 | 45 | for (int j = startRow; j < endRow; j++) 46 | { 47 | for (int i = 0; i < width; ++i) 48 | { 49 | float x = x0 + i * dx; 50 | float y = y0 + j * dy; 51 | 52 | int index = (j * width + i); 53 | output[index] = mandel(x, y, maxIterations); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /HW5/mandelbrotThread.cpp: -------------------------------------------------------------------------------- 1 | #include "kernel.h" 2 | 3 | // 4 | // MandelbrotThread -- 5 | // 6 | // Multi-threaded implementation of mandelbrot set image generation. 
7 | // Threads of execution are created by using CUDA 8 | void mandelbrotThread( 9 | float x0, float y0, float x1, float y1, 10 | int width, int height, 11 | int maxIterations, int output[]) 12 | { 13 | hostFE(x1, y1, x0, y0, output, width, height, maxIterations); 14 | } 15 | -------------------------------------------------------------------------------- /HW5/mandelbrotThreadRef.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW5/mandelbrotThreadRef.a -------------------------------------------------------------------------------- /HW5/mandelbrotThreadRef50.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW5/mandelbrotThreadRef50.a -------------------------------------------------------------------------------- /HW5/mandelbrotThreadRefAll.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW5/mandelbrotThreadRefAll.a -------------------------------------------------------------------------------- /HW6/CycleTimer.h: -------------------------------------------------------------------------------- 1 | #ifndef _SYRAH_CYCLE_TIMER_H_ 2 | #define _SYRAH_CYCLE_TIMER_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | long long currentTicks(); 10 | static double secondsPerTick(); 11 | static double currentSeconds(); 12 | 13 | long long currentTicks() 14 | { 15 | unsigned int a, d; 16 | asm volatile("rdtsc" 17 | : "=a"(a), "=d"(d)); 18 | return (unsigned long long)a | (long long)d << 32; 19 | } 20 | 21 | // Return the conversion from ticks to seconds. 
22 | static double secondsPerTick() 23 | { 24 | static int initialized = 0; 25 | static double secondsPerTick_val; 26 | if (initialized) 27 | return secondsPerTick_val; 28 | FILE *fp = fopen("/proc/cpuinfo", "r"); 29 | char input[1024]; 30 | if (!fp) 31 | { 32 | fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo."); 33 | exit(-1); 34 | } 35 | // In case we don't find it, e.g. on the N900 36 | secondsPerTick_val = 1e-9; 37 | while (!feof(fp) && fgets(input, 1024, fp)) 38 | { 39 | // NOTE(boulos): Because reading cpuinfo depends on dynamic 40 | // frequency scaling it's better to read the @ sign first 41 | float GHz, MHz; 42 | if (strstr(input, "model name")) 43 | { 44 | char *at_sign = strstr(input, "@"); 45 | if (at_sign) 46 | { 47 | char *after_at = at_sign + 1; 48 | char *GHz_str = strstr(after_at, "GHz"); 49 | char *MHz_str = strstr(after_at, "MHz"); 50 | if (GHz_str) 51 | { 52 | *GHz_str = '\0'; 53 | if (1 == sscanf(after_at, "%f", &GHz)) 54 | { 55 | //printf("GHz = %f\n", GHz); 56 | secondsPerTick_val = 1e-9f / GHz; 57 | break; 58 | } 59 | } 60 | else if (MHz_str) 61 | { 62 | *MHz_str = '\0'; 63 | if (1 == sscanf(after_at, "%f", &MHz)) 64 | { 65 | //printf("MHz = %f\n", MHz); 66 | secondsPerTick_val = 1e-6f / GHz; 67 | break; 68 | } 69 | } 70 | } 71 | } 72 | else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) 73 | { 74 | //printf("MHz = %f\n", MHz); 75 | secondsPerTick_val = 1e-6f / MHz; 76 | break; 77 | } 78 | } 79 | fclose(fp); 80 | 81 | initialized = 1; 82 | return secondsPerTick_val; 83 | } 84 | 85 | static double currentSeconds() 86 | { 87 | return currentTicks() * secondsPerTick(); 88 | } 89 | 90 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_ 91 | -------------------------------------------------------------------------------- /HW6/Makefile: -------------------------------------------------------------------------------- 1 | default: conv 2 | 3 | CC = gcc-10 4 | FLAGS = -O3 -lOpenCL -m64 -ffloat-store -w -g 5 | 6 | OBJS = main.o 
bmpfuncs.o hostFE.o serialConv.o helper.o 7 | 8 | conv: $(OBJS) 9 | $(CC) -o $@ $(OBJS) $(FLAGS) 10 | 11 | %.o: %.c 12 | $(CC) -c $(FLAGS) $< -o $@ 13 | 14 | clean: 15 | rm -f conv *.o output.bmp ref.bmp -------------------------------------------------------------------------------- /HW6/bmpfuncs.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "bmpfuncs.h" 4 | 5 | //#include "bmpfuncs.h" 6 | typedef unsigned char uchar; 7 | 8 | void storeImage(float *imageOut, const char *filename, int rows, int cols, 9 | const char *refFilename) 10 | { 11 | 12 | FILE *ifp, *ofp; 13 | unsigned char tmp; 14 | int offset; 15 | unsigned char *buffer; 16 | int i, j; 17 | 18 | int bytes; 19 | 20 | int height, width; 21 | 22 | ifp = fopen(refFilename, "rb"); 23 | if (ifp == NULL) 24 | { 25 | perror(filename); 26 | exit(-1); 27 | } 28 | 29 | fseek(ifp, 10, SEEK_SET); 30 | fread(&offset, 4, 1, ifp); 31 | 32 | fseek(ifp, 18, SEEK_SET); 33 | fread(&width, 4, 1, ifp); 34 | fread(&height, 4, 1, ifp); 35 | 36 | fseek(ifp, 0, SEEK_SET); 37 | 38 | buffer = (unsigned char *)malloc(offset); 39 | if (buffer == NULL) 40 | { 41 | perror("malloc"); 42 | exit(-1); 43 | } 44 | 45 | fread(buffer, 1, offset, ifp); 46 | 47 | printf("Writing output image to %s\n", filename); 48 | ofp = fopen(filename, "wb"); 49 | if (ofp == NULL) 50 | { 51 | perror("opening output file"); 52 | exit(-1); 53 | } 54 | bytes = fwrite(buffer, 1, offset, ofp); 55 | if (bytes != offset) 56 | { 57 | printf("error writing header!\n"); 58 | exit(-1); 59 | } 60 | 61 | // NOTE bmp formats store data in reverse raster order (see comment in 62 | // readImage function), so we need to flip it upside down here. 
63 | int mod = width % 4; 64 | if (mod != 0) 65 | { 66 | mod = 4 - mod; 67 | } 68 | // printf("mod = %d\n", mod); 69 | for (i = height - 1; i >= 0; i--) 70 | { 71 | for (j = 0; j < width; j++) 72 | { 73 | tmp = (unsigned char)imageOut[i * cols + j]; 74 | fwrite(&tmp, sizeof(char), 1, ofp); 75 | } 76 | // In bmp format, rows must be a multiple of 4-bytes. 77 | // So if we're not at a multiple of 4, add junk padding. 78 | for (j = 0; j < mod; j++) 79 | { 80 | fwrite(&tmp, sizeof(char), 1, ofp); 81 | } 82 | } 83 | 84 | fclose(ofp); 85 | fclose(ifp); 86 | 87 | free(buffer); 88 | } 89 | 90 | /* 91 | * Read bmp image and convert to byte array. Also output the width and height 92 | */ 93 | float *readImage(const char *filename, int *widthOut, int *heightOut) 94 | { 95 | 96 | uchar *imageData; 97 | 98 | int height, width; 99 | uchar tmp; 100 | int offset; 101 | int i, j; 102 | 103 | printf("Reading input image from %s\n", filename); 104 | FILE *fp = fopen(filename, "rb"); 105 | if (fp == NULL) 106 | { 107 | perror(filename); 108 | exit(-1); 109 | } 110 | 111 | fseek(fp, 10, SEEK_SET); 112 | fread(&offset, 4, 1, fp); 113 | 114 | fseek(fp, 18, SEEK_SET); 115 | fread(&width, 4, 1, fp); 116 | fread(&height, 4, 1, fp); 117 | 118 | printf("width = %d\n", width); 119 | printf("height = %d\n", height); 120 | 121 | *widthOut = width; 122 | *heightOut = height; 123 | 124 | imageData = (uchar *)malloc(width * height); 125 | if (imageData == NULL) 126 | { 127 | perror("malloc"); 128 | exit(-1); 129 | } 130 | 131 | fseek(fp, offset, SEEK_SET); 132 | fflush(NULL); 133 | 134 | int mod = width % 4; 135 | if (mod != 0) 136 | { 137 | mod = 4 - mod; 138 | } 139 | 140 | // NOTE bitmaps are stored in upside-down raster order. So we begin 141 | // reading from the bottom left pixel, then going from left-to-right, 142 | // read from the bottom to the top of the image. For image analysis, 143 | // we want the image to be right-side up, so we'll modify it here. 
144 | 145 | // First we read the image in upside-down 146 | 147 | // Read in the actual image 148 | for (i = 0; i < height; i++) 149 | { 150 | 151 | // add actual data to the image 152 | for (j = 0; j < width; j++) 153 | { 154 | fread(&tmp, sizeof(char), 1, fp); 155 | imageData[i * width + j] = tmp; 156 | } 157 | // For the bmp format, each row has to be a multiple of 4, 158 | // so I need to read in the junk data and throw it away 159 | for (j = 0; j < mod; j++) 160 | { 161 | fread(&tmp, sizeof(char), 1, fp); 162 | } 163 | } 164 | 165 | // Then we flip it over 166 | int flipRow; 167 | for (i = 0; i < height / 2; i++) 168 | { 169 | flipRow = height - (i + 1); 170 | for (j = 0; j < width; j++) 171 | { 172 | tmp = imageData[i * width + j]; 173 | imageData[i * width + j] = imageData[flipRow * width + j]; 174 | imageData[flipRow * width + j] = tmp; 175 | } 176 | } 177 | 178 | fclose(fp); 179 | 180 | // Input image on the host 181 | float *floatImage = NULL; 182 | floatImage = (float *)malloc(sizeof(float) * width * height); 183 | if (floatImage == NULL) 184 | { 185 | perror("malloc"); 186 | exit(-1); 187 | } 188 | 189 | // Convert the BMP image to float (not required) 190 | for (i = 0; i < height; i++) 191 | { 192 | for (j = 0; j < width; j++) 193 | { 194 | floatImage[i * width + j] = (float)imageData[i * width + j]; 195 | } 196 | } 197 | 198 | free(imageData); 199 | return floatImage; 200 | } 201 | -------------------------------------------------------------------------------- /HW6/bmpfuncs.h: -------------------------------------------------------------------------------- 1 | #ifndef __BMPFUNCS__ 2 | #define __BMPFUNCS__ 3 | 4 | typedef unsigned char uchar; 5 | 6 | float* readImage(const char *filename, int* widthOut, int* heightOut); 7 | void storeImage(float *imageOut, const char *filename, int rows, int cols, 8 | const char* refFilename); 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /HW6/filter1.csv: 
/*
 * Read the entire text file at kernelPath into a freshly malloc'd,
 * NUL-terminated string. Caller owns the returned buffer and must free() it.
 * Exits the process on any I/O or allocation failure.
 */
char *readSource(char *kernelPath)
{
    FILE *fp;
    char *source;
    long int size;

    printf("Program file is: %s\n", kernelPath);

    fp = fopen(kernelPath, "rb");
    if (!fp)
    {
        printf("Could not open kernel file\n");
        exit(-1);
    }
    if (fseek(fp, 0, SEEK_END) != 0)
    {
        printf("Error seeking to end of file\n");
        exit(-1);
    }
    size = ftell(fp);
    if (size < 0)
    {
        printf("Error getting file position\n");
        exit(-1);
    }

    rewind(fp);

    source = (char *)malloc(size + 1);
    /* Check the allocation BEFORE touching the buffer -- the original
     * zero-filled the whole buffer first and only then tested for NULL. */
    if (source == NULL)
    {
        printf("Error allocating space for the kernel source\n");
        exit(-1);
    }

    /* One terminator write replaces the original's O(size) zero-fill loop. */
    if (fread(source, 1, size, fp) != (size_t)size)
    {
        printf("Error reading kernel file\n");
        exit(-1);
    }
    source[size] = '\0';

    fclose(fp); /* the original leaked this FILE handle */

    return source;
}
*device, cl_context *context, cl_program *program) 59 | { 60 | // Set up the OpenCL environment 61 | cl_int status; 62 | 63 | // Discovery platform 64 | cl_platform_id platform; 65 | status = clGetPlatformIDs(1, &platform, NULL); 66 | CHECK(status, "clGetPlatformIDs"); 67 | 68 | // Discover device 69 | clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, device, NULL); 70 | CHECK(status, "clGetDeviceIDs"); 71 | 72 | // CL_DEVICE_MAX_WORK_ITEM_SIZES 73 | size_t workitem_size[3]; 74 | clGetDeviceInfo(*device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL); 75 | printf("CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]); 76 | 77 | // CL_DEVICE_MAX_WORK_GROUP_SIZE 78 | size_t workgroup_size; 79 | clGetDeviceInfo(*device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL); 80 | printf("CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size); 81 | 82 | // Create context 83 | cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 84 | (cl_context_properties)(platform), 0}; 85 | *context = clCreateContext(props, 1, device, NULL, NULL, &status); 86 | CHECK(status, "clCreateContext"); 87 | 88 | const char *source = readSource("kernel.cl"); 89 | 90 | // Create a program object with source and build it 91 | *program = clCreateProgramWithSource(*context, 1, &source, NULL, NULL); 92 | CHECK(status, "clCreateProgramWithSource"); 93 | status = clBuildProgram(*program, 1, device, NULL, NULL, NULL); 94 | CHECK(status, "clBuildProgram"); 95 | 96 | return; 97 | } 98 | 99 | float *readFilter(const char *filename, int *filterWidth) 100 | { 101 | printf("Reading filter data from %s\n", filename); 102 | 103 | FILE *fp = fopen(filename, "r"); 104 | if (!fp) 105 | { 106 | printf("Could not open filter file\n"); 107 | exit(-1); 108 | } 109 | 110 | fscanf(fp, "%d", filterWidth); 111 | 112 | float *filter = (float *)malloc(*filterWidth * *filterWidth * sizeof(int)); 113 | 114 | float tmp; 115 
/*
 * Read a convolution filter from a text file: the filter width on the first
 * line, then width*width float coefficients. Outputs the width through
 * filterWidth; caller owns and must free() the returned array.
 * Exits the process on malformed input or allocation failure.
 */
float *readFilter(const char *filename, int *filterWidth)
{
    printf("Reading filter data from %s\n", filename);

    FILE *fp = fopen(filename, "r");
    if (!fp)
    {
        printf("Could not open filter file\n");
        exit(-1);
    }

    if (fscanf(fp, "%d", filterWidth) != 1 || *filterWidth <= 0)
    {
        printf("Invalid filter width\n");
        exit(-1);
    }

    // sizeof(float), not sizeof(int) as in the original -- they happen to
    // match on common platforms, but the element type is float.
    float *filter = (float *)malloc(*filterWidth * *filterWidth * sizeof(float));
    if (filter == NULL)
    {
        printf("Error allocating space for the filter\n");
        exit(-1);
    }

    for (int i = 0; i < *filterWidth * *filterWidth; i++)
    {
        // The original ignored fscanf's return, silently accepting short files.
        if (fscanf(fp, "%f", &filter[i]) != 1)
        {
            printf("Filter file has too few coefficients\n");
            exit(-1);
        }
    }

    printf("Filter width: %d\n", *filterWidth);

    fclose(fp);
    return filter;
}
22 | CHECK(status, "clCreateBuffer"); 23 | 24 | cl_mem filter_mem_obj = clCreateBuffer(*context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, 25 | sizeof(float) * filter_size, filter, &status); 26 | CHECK(status, "clCreateBuffer"); 27 | 28 | cl_mem output_img_mem_obj = clCreateBuffer(*context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, 29 | sizeof(float) * image_size, outputImage, &status); 30 | CHECK(status, "clCreateBuffer"); 31 | 32 | // Create the OpenCL kernel 33 | cl_kernel kernel = clCreateKernel(*program, "convolution", &status); 34 | CHECK(status, "clCreateKernel"); 35 | 36 | // Set the arguments of the kernel 37 | status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&input_img_mem_obj); 38 | CHECK(status, "clSetKernelArg"); 39 | 40 | status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&filter_mem_obj); 41 | CHECK(status, "clSetKernelArg"); 42 | 43 | status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_img_mem_obj); 44 | CHECK(status, "clSetKernelArg"); 45 | 46 | status = clSetKernelArg(kernel, 3, sizeof(int), (void *)&image_height); 47 | CHECK(status, "clSetKernelArg"); 48 | 49 | status = clSetKernelArg(kernel, 4, sizeof(int), (void *)&image_width); 50 | CHECK(status, "clSetKernelArg"); 51 | 52 | status = clSetKernelArg(kernel, 5, sizeof(int), (void *)&filter_width); 53 | CHECK(status, "clSetKernelArg"); 54 | 55 | // Execute the OpenCL kernel on the list 56 | size_t global_item_size[2] = { image_width, image_height }; 57 | size_t local_item_size[2] = { 40, 25 }; 58 | status = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, 59 | global_item_size, local_item_size, 0, NULL, NULL); 60 | CHECK(status, "clEnqueueNDRangeKernel"); 61 | 62 | // After map call, host-memory area for outputImage is 63 | // automatically updated with the latest bits from the device 64 | clEnqueueMapBuffer( 65 | command_queue, 66 | output_img_mem_obj, 67 | CL_TRUE, 68 | CL_MAP_READ, 69 | 0, 70 | sizeof(float) * image_size, 71 | 0, 0, 0, 72 | &status 73 | ); 
// 2-D convolution: one work-item computes one output pixel.
// input_image / output_image are image_width x image_height, row-major.
// filter is filter_width x filter_width (filter_width assumed odd, so
// halffilter_width taps extend symmetrically around the center).
// Neighbors falling outside the image are skipped, i.e. treated as zero.
__kernel void convolution(
    __global const float *input_image,
    __global const float *filter,
    __global float *output_image,
    const int image_height,
    const int image_width,
    const int filter_width)
{
    // Global IDs map directly to pixel coordinates: x = column, y = row.
    int gx = get_global_id(0);
    int gy = get_global_id(1);
    int halffilter_width = filter_width / 2;
    float sum;
    int k, l;

    sum = 0;
    // Accumulate the weighted neighborhood; (k, l) are row/column offsets.
    for (k = -halffilter_width; k <= halffilter_width; k++)
    {
        for (l = -halffilter_width; l <= halffilter_width; l++)
        {
            // Bounds check: contributions outside the image are dropped.
            if (gy + k >= 0 && gy + k < image_height &&
                gx + l >= 0 && gx + l < image_width)
            {
                sum += input_image[(gy + k) * image_width + gx + l] *
                       filter[(k + halffilter_width) * filter_width +
                              l + halffilter_width];
            }
        }
    }
    output_image[gy * image_width + gx] = sum;
}
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "CycleTimer.h" 5 | #include "helper.h" 6 | #include "hostFE.h" 7 | #include "bmpfuncs.h" 8 | #include "serialConv.h" 9 | 10 | void usage(const char *progname) 11 | { 12 | printf("Usage: %s [options]\n", progname); 13 | printf("Program Options:\n"); 14 | printf(" -i --input Input image\n"); 15 | printf(" -f --filter Use which filter (0, 1, 2)\n"); 16 | printf(" -? --help This message\n"); 17 | } 18 | 19 | int compare(const void *a, const void *b) 20 | { 21 | double *x = (double *)a; 22 | double *y = (double *)b; 23 | if (*x < *y) 24 | return -1; 25 | else if (*x > *y) 26 | return 1; 27 | return 0; 28 | } 29 | 30 | int main(int argc, char **argv) 31 | { 32 | int i, j; 33 | 34 | // Rows and columns in the input image 35 | int imageHeight; 36 | int imageWidth; 37 | 38 | double start_time, end_time; 39 | 40 | char *inputFile = "input.bmp"; 41 | const char *outputFile = "output.bmp"; 42 | const char *refFile = "ref.bmp"; 43 | char *filterFile = "filter1.csv"; 44 | 45 | // parse commandline options //////////////////////////////////////////// 46 | int opt; 47 | static struct option long_options[] = { 48 | {"filter", 1, 0, 'f'}, 49 | {"input", 1, 0, 'i'}, 50 | {"help", 0, 0, '?'}, 51 | {0, 0, 0, 0}}; 52 | 53 | while ((opt = getopt_long(argc, argv, "i:f:?", long_options, NULL)) != EOF) 54 | { 55 | 56 | switch (opt) 57 | { 58 | case 'i': 59 | { 60 | inputFile = optarg; 61 | 62 | break; 63 | } 64 | case 'f': 65 | { 66 | int idx = atoi(optarg); 67 | if (idx == 2) 68 | filterFile = "filter2.csv"; 69 | else if (idx == 3) 70 | filterFile = "filter3.csv"; 71 | 72 | break; 73 | } 74 | case '?': 75 | default: 76 | usage(argv[0]); 77 | return 1; 78 | } 79 | } 80 | // end parsing of commandline options 81 | 82 | // read filter data 83 | int filterWidth; 84 | float *filter = readFilter(filterFile, &filterWidth); 85 | 86 | // Homegrown function to read a 
BMP from file 87 | float *inputImage = readImage(inputFile, &imageWidth, &imageHeight); 88 | // Size of the input and output images on the host 89 | int dataSize = imageHeight * imageWidth * sizeof(float); 90 | // Output image on the host 91 | float *outputImage = (float *)malloc(dataSize); 92 | 93 | // helper init CL 94 | cl_program program; 95 | cl_device_id device; 96 | cl_context context; 97 | initCL(&device, &context, &program); 98 | 99 | double minThread = 0; 100 | double recordThread[10] = {0}; 101 | for (int i = 0; i < 10; ++i) 102 | { 103 | memset(outputImage, 0, dataSize); 104 | start_time = currentSeconds(); 105 | // Run the host to execute the kernel 106 | hostFE(filterWidth, filter, imageHeight, imageWidth, inputImage, outputImage, 107 | &device, &context, &program); 108 | end_time = currentSeconds(); 109 | recordThread[i] = end_time - start_time; 110 | } 111 | qsort(recordThread, 10, sizeof(double), compare); 112 | for (int i = 3; i < 7; ++i) 113 | { 114 | minThread += recordThread[i]; 115 | } 116 | minThread /= 4; 117 | 118 | printf("\n[conv opencl]:\t\t[%.3f] ms\n\n", minThread * 1000); 119 | 120 | // Write the output image to file 121 | storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile); 122 | 123 | // Output image of reference on the host 124 | float *refImage = NULL; 125 | refImage = (float *)malloc(dataSize); 126 | memset(refImage, 0, dataSize); 127 | 128 | double minSerial = 0; 129 | double recordSerial[10] = {0}; 130 | for (int i = 0; i < 10; ++i) 131 | { 132 | memset(refImage, 0, dataSize); 133 | start_time = currentSeconds(); 134 | serialConv(filterWidth, filter, imageHeight, imageWidth, inputImage, refImage); 135 | end_time = currentSeconds(); 136 | recordSerial[i] = end_time - start_time; 137 | } 138 | qsort(recordSerial, 10, sizeof(double), compare); 139 | for (int i = 3; i < 7; ++i) 140 | { 141 | minSerial += recordSerial[i]; 142 | } 143 | minSerial /= 4; 144 | 145 | printf("\n[conv serial]:\t\t[%.3f] ms\n\n", 
minSerial * 1000); 146 | 147 | storeImage(refImage, refFile, imageHeight, imageWidth, inputFile); 148 | 149 | int diff_counter = 0; 150 | for (i = 0; i < imageHeight; i++) 151 | { 152 | for (j = 0; j < imageWidth; j++) 153 | { 154 | if (abs(outputImage[i * imageWidth + j] - refImage[i * imageWidth + j]) > 10) 155 | { 156 | diff_counter += 1; 157 | } 158 | } 159 | } 160 | 161 | float diff_ratio = (float)diff_counter / (imageHeight * imageWidth); 162 | printf("Diff ratio: %f\n", diff_ratio); 163 | 164 | if (diff_ratio > 0.1) 165 | { 166 | printf("\n\033[31mFAILED:\tResults are incorrect!\033[0m\n"); 167 | return -1; 168 | } 169 | else 170 | { 171 | printf("\n\033[32mPASS:\t(%.2fx speedup over the serial version)\033[0m\n", minSerial / minThread); 172 | } 173 | 174 | return 0; 175 | } 176 | -------------------------------------------------------------------------------- /HW6/output.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW6/output.bmp -------------------------------------------------------------------------------- /HW6/ref.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW6/ref.bmp -------------------------------------------------------------------------------- /HW6/serialConv.c: -------------------------------------------------------------------------------- 1 | #include "serialConv.h" 2 | 3 | void serialConv(int filterWidth, float *filter, int imageHeight, int imageWidth, float *inputImage, float *outputImage) 4 | { 5 | // Iterate over the rows of the source image 6 | int halffilterSize = filterWidth / 2; 7 | float sum; 8 | int i, j, k, l; 9 | 10 | for (i = 0; i < imageHeight; i++) 11 | { 12 | // Iterate over the columns of the source image 13 | for (j = 0; j < 
/*
 * Reference CPU implementation of 2-D convolution.
 * For every pixel of the imageWidth x imageHeight input (row-major), writes
 * the weighted sum of its filterWidth x filterWidth neighborhood into
 * outputImage. Neighbors outside the image contribute nothing (zero padding).
 * filterWidth is assumed odd so the taps extend symmetrically.
 */
void serialConv(int filterWidth, float *filter, int imageHeight, int imageWidth, float *inputImage, float *outputImage)
{
    const int half = filterWidth / 2;

    for (int row = 0; row < imageHeight; row++)
    {
        for (int col = 0; col < imageWidth; col++)
        {
            float acc = 0.0f;

            /* (dy, dx) sweep the filter window around the current pixel. */
            for (int dy = -half; dy <= half; dy++)
            {
                int y = row + dy;
                if (y < 0 || y >= imageHeight)
                    continue; /* off the top or bottom edge */

                for (int dx = -half; dx <= half; dx++)
                {
                    int x = col + dx;
                    if (x < 0 || x >= imageWidth)
                        continue; /* off the left or right edge */

                    acc += inputImage[y * imageWidth + x] *
                           filter[(dy + half) * filterWidth + (dx + half)];
                }
            }

            outputImage[row * imageWidth + col] = acc;
        }
    }
}
BFS_Top_Down Score | Best BFS_Top_Down Score | 37 | | ----------- | ------------- | 38 | | 18 | 18 | 39 | 40 | | My BFS_Bottom_Up Score | Best BFS_Bottom_Up Score | 41 | | ----------- | ------------- | 42 | | 23 | 23 | 43 | 44 | | My BFS_Hybrid Score | Best BFS_Hybrid Score | 45 | | ----------- | ------------- | 46 | | 23 | 23 | 47 | 48 | ## HW4 49 | | My MM_time_1 | Best MM_time_1 | 50 | | ------------ | -------------- | 51 | | 0.0874 | 0.0275 | 52 | 53 | | My MM_time_2 | Best MM_time_2 | 54 | | ------------ | -------------- | 55 | | 1.5608 | 0.6174 | 56 | 57 | ## HW5 58 | | My kernel4_view1_time | Best kernel4_view1_time | 59 | | --------------------- | ----------------------- | 60 | | 287.4263 | 206.571 | 61 | 62 | | My kernel4_view2_time | Best kernel4_view2_time | 63 | | --------------------- | ----------------------- | 64 | | 26.8847 | 18.329 | 65 | 66 | ## HW6 67 | | My Filter_time | Best Filter_time | 68 | | -------------- | ---------------- | 69 | | 1.7767 | 1.1127 | 70 | 71 | # Final Project 72 | [NBodySim](https://github.com/LJP-TW/NBodySim) 73 | --------------------------------------------------------------------------------